From c2691f68bf9d8d15a78fa36716739e00d262f0c5 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Sun, 3 Feb 2019 17:56:41 +0100
Subject: [PATCH] utf8 truncate utility function

---
 src/Makefile.am              |   1 +
 src/testmains/trutf8iter.cpp | 185 ++++++++++++++++++++++++++++++++++
 src/utils/utf8iter.cpp       | 189 ++++-------------------------------
 src/utils/utf8iter.h         |   2 +
 4 files changed, 208 insertions(+), 169 deletions(-)
 create mode 100644 src/testmains/trutf8iter.cpp
diff --git a/src/Makefile.am b/src/Makefile.am
index a0f13ee2..bf6ae668 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -258,6 +258,7 @@ utils/strmatcher.cpp \
 utils/strmatcher.h \
 utils/transcode.cpp \
 utils/transcode.h \
+utils/utf8iter.cpp \
 utils/utf8iter.h \
 utils/wipedir.cpp \
 utils/wipedir.h \
diff --git a/src/testmains/trutf8iter.cpp b/src/testmains/trutf8iter.cpp
new file mode 100644
index 00000000..68f5132b
--- /dev/null
+++ b/src/testmains/trutf8iter.cpp
@@ -0,0 +1,185 @@
+/* Copyright (C) 2005 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+#include <iostream>
+#include <vector>
+
+
+#include "log.h"
+#include "transcode.h"
+
+#ifndef NO_NAMESPACES
+using namespace std;
+#endif /* NO_NAMESPACES */
+
+#define UTF8ITER_CHECK
+#include "utf8iter.h"
+#include "readfile.h"
+#include "textsplit.h"
+
+void tryempty() // Manual probe of Utf8Iter on empty input; main() in this file never calls it
+{
+    Utf8Iter it(""); // NOTE(review): "" presumably becomes a temporary std::string; if Utf8Iter only keeps a reference it dangles after this line - confirm
+    cout << "EOF ? " << it.eof() << endl; // Print what eof() reports for the empty input
+    TextSplit::isCJK(*it); // Dereference the iterator on empty input; the isCJK() result is discarded
+    exit(0); // Ends the whole process with status 0
+}
+
+const char *thisprog; // Program name; main() sets it from argv[0]
+static char usage [] = // Help text that Usage() prints after the program name
+    "utf8iter [opts] infile outfile\n"
+    " converts infile to 32 bits unicode (processor order), for testing\n"
+    "-v : print stuff as we go\n"
+    ;
+// Print "<program name>:<usage text>" on stderr, then exit with status 1 (never returns)
+void Usage() {
+    fprintf(stderr, "%s:%s\n", thisprog, usage);
+    exit(1);
+}
+static int     op_flags; // Bitwise OR of the OPT_* flags selected on the command line
+#define OPT_v	  0x2 // Set by -v: main() then prints every decoded value
+// Test driver: decode infile with Utf8Iter, dump the values to outfile, then cross-check them. Exits 0 on success, 1 on failure.
+int main(int argc, char **argv)
+{
+    thisprog = argv[0];
+    argc--; argv++;
+    // Consume leading "-..." arguments; every letter after the dash is one option.
+    while (argc > 0 && **argv == '-') {
+	(*argv)++;
+	if (!(**argv)) // a bare "-" is rejected
+	    Usage();
+	while (**argv)
+	    switch (*(*argv)++) {
+	    case 'v':   op_flags |= OPT_v; break;
+
+	    default: Usage();	break;
+	    }
+	argc--;argv++;
+    }
+    // Exactly two positional arguments must remain: input file, then output file.
+    if (argc != 2) {
+	Usage();
+    }
+    const char *infile = *argv++;argc--;
+    const char *outfile = *argv++;argc--;
+    string in;
+    if (!file_to_string(infile, in)) {
+	cerr << "Cant read file\n" << endl;
+	exit(1);
+    }
+    // Pass 1: walk the input once and keep every decoded value in several forms.
+    vector<unsigned int>ucsout1;
+    string out, out1;
+    Utf8Iter it(in);
+    FILE *fp = fopen(outfile, "wb"); // binary mode: the raw 32 bit values must not go through newline translation
+    if (fp == 0) {
+	fprintf(stderr, "cant create %s\n", outfile);
+	exit(1);
+    }
+
+    int nchars = 0;
+    for (;!it.eof(); it++) {
+	unsigned int value = *it;
+	if (value == (unsigned int)-1) { // (unsigned int)-1 is treated as the decoding error value
+	    cerr << "Conversion error occurred\n" << endl;
+	    exit(1);
+	}
+	if (op_flags & OPT_v) {
+	   printf("Value: 0x%x", value);
+	   if (value < 0x7f)
+	       printf(" (%c) ", value);
+	   printf("\n");
+	}
+	// UTF-32LE or BE array
+	ucsout1.push_back(value);
+	// UTF-32LE or BE file
+	fwrite(&value, 4, 1, fp);
+
+	// Reconstructed utf8 strings (2 methods)
+	if (!it.appendchartostring(out))
+	    break;
+	// conversion to string
+	out1 += it;
+	
+	// fprintf(stderr, "%s", string(it).c_str());
+	nchars++;
+    }
+    fclose(fp);
+    // Both reconstructed UTF-8 strings must be identical to the input.
+    fprintf(stderr, "nchars %d\n", nchars);
+    if (in.compare(out)) {
+	fprintf(stderr, "error: out != in\n");
+	exit(1);
+    }
+    if (in != out1) {
+	fprintf(stderr, "error: out1 != in\n");
+	exit(1);
+    }
+
+    // Pass 2: rewind and read the values again, this time through operator[]
+    vector<unsigned int>ucsout2;
+    it.rewind();
+    for (int i = 0; ; i++) {
+	unsigned int value;
+	if ((value = it[i]) == (unsigned int)-1) { // the error value ends the loop
+	    fprintf(stderr, "%d chars\n", i);
+	    break;
+	}
+	it++;
+	ucsout2.push_back(value);
+    }
+    // The two passes must have produced the same sequence of values.
+    if (ucsout1 != ucsout2) {
+	fprintf(stderr, "error: ucsout1 != ucsout2\n");
+	exit(1);
+    }
+    // Round-trip the collected values through transcode(): to the same encoding, then back to UTF-8.
+    ucsout2.clear();
+    int ercnt = 0; // initialised so the failure messages never print an indeterminate value
+    const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine
+    string ucs, ucs1;
+    for (vector<unsigned int>::iterator it = ucsout1.begin();
+	 it != ucsout1.end(); it++) {
+	unsigned int i = *it;
+	ucs.append((const char *)&i, 4); // raw bytes of each value, in processor order
+    }
+    if (!transcode(ucs, ucs1,
+		   encoding, encoding, &ercnt) || ercnt) {
+	fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt);
+	exit(1);
+    }
+    if (ucs.compare(ucs1)) {
+	fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n");
+	exit(1);
+    }
+
+    if (!transcode(ucs, ucs1,
+		   encoding, "UTF-8", &ercnt) || ercnt) {
+	fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n",
+		ercnt);
+	exit(1);
+    }
+    if (ucs1.compare(in)) {
+	fprintf(stderr, "Transcode back to utf-8 compare to in failed\n");
+	exit(1);
+    }
+    exit(0);
+}
+
diff --git a/src/utils/utf8iter.cpp b/src/utils/utf8iter.cpp
index 68f5132b..797f6189 100644
--- a/src/utils/utf8iter.cpp
+++ b/src/utils/utf8iter.cpp
@@ -1,185 +1,36 @@
-/* Copyright (C) 2005 J.F.Dockes
+/* Copyright (C) 2017-2019 J.F.Dockes
  *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
+ *   it under the terms of the GNU Lesser General Public License as published by
+ *   the Free Software Foundation; either version 2.1 of the License, or
  *   (at your option) any later version.
  *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ *   GNU Lesser General Public License for more details.
  *
- *   You should have received a copy of the GNU General Public License
+ *   You should have received a copy of the GNU Lesser General Public License
  *   along with this program; if not, write to the
  *   Free Software Foundation, Inc.,
  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  */
-#include <stdio.h>
-#include <stdlib.h>
 
-#include <string>
-#include <iostream>
-#include <vector>
-
-
-#include "log.h"
-#include "transcode.h"
-
-#ifndef NO_NAMESPACES
-using namespace std;
-#endif /* NO_NAMESPACES */
-
-#define UTF8ITER_CHECK
 #include "utf8iter.h"
-#include "readfile.h"
-#include "textsplit.h"
+#include <string>
 
-void tryempty()
+using std::string;
+
+void utf8truncate(std::string& s, int maxlen)
 {
-    Utf8Iter it("");
-    cout << "EOF ? " << it.eof() << endl;
-    TextSplit::isCJK(*it);
-    exit(0);
+    // Shorten s in place to at most maxlen bytes, cutting only at a position reported by Utf8Iter.
+    if (s.size() <= string::size_type(maxlen)) {
+        return; // Nothing to cut. A negative maxlen wraps to a huge unsigned value and also ends up here.
+    }
+    Utf8Iter iter(s);
+    string::size_type pos = 0; // Highest position seen so far that is not beyond maxlen
+    // <= keeps a position lying exactly on maxlen; the iterator only moves forward, so stop once past maxlen.
+    while (iter++ != string::npos && iter.getBpos() <= string::size_type(maxlen)) {
+        pos = iter.getBpos();
+    }
+    s.erase(pos);
 }
-
-const char *thisprog;
-static char usage [] =
-    "utf8iter [opts] infile outfile\n"
-    " converts infile to 32 bits unicode (processor order), for testing\n"
-    "-v : print stuff as we go\n"
-    ;
-
-void Usage() {
-    fprintf(stderr, "%s:%s\n", thisprog, usage);
-    exit(1);
-}
-static int     op_flags;
-#define OPT_v	  0x2 
-
-int main(int argc, char **argv)
-{
-    thisprog = argv[0];
-    argc--; argv++;
-
-    while (argc > 0 && **argv == '-') {
-	(*argv)++;
-	if (!(**argv))
-	    Usage();
-	while (**argv)
-	    switch (*(*argv)++) {
-	    case 'v':   op_flags |= OPT_v; break;
-
-	    default: Usage();	break;
-	    }
-	argc--;argv++;
-    }
-
-    if (argc != 2) {
-	Usage();
-    }
-    const char *infile = *argv++;argc--;
-    const char *outfile = *argv++;argc--;
-    string in;
-    if (!file_to_string(infile, in)) {
-	cerr << "Cant read file\n" << endl;
-	exit(1);
-    }
-    
-    vector<unsigned int>ucsout1;
-    string out, out1;
-    Utf8Iter it(in);
-    FILE *fp = fopen(outfile, "w");
-    if (fp == 0) {
-	fprintf(stderr, "cant create %s\n", outfile);
-	exit(1);
-    }
-
-    int nchars = 0;
-    for (;!it.eof(); it++) {
-	unsigned int value = *it;
-	if (value == (unsigned int)-1) {
-	    cerr << "Conversion error occurred\n" << endl;
-	    exit(1);
-	}
-	if (op_flags & OPT_v) {
-	   printf("Value: 0x%x", value);
-	   if (value < 0x7f)
-	       printf(" (%c) ", value);
-	   printf("\n");
-	}
-	// UTF-32LE or BE array
-	ucsout1.push_back(value);
-	// UTF-32LE or BE file
-	fwrite(&value, 4, 1, fp);
-
-	// Reconstructed utf8 strings (2 methods)
-	if (!it.appendchartostring(out))
-	    break;
-	// conversion to string
-	out1 += it;
-	
-	// fprintf(stderr, "%s", string(it).c_str());
-	nchars++;
-    }
-    fclose(fp);
-
-    fprintf(stderr, "nchars %d\n", nchars);
-    if (in.compare(out)) {
-	fprintf(stderr, "error: out != in\n");
-	exit(1);
-    }
-    if (in != out1) {
-	fprintf(stderr, "error: out1 != in\n");
-	exit(1);
-    }
-
-    // Rewind and do it a second time
-    vector<unsigned int>ucsout2;
-    it.rewind();
-    for (int i = 0; ; i++) {
-	unsigned int value;
-	if ((value = it[i]) == (unsigned int)-1) {
-	    fprintf(stderr, "%d chars\n", i);
-	    break;
-	}
-	it++;
-	ucsout2.push_back(value);
-    }
-
-    if (ucsout1 != ucsout2) {
-	fprintf(stderr, "error: ucsout1 != ucsout2\n");
-	exit(1);
-    }
-
-    ucsout2.clear();
-    int ercnt;
-    const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine
-    string ucs, ucs1;
-    for (vector<unsigned int>::iterator it = ucsout1.begin(); 
-	 it != ucsout1.end(); it++) {
-	unsigned int i = *it;
-	ucs.append((const char *)&i, 4);
-    }
-    if (!transcode(ucs, ucs1, 
-		   encoding, encoding, &ercnt) || ercnt) {
-	fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt);
-	exit(1);
-    }
-    if (ucs.compare(ucs1)) {
-	fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n");
-	exit(1);
-    }
-
-    if (!transcode(ucs, ucs1, 
-		   encoding, "UTF-8", &ercnt) || ercnt) {
-	fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n",
-		ercnt);
-	exit(1);
-    }
-    if (ucs1.compare(in)) {
-	fprintf(stderr, "Transcode back to utf-8 compare to in failed\n");
-	exit(1);
-    }
-    exit(0);
-}
-
diff --git a/src/utils/utf8iter.h b/src/utils/utf8iter.h
index c5e30d1c..8416be62 100644
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@@ -273,4 +273,6 @@ private:
 };
 
 
+/** Shorten s in place to at most maxlen bytes; the cut position is chosen with Utf8Iter (see utf8iter.cpp). */
+extern void utf8truncate(std::string& s, int maxlen);
 #endif /* _UTF8ITER_H_INCLUDED_ */