utf8 truncate utility function

2019-02-03 17:56:41 +01:00 · 2019-02-03 17:56:41 +01:00 · c2691f68bf
commit c2691f68bf
parent a16e39a92b
4 changed files with 208 additions and 169 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -258,6 +258,7 @@ utils/strmatcher.cpp \
 utils/strmatcher.h \
 utils/transcode.cpp \
 utils/transcode.h \
+utils/utf8iter.cpp \
 utils/utf8iter.h \
 utils/wipedir.cpp \
 utils/wipedir.h \
--- a/src/testmains/trutf8iter.cpp
+++ b/src/testmains/trutf8iter.cpp
@ -0,0 +1,185 @@
+/* Copyright (C) 2005 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+#include <iostream>
+#include <vector>
+
+
+#include "log.h"
+#include "transcode.h"
+
+#ifndef NO_NAMESPACES
+using namespace std;
+#endif /* NO_NAMESPACES */
+
+#define UTF8ITER_CHECK
+#include "utf8iter.h"
+#include "readfile.h"
+#include "textsplit.h"
+
+void tryempty()
+{
+    Utf8Iter it("");
+    cout << "EOF ? " << it.eof() << endl;
+    TextSplit::isCJK(*it);
+    exit(0);
+}
+
+const char *thisprog;
+static char usage [] =
+    "utf8iter [opts] infile outfile\n"
+    " converts infile to 32 bits unicode (processor order), for testing\n"
+    "-v : print stuff as we go\n"
+    ;
+
+void Usage() {
+    fprintf(stderr, "%s:%s\n", thisprog, usage);
+    exit(1);
+}
+static int     op_flags;
+#define OPT_v	  0x2 
+
+int main(int argc, char **argv)
+{
+    thisprog = argv[0];
+    argc--; argv++;
+
+    while (argc > 0 && **argv == '-') {
+	(*argv)++;
+	if (!(**argv))
+	    Usage();
+	while (**argv)
+	    switch (*(*argv)++) {
+	    case 'v':   op_flags |= OPT_v; break;
+
+	    default: Usage();	break;
+	    }
+	argc--;argv++;
+    }
+
+    if (argc != 2) {
+	Usage();
+    }
+    const char *infile = *argv++;argc--;
+    const char *outfile = *argv++;argc--;
+    string in;
+    if (!file_to_string(infile, in)) {
+	cerr << "Cant read file\n" << endl;
+	exit(1);
+    }
+    
+    vector<unsigned int>ucsout1;
+    string out, out1;
+    Utf8Iter it(in);
+    FILE *fp = fopen(outfile, "w");
+    if (fp == 0) {
+	fprintf(stderr, "cant create %s\n", outfile);
+	exit(1);
+    }
+
+    int nchars = 0;
+    for (;!it.eof(); it++) {
+	unsigned int value = *it;
+	if (value == (unsigned int)-1) {
+	    cerr << "Conversion error occurred\n" << endl;
+	    exit(1);
+	}
+	if (op_flags & OPT_v) {
+	   printf("Value: 0x%x", value);
+	   if (value < 0x7f)
+	       printf(" (%c) ", value);
+	   printf("\n");
+	}
+	// UTF-32LE or BE array
+	ucsout1.push_back(value);
+	// UTF-32LE or BE file
+	fwrite(&value, 4, 1, fp);
+
+	// Reconstructed utf8 strings (2 methods)
+	if (!it.appendchartostring(out))
+	    break;
+	// conversion to string
+	out1 += it;
+	
+	// fprintf(stderr, "%s", string(it).c_str());
+	nchars++;
+    }
+    fclose(fp);
+
+    fprintf(stderr, "nchars %d\n", nchars);
+    if (in.compare(out)) {
+	fprintf(stderr, "error: out != in\n");
+	exit(1);
+    }
+    if (in != out1) {
+	fprintf(stderr, "error: out1 != in\n");
+	exit(1);
+    }
+
+    // Rewind and do it a second time
+    vector<unsigned int>ucsout2;
+    it.rewind();
+    for (int i = 0; ; i++) {
+	unsigned int value;
+	if ((value = it[i]) == (unsigned int)-1) {
+	    fprintf(stderr, "%d chars\n", i);
+	    break;
+	}
+	it++;
+	ucsout2.push_back(value);
+    }
+
+    if (ucsout1 != ucsout2) {
+	fprintf(stderr, "error: ucsout1 != ucsout2\n");
+	exit(1);
+    }
+
+    ucsout2.clear();
+    int ercnt;
+    const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine
+    string ucs, ucs1;
+    for (vector<unsigned int>::iterator it = ucsout1.begin(); 
+	 it != ucsout1.end(); it++) {
+	unsigned int i = *it;
+	ucs.append((const char *)&i, 4);
+    }
+    if (!transcode(ucs, ucs1, 
+		   encoding, encoding, &ercnt) || ercnt) {
+	fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt);
+	exit(1);
+    }
+    if (ucs.compare(ucs1)) {
+	fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n");
+	exit(1);
+    }
+
+    if (!transcode(ucs, ucs1, 
+		   encoding, "UTF-8", &ercnt) || ercnt) {
+	fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n",
+		ercnt);
+	exit(1);
+    }
+    if (ucs1.compare(in)) {
+	fprintf(stderr, "Transcode back to utf-8 compare to in failed\n");
+	exit(1);
+    }
+    exit(0);
+}
+
--- a/src/utils/utf8iter.cpp
+++ b/src/utils/utf8iter.cpp
@ -1,185 +1,36 @@
-/* Copyright (C) 2005 J.F.Dockes
+/* Copyright (C) 2017-2019 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
+ *   it under the terms of the GNU Lesser General Public License as published by
+ *   the Free Software Foundation; either version 2.1 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ *   GNU Lesser General Public License for more details.
 *
- *   You should have received a copy of the GNU General Public License
+ *   You should have received a copy of the GNU Lesser General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
-#include <stdio.h>
-#include <stdlib.h>

-#include <string>
-#include <iostream>
-#include <vector>
-
-
-#include "log.h"
-#include "transcode.h"
-
-#ifndef NO_NAMESPACES
-using namespace std;
-#endif /* NO_NAMESPACES */
-
-#define UTF8ITER_CHECK
 #include "utf8iter.h"
-#include "readfile.h"
-#include "textsplit.h"
+#include <string>

-void tryempty()
+using std::string;
+
+void utf8truncate(std::string& s, int maxlen)
 {
-    Utf8Iter it("");
-    cout << "EOF ? " << it.eof() << endl;
-    TextSplit::isCJK(*it);
-    exit(0);
+    if (s.size() <= string::size_type(maxlen)) {
+        return;
+    }
+    Utf8Iter iter(s);
+    string::size_type pos = 0;
+    while (iter++ != string::npos)
+        if (iter.getBpos() < string::size_type(maxlen)) {
+            pos = iter.getBpos();
+        }
+
+    s.erase(pos);
 }
-
-const char *thisprog;
-static char usage [] =
-    "utf8iter [opts] infile outfile\n"
-    " converts infile to 32 bits unicode (processor order), for testing\n"
-    "-v : print stuff as we go\n"
-    ;
-
-void Usage() {
-    fprintf(stderr, "%s:%s\n", thisprog, usage);
-    exit(1);
-}
-static int     op_flags;
-#define OPT_v	  0x2 
-
-int main(int argc, char **argv)
-{
-    thisprog = argv[0];
-    argc--; argv++;
-
-    while (argc > 0 && **argv == '-') {
-	(*argv)++;
-	if (!(**argv))
-	    Usage();
-	while (**argv)
-	    switch (*(*argv)++) {
-	    case 'v':   op_flags |= OPT_v; break;
-
-	    default: Usage();	break;
-	    }
-	argc--;argv++;
-    }
-
-    if (argc != 2) {
-	Usage();
-    }
-    const char *infile = *argv++;argc--;
-    const char *outfile = *argv++;argc--;
-    string in;
-    if (!file_to_string(infile, in)) {
-	cerr << "Cant read file\n" << endl;
-	exit(1);
-    }
-    
-    vector<unsigned int>ucsout1;
-    string out, out1;
-    Utf8Iter it(in);
-    FILE *fp = fopen(outfile, "w");
-    if (fp == 0) {
-	fprintf(stderr, "cant create %s\n", outfile);
-	exit(1);
-    }
-
-    int nchars = 0;
-    for (;!it.eof(); it++) {
-	unsigned int value = *it;
-	if (value == (unsigned int)-1) {
-	    cerr << "Conversion error occurred\n" << endl;
-	    exit(1);
-	}
-	if (op_flags & OPT_v) {
-	   printf("Value: 0x%x", value);
-	   if (value < 0x7f)
-	       printf(" (%c) ", value);
-	   printf("\n");
-	}
-	// UTF-32LE or BE array
-	ucsout1.push_back(value);
-	// UTF-32LE or BE file
-	fwrite(&value, 4, 1, fp);
-
-	// Reconstructed utf8 strings (2 methods)
-	if (!it.appendchartostring(out))
-	    break;
-	// conversion to string
-	out1 += it;
-	
-	// fprintf(stderr, "%s", string(it).c_str());
-	nchars++;
-    }
-    fclose(fp);
-
-    fprintf(stderr, "nchars %d\n", nchars);
-    if (in.compare(out)) {
-	fprintf(stderr, "error: out != in\n");
-	exit(1);
-    }
-    if (in != out1) {
-	fprintf(stderr, "error: out1 != in\n");
-	exit(1);
-    }
-
-    // Rewind and do it a second time
-    vector<unsigned int>ucsout2;
-    it.rewind();
-    for (int i = 0; ; i++) {
-	unsigned int value;
-	if ((value = it[i]) == (unsigned int)-1) {
-	    fprintf(stderr, "%d chars\n", i);
-	    break;
-	}
-	it++;
-	ucsout2.push_back(value);
-    }
-
-    if (ucsout1 != ucsout2) {
-	fprintf(stderr, "error: ucsout1 != ucsout2\n");
-	exit(1);
-    }
-
-    ucsout2.clear();
-    int ercnt;
-    const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine
-    string ucs, ucs1;
-    for (vector<unsigned int>::iterator it = ucsout1.begin(); 
-	 it != ucsout1.end(); it++) {
-	unsigned int i = *it;
-	ucs.append((const char *)&i, 4);
-    }
-    if (!transcode(ucs, ucs1, 
-		   encoding, encoding, &ercnt) || ercnt) {
-	fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt);
-	exit(1);
-    }
-    if (ucs.compare(ucs1)) {
-	fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n");
-	exit(1);
-    }
-
-    if (!transcode(ucs, ucs1, 
-		   encoding, "UTF-8", &ercnt) || ercnt) {
-	fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n",
-		ercnt);
-	exit(1);
-    }
-    if (ucs1.compare(in)) {
-	fprintf(stderr, "Transcode back to utf-8 compare to in failed\n");
-	exit(1);
-    }
-    exit(0);
-}
-
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -273,4 +273,6 @@ private:
 };


+extern void utf8truncate(std::string& s, int maxlen);
+
 #endif /* _UTF8ITER_H_INCLUDED_ */