From c2691f68bf9d8d15a78fa36716739e00d262f0c5 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 3 Feb 2019 17:56:41 +0100 Subject: [PATCH] utf8 truncate utility function --- src/Makefile.am | 1 + src/testmains/trutf8iter.cpp | 185 ++++++++++++++++++++++++++++++++++ src/utils/utf8iter.cpp | 189 ++++------------------------------- src/utils/utf8iter.h | 2 + 4 files changed, 208 insertions(+), 169 deletions(-) create mode 100644 src/testmains/trutf8iter.cpp diff --git a/src/Makefile.am b/src/Makefile.am index a0f13ee2..bf6ae668 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -258,6 +258,7 @@ utils/strmatcher.cpp \ utils/strmatcher.h \ utils/transcode.cpp \ utils/transcode.h \ +utils/utf8iter.cpp \ utils/utf8iter.h \ utils/wipedir.cpp \ utils/wipedir.h \ diff --git a/src/testmains/trutf8iter.cpp b/src/testmains/trutf8iter.cpp new file mode 100644 index 00000000..68f5132b --- /dev/null +++ b/src/testmains/trutf8iter.cpp @@ -0,0 +1,185 @@ +/* Copyright (C) 2005 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#include +#include + +#include +#include +#include + + +#include "log.h" +#include "transcode.h" + +#ifndef NO_NAMESPACES +using namespace std; +#endif /* NO_NAMESPACES */ + +#define UTF8ITER_CHECK +#include "utf8iter.h" +#include "readfile.h" +#include "textsplit.h" + +void tryempty() +{ + Utf8Iter it(""); + cout << "EOF ? " << it.eof() << endl; + TextSplit::isCJK(*it); + exit(0); +} + +const char *thisprog; +static char usage [] = + "utf8iter [opts] infile outfile\n" + " converts infile to 32 bits unicode (processor order), for testing\n" + "-v : print stuff as we go\n" + ; + +void Usage() { + fprintf(stderr, "%s:%s\n", thisprog, usage); + exit(1); +} +static int op_flags; +#define OPT_v 0x2 + +int main(int argc, char **argv) +{ + thisprog = argv[0]; + argc--; argv++; + + while (argc > 0 && **argv == '-') { + (*argv)++; + if (!(**argv)) + Usage(); + while (**argv) + switch (*(*argv)++) { + case 'v': op_flags |= OPT_v; break; + + default: Usage(); break; + } + argc--;argv++; + } + + if (argc != 2) { + Usage(); + } + const char *infile = *argv++;argc--; + const char *outfile = *argv++;argc--; + string in; + if (!file_to_string(infile, in)) { + cerr << "Cant read file\n" << endl; + exit(1); + } + + vectorucsout1; + string out, out1; + Utf8Iter it(in); + FILE *fp = fopen(outfile, "w"); + if (fp == 0) { + fprintf(stderr, "cant create %s\n", outfile); + exit(1); + } + + int nchars = 0; + for (;!it.eof(); it++) { + unsigned int value = *it; + if (value == (unsigned int)-1) { + cerr << "Conversion error occurred\n" << endl; + exit(1); + } + if (op_flags & OPT_v) { + printf("Value: 0x%x", value); + if (value < 0x7f) + printf(" (%c) ", value); + printf("\n"); + } + // UTF-32LE or BE array + ucsout1.push_back(value); + // UTF-32LE or BE file + fwrite(&value, 4, 1, fp); + + // Reconstructed utf8 strings (2 methods) + if (!it.appendchartostring(out)) + break; + // conversion to string + out1 += it; + + // fprintf(stderr, "%s", string(it).c_str()); + nchars++; + } + 
fclose(fp); + + fprintf(stderr, "nchars %d\n", nchars); + if (in.compare(out)) { + fprintf(stderr, "error: out != in\n"); + exit(1); + } + if (in != out1) { + fprintf(stderr, "error: out1 != in\n"); + exit(1); + } + + // Rewind and do it a second time + vectorucsout2; + it.rewind(); + for (int i = 0; ; i++) { + unsigned int value; + if ((value = it[i]) == (unsigned int)-1) { + fprintf(stderr, "%d chars\n", i); + break; + } + it++; + ucsout2.push_back(value); + } + + if (ucsout1 != ucsout2) { + fprintf(stderr, "error: ucsout1 != ucsout2\n"); + exit(1); + } + + ucsout2.clear(); + int ercnt; + const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine + string ucs, ucs1; + for (vector::iterator it = ucsout1.begin(); + it != ucsout1.end(); it++) { + unsigned int i = *it; + ucs.append((const char *)&i, 4); + } + if (!transcode(ucs, ucs1, + encoding, encoding, &ercnt) || ercnt) { + fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt); + exit(1); + } + if (ucs.compare(ucs1)) { + fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n"); + exit(1); + } + + if (!transcode(ucs, ucs1, + encoding, "UTF-8", &ercnt) || ercnt) { + fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n", + ercnt); + exit(1); + } + if (ucs1.compare(in)) { + fprintf(stderr, "Transcode back to utf-8 compare to in failed\n"); + exit(1); + } + exit(0); +} + diff --git a/src/utils/utf8iter.cpp b/src/utils/utf8iter.cpp index 68f5132b..797f6189 100644 --- a/src/utils/utf8iter.cpp +++ b/src/utils/utf8iter.cpp @@ -1,185 +1,36 @@ -/* Copyright (C) 2005 J.F.Dockes +/* Copyright (C) 2017-2019 J.F.Dockes * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or * (at 
your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#include -#include -#include -#include -#include - - -#include "log.h" -#include "transcode.h" - -#ifndef NO_NAMESPACES -using namespace std; -#endif /* NO_NAMESPACES */ - -#define UTF8ITER_CHECK #include "utf8iter.h" -#include "readfile.h" -#include "textsplit.h" +#include -void tryempty() +using std::string; +/* Shorten s in place to at most maxlen bytes. The cut is only ever made at a byte position returned by Utf8Iter::getBpos() after a step (assumed to be a character boundary). A string of at most maxlen bytes is left untouched; a negative maxlen converts to a huge size_type and therefore also leaves s untouched. */ +void utf8truncate(std::string& s, int maxlen) { - Utf8Iter it(""); - cout << "EOF ? 
" << it.eof() << endl; - TextSplit::isCJK(*it); - exit(0); + if (s.size() <= string::size_type(maxlen)) { + return; + } + Utf8Iter iter(s); + string::size_type pos = 0; + while (iter++ != string::npos) { + if (iter.getBpos() > string::size_type(maxlen)) break; /* past the limit: stop scanning, later positions are assumed to be larger still */ + pos = iter.getBpos(); /* a cut exactly at maxlen is accepted, consistent with the size() <= maxlen test above */ + } + /* pos is still 0 if no position fitted, in which case s is emptied */ + s.erase(pos); } - -const char *thisprog; -static char usage [] = - "utf8iter [opts] infile outfile\n" - " converts infile to 32 bits unicode (processor order), for testing\n" - "-v : print stuff as we go\n" - ; - -void Usage() { - fprintf(stderr, "%s:%s\n", thisprog, usage); - exit(1); -} -static int op_flags; -#define OPT_v 0x2 - -int main(int argc, char **argv) -{ - thisprog = argv[0]; - argc--; argv++; - - while (argc > 0 && **argv == '-') { - (*argv)++; - if (!(**argv)) - Usage(); - while (**argv) - switch (*(*argv)++) { - case 'v': op_flags |= OPT_v; break; - - default: Usage(); break; - } - argc--;argv++; - } - - if (argc != 2) { - Usage(); - } - const char *infile = *argv++;argc--; - const char *outfile = *argv++;argc--; - string in; - if (!file_to_string(infile, in)) { - cerr << "Cant read file\n" << endl; - exit(1); - } - - vectorucsout1; - string out, out1; - Utf8Iter it(in); - FILE *fp = fopen(outfile, "w"); - if (fp == 0) { - fprintf(stderr, "cant create %s\n", outfile); - exit(1); - } - - int nchars = 0; - for (;!it.eof(); it++) { - unsigned int value = *it; - if (value == (unsigned int)-1) { - cerr << "Conversion error occurred\n" << endl; - exit(1); - } - if (op_flags & OPT_v) { - printf("Value: 0x%x", value); - if (value < 0x7f) - printf(" (%c) ", value); - printf("\n"); - } - // UTF-32LE or BE array - ucsout1.push_back(value); - // UTF-32LE or BE file - fwrite(&value, 4, 1, fp); - - // Reconstructed utf8 strings (2 methods) - if (!it.appendchartostring(out)) - break; - // conversion to string - out1 += it; - - // fprintf(stderr, "%s", string(it).c_str()); - nchars++; - } - fclose(fp); - - fprintf(stderr, "nchars %d\n", nchars); - if (in.compare(out)) { - 
fprintf(stderr, "error: out != in\n"); - exit(1); - } - if (in != out1) { - fprintf(stderr, "error: out1 != in\n"); - exit(1); - } - - // Rewind and do it a second time - vectorucsout2; - it.rewind(); - for (int i = 0; ; i++) { - unsigned int value; - if ((value = it[i]) == (unsigned int)-1) { - fprintf(stderr, "%d chars\n", i); - break; - } - it++; - ucsout2.push_back(value); - } - - if (ucsout1 != ucsout2) { - fprintf(stderr, "error: ucsout1 != ucsout2\n"); - exit(1); - } - - ucsout2.clear(); - int ercnt; - const char *encoding = "UTF-32LE"; // note : use BE on high-endian machine - string ucs, ucs1; - for (vector::iterator it = ucsout1.begin(); - it != ucsout1.end(); it++) { - unsigned int i = *it; - ucs.append((const char *)&i, 4); - } - if (!transcode(ucs, ucs1, - encoding, encoding, &ercnt) || ercnt) { - fprintf(stderr, "Transcode check failed, ercount: %d\n", ercnt); - exit(1); - } - if (ucs.compare(ucs1)) { - fprintf(stderr, "error: ucsout1 != ucsout2 after iconv\n"); - exit(1); - } - - if (!transcode(ucs, ucs1, - encoding, "UTF-8", &ercnt) || ercnt) { - fprintf(stderr, "Transcode back to utf-8 check failed, ercount: %d\n", - ercnt); - exit(1); - } - if (ucs1.compare(in)) { - fprintf(stderr, "Transcode back to utf-8 compare to in failed\n"); - exit(1); - } - exit(0); -} - diff --git a/src/utils/utf8iter.h b/src/utils/utf8iter.h index c5e30d1c..8416be62 100644 --- a/src/utils/utf8iter.h +++ b/src/utils/utf8iter.h @@ -273,4 +273,6 @@ private: }; +extern void utf8truncate(std::string& s, int maxlen); + #endif /* _UTF8ITER_H_INCLUDED_ */