From 5fa7c73b7cbb797d1b93b24947a0c982a3956a42 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 3 Oct 2020 15:20:14 +0200 Subject: [PATCH] utf8truncate: max size was specified to be bytes but used as character count --- src/testmains/trutf8iter.cpp | 38 ++++++++++++++++++++++++++++++------ src/utils/utf8iter.cpp | 13 +++++------- src/utils/utf8iter.h | 19 ++++++++++++------ 3 files changed, 50 insertions(+), 20 deletions(-) diff --git a/src/testmains/trutf8iter.cpp b/src/testmains/trutf8iter.cpp index 4613d9da..fcfb5180 100644 --- a/src/testmains/trutf8iter.cpp +++ b/src/testmains/trutf8iter.cpp @@ -44,10 +44,11 @@ void tryempty() const char *thisprog; static char usage [] = - "utf8iter [opts] infile outfile\n" - " converts infile to 32 bits unicode (processor order), for testing\n" - "-v : print stuff as we go\n" - ; +"utf8iter [opts] infile outfile\n" +" converts infile to 32 bits unicode (processor order), for testing\n" +" -v : print stuff as we go\n" +"-t [-w] [-e] : test truncation\n" +; void Usage() { fprintf(stderr, "%s:%s\n", thisprog, usage); @@ -55,9 +56,23 @@ void Usage() { } static int op_flags; #define OPT_v 0x2 +#define OPT_t 0x4 +#define OPT_w 0x8 +#define OPT_e 0x10 + +int trytruncate(std::string s, int maxlen) +{ + int flag = 0; + if (op_flags & OPT_w) + flag |= UTF8T_ATWORD; + if (op_flags & OPT_e) + flag |= UTF8T_ELLIPSIS; + utf8truncate(s, maxlen, flag); + std::cout << "Truncation result:[" << s << "]\n"; + return 0; +} FILE *infout = stdout; - int main(int argc, char **argv) { thisprog = argv[0]; @@ -69,12 +84,23 @@ int main(int argc, char **argv) Usage(); while (**argv) switch (*(*argv)++) { + case 'e': op_flags |= OPT_e;break; + case 't': op_flags |= OPT_t;break; case 'v': op_flags |= OPT_v; break; - + case 'w': op_flags |= OPT_w;break; default: Usage(); break; } argc--;argv++; } + + if (op_flags & OPT_t) { + if (argc < 2) + Usage(); + std::string s = *argv++;argc--; + int maxlen = atoi(*argv++);argc--; + return trytruncate(s, maxlen); + } + string infile, outfile; if (argc == 2) { infile = *argv++;argc--; diff --git a/src/utils/utf8iter.cpp b/src/utils/utf8iter.cpp index bf2eec16..7d588319 100644 --- a/src/utils/utf8iter.cpp +++ b/src/utils/utf8iter.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2017-2019 J.F.Dockes +/* Copyright (C) 2017-2020 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or @@ -17,12 +17,13 @@ #include "utf8iter.h" +#include #include #include using namespace std; -void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis, +void utf8truncate(string& s, int maxlen, int flags, const string& ellipsis, const string& ws) { if (s.size() <= string::size_type(maxlen)) { @@ -39,11 +40,7 @@ void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis, if (flags & UTF8T_ELLIPSIS) { size_t ellen = utf8len(ellipsis); - if (maxlen > int(ellen)) { - maxlen -= ellen; - } else { - maxlen = 0; - } + maxlen = std::max(0, maxlen - int(ellen)); } Utf8Iter iter(s); @@ -51,7 +48,7 @@ void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis, string::size_type lastwspos = 0; for (; !iter.eof(); iter++) { unsigned int c = *iter; - if (iter.getCpos() < string::size_type(maxlen)) { + if (iter.getBpos() < string::size_type(maxlen)) { pos = iter.getBpos() + iter.getBlen(); if ((flags & UTF8T_ATWORD) && wss.find(c) != wss.end()) { lastwspos = pos; diff --git a/src/utils/utf8iter.h b/src/utils/utf8iter.h index dc61eaaa..c1c05609 100644 --- a/src/utils/utf8iter.h +++ b/src/utils/utf8iter.h @@ -281,11 +281,18 @@ private: enum Utf8TruncateFlag {UTF8T_NONE, UTF8T_ATWORD, UTF8T_ELLIPSIS}; -// maxlen is in utf-8 chars. -extern void utf8truncate(std::string& s, int maxlen, int flags = 0, - const std::string& ellipsis = "...", - const std::string& ws = " \t\n\r"); -extern size_t utf8len(const std::string& s); + +/** Truncate utf8 string, maintaining encoding integrity + * @param s input string to be modified in place + * @param maxlen maximum size after truncation in bytes + * @param flags Specify cutting at word position, adding an ellipsis + */ +void utf8truncate(std::string& s, int maxlen, int flags = 0, + const std::string& ellipsis = "...", + const std::string& ws = " \t\n\r"); + +/** Compute length in characters of utf-8 string */ +size_t utf8len(const std::string& s); /** @brief Check and possibly fix string by replacing badly encoded * characters with the standard question mark replacement character. @@ -297,7 +304,7 @@ extern size_t utf8len(const std::string& s); * @return -1 for failure (fixit false or maxrepl reached). * 0 or positive: replacement count. */ -extern int utf8check( +int utf8check( const std::string& in, std::string& out, bool fixit=false, int maxrepl=100); #endif /* _UTF8ITER_H_INCLUDED_ */