utf8truncate: max size was specified to be bytes but used as character count

2020-10-03 15:20:14 +02:00 · 2020-10-03 15:20:14 +02:00 · 5fa7c73b7c
commit 5fa7c73b7c
parent 5146fc75cf
3 changed files with 50 additions and 20 deletions
--- a/src/testmains/trutf8iter.cpp
+++ b/src/testmains/trutf8iter.cpp
@ -44,10 +44,11 @@ void tryempty()
 const char *thisprog;
 static char usage [] =
-                                    "utf8iter [opts] infile outfile\n"
+"utf8iter [opts] infile outfile\n"
-                                    " converts infile to 32 bits unicode (processor order), for testing\n"
+" converts infile to 32 bits unicode (processor order), for testing\n"
-                                    "-v : print stuff as we go\n"
+"  -v : print stuff as we go\n"
-                                    ;
+"-t [-w] [-e] <string> <maxlen> : test truncation\n"
 ;
 void Usage() {
    fprintf(stderr, "%s:%s\n", thisprog, usage);
@ -55,9 +56,23 @@ void Usage() {
 }
 // Bitmask of the command line options seen so far; main() ORs in one
 // OPT_* bit per option letter.
 static int     op_flags;
 // -v : print stuff as we go
 #define OPT_v     0x2 
 // -t : run the truncation test (trytruncate()) instead of the conversion
 #define OPT_t     0x4
 // -w : with -t, request UTF8T_ATWORD
 #define OPT_w     0x8
 // -e : with -t, request UTF8T_ELLIPSIS
 #define OPT_e     0x10
 // Driver for the -t option: run utf8truncate() on a private copy of s
 // with the given maxlen and print the outcome on stdout.
 // The -w and -e command line options select UTF8T_ATWORD and
 // UTF8T_ELLIPSIS respectively. Always returns 0.
 int trytruncate(std::string s, int maxlen)
 {
    const bool cutAtWord = (op_flags & OPT_w) != 0;
    const bool addEllipsis = (op_flags & OPT_e) != 0;

    // Build the utf8truncate() flag word from the selected options.
    const int truncFlags =
        (cutAtWord ? UTF8T_ATWORD : 0) | (addEllipsis ? UTF8T_ELLIPSIS : 0);

    // s was taken by value, so the in-place truncation stays local.
    utf8truncate(s, maxlen, truncFlags);
    std::cout << "Truncation result:[" << s << "]\n";
    return 0;
 }
 FILE *infout = stdout;
 int main(int argc, char **argv)
 {
    thisprog = argv[0];
@ -69,12 +84,23 @@ int main(int argc, char **argv)
            Usage();
        while (**argv)
            switch (*(*argv)++) {
            case 'e': op_flags |= OPT_e;break;
            case 't': op_flags |= OPT_t;break;
            case 'v':   op_flags |= OPT_v; break;
-
+            case 'w': op_flags |= OPT_w;break;
            default: Usage();   break;
            }
        argc--;argv++;
    }
    if (op_flags & OPT_t) {
        if (argc < 2)
            Usage();
        std::string s = *argv++;argc--;
        int maxlen = atoi(*argv++);argc--;
        return trytruncate(s, maxlen);
    }
    string infile, outfile;
    if (argc == 2) {
        infile = *argv++;argc--;
--- a/src/utils/utf8iter.cpp
+++ b/src/utils/utf8iter.cpp
@ -1,4 +1,4 @@
-/* Copyright (C) 2017-2019 J.F.Dockes
+/* Copyright (C) 2017-2020 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU Lesser General Public License as published by
 *   the Free Software Foundation; either version 2.1 of the License, or
@ -17,12 +17,13 @@
 #include "utf8iter.h"
 #include <algorithm>
 #include <unordered_set>
 #include <iostream>
 using namespace std;
-void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
+void utf8truncate(string& s, int maxlen, int flags, const string& ellipsis,
                  const string& ws)
 {
    if (s.size() <= string::size_type(maxlen)) {
@ -39,11 +40,7 @@ void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
    if (flags & UTF8T_ELLIPSIS) {
        size_t ellen = utf8len(ellipsis);
-        if (maxlen > int(ellen)) {
+        maxlen = std::max(0, maxlen - int(ellen));
            maxlen -= ellen;
        } else {
            maxlen = 0;
        }
    }
    Utf8Iter iter(s);
@ -51,7 +48,7 @@ void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
    string::size_type lastwspos = 0;
    for (; !iter.eof(); iter++) {
        unsigned int c = *iter;
-        if (iter.getCpos() < string::size_type(maxlen)) {
+        if (iter.getBpos() < string::size_type(maxlen)) {
            pos = iter.getBpos() + iter.getBlen();
            if ((flags & UTF8T_ATWORD) && wss.find(c) != wss.end()) {
                lastwspos = pos;
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -281,11 +281,18 @@ private:
 // Option values for the 'flags' argument of utf8truncate(). The
 // enumerators take the implicit values 0, 1 and 2, and the non-zero ones
 // are combined with bitwise OR into an int by callers.
 enum Utf8TruncateFlag {UTF8T_NONE, UTF8T_ATWORD, UTF8T_ELLIPSIS};
-// maxlen is in utf-8 chars.
+
-extern void utf8truncate(std::string& s, int maxlen, int flags = 0,
+/** Truncate utf8 string, maintaining encoding integrity
-                         const std::string& ellipsis = "...",
+ * @param s input string to be modified in place
-                         const std::string& ws = " \t\n\r");
+ * @param maxlen maximum size after truncation in bytes
-extern size_t utf8len(const std::string& s);
+ * @param flags Specify cutting at word position, adding an ellipsis
 */
 void utf8truncate(std::string& s, int maxlen, int flags = 0,
                  const std::string& ellipsis = "...",
                  const std::string& ws = " \t\n\r");
 /** Compute length in characters of utf-8 string */
 size_t utf8len(const std::string& s);
 /** @brief Check and possibly fix string by replacing badly encoded
 * characters with the standard question mark replacement character.
@ -297,7 +304,7 @@ extern size_t utf8len(const std::string& s);
 * @return -1 for failure (fixit false or maxrepl reached). 
 *   0 or positive: replacement count.
 */
-extern int utf8check(
+int utf8check(
    const std::string& in, std::string& out, bool fixit=false, int maxrepl=100);
 #endif /* _UTF8ITER_H_INCLUDED_ */