utf8iter: utility function giving the UTF-8 byte count for a code point

2021-11-02 09:24:06 +01:00 · 2021-11-02 09:24:06 +01:00 · e3e270fe81
commit e3e270fe81
parent fa790f52de
2 changed files with 30 additions and 1 deletions
--- a/src/testmains/trutf8iter.cpp
+++ b/src/testmains/trutf8iter.cpp
@ -48,6 +48,7 @@ static char usage [] =
 " converts infile to 32 bits unicode (processor order), for testing\n"
 "  -v : print stuff as we go\n"
 "-t [-w] [-e] <string> <maxlen> : test truncation\n"
 "-c <str> : str must be a single utf-8 char. Convert to code then show character bytes count\n"
 ;
 void Usage() {
@ -59,6 +60,7 @@ static int     op_flags;
 #define OPT_t     0x4
 #define OPT_w     0x8
 #define OPT_e     0x10
 #define OPT_c     0x20
 int trytruncate(std::string s, int maxlen)
 {
@ -86,13 +88,26 @@ int main(int argc, char **argv)
            switch (*(*argv)++) {
            case 'e': op_flags |= OPT_e;break;
            case 't': op_flags |= OPT_t;break;
-            case 'v':   op_flags |= OPT_v; break;
+            case 'v': op_flags |= OPT_v;break;
            case 'w': op_flags |= OPT_w;break;
            case 'c': op_flags |= OPT_c;break;
            default: Usage();   break;
            }
        argc--;argv++;
    }
    if (op_flags & OPT_c) {
        if (argc != 1)
            Usage();
        std::string s = *argv++;argc--;
        Utf8Iter uit(s);
        auto code = *uit;
        auto cnt = utf8codepointsize(code);
        std::cout << "0x" << std::hex << code << std::dec << " : " << cnt << " byte" <<
            (cnt>1?"s":"") << "\n";
        return 0;
    }
    if (op_flags & OPT_t) {
        if (argc < 2)
            Usage();
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -295,6 +295,20 @@ void utf8truncate(std::string& s, int maxlen, int flags = 0,
 /** Compute length in characters of utf-8 string */
 size_t utf8len(const std::string& s);
 /** Return the number of bytes needed to encode a Unicode code point in UTF-8.
 *
 * Ranges: U+0000..U+007F -> 1, U+0080..U+07FF -> 2, U+0800..U+FFFF -> 3,
 * anything above U+FFFF -> 4.
 *
 * No validity check is performed: surrogate values and values beyond
 * U+10FFFF are sized by magnitude only (the latter yield 4).
 *
 * @param codepoint the code point value.
 * @return the byte count, from 1 to 4.
 */
 inline int utf8codepointsize(uint32_t codepoint)
 {
    if (codepoint <= 0x7F) {
        return 1;
    } else if (codepoint <= 0x7FF) {
        return 2;
    } else if (codepoint <= 0xFFFF) {
        // Inclusive bound: U+FFFF itself is still a 3-byte sequence (EF BF BF).
        return 3;
    } else {
        return 4;
    }
 }
 /** @brief Check and possibly fix string by replacing badly encoded
 * characters with the standard question mark replacement character.
 *