utf8iter: utility function giving bytes count for code

2021-11-02 09:24:06 +01:00 · 2021-11-02 09:24:06 +01:00 · e3e270fe81
commit e3e270fe81
parent fa790f52de
2 changed files with 30 additions and 1 deletions
--- a/src/testmains/trutf8iter.cpp
+++ b/src/testmains/trutf8iter.cpp
@ -48,6 +48,7 @@ static char usage [] =
 " converts infile to 32 bits unicode (processor order), for testing\n"
 "  -v : print stuff as we go\n"
 "-t [-w] [-e] <string> <maxlen> : test truncation\n"
+"-c <str> : str must be a single utf-8 char. Convert to code then show character bytes count\n"
 ;

 void Usage() {
@ -59,6 +60,7 @@ static int     op_flags;
 #define OPT_t     0x4
 #define OPT_w     0x8
 #define OPT_e     0x10
+#define OPT_c     0x20

 int trytruncate(std::string s, int maxlen)
 {
@ -86,13 +88,26 @@ int main(int argc, char **argv)
            switch (*(*argv)++) {
            case 'e': op_flags |= OPT_e;break;
            case 't': op_flags |= OPT_t;break;
-            case 'v':   op_flags |= OPT_v; break;
+            case 'v': op_flags |= OPT_v;break;
            case 'w': op_flags |= OPT_w;break;
+            case 'c': op_flags |= OPT_c;break;
            default: Usage();   break;
            }
        argc--;argv++;
    }

+    if (op_flags & OPT_c) {
+        if (argc != 1)
+            Usage();
+        std::string s = *argv++;argc--;
+        Utf8Iter uit(s);
+        auto code = *uit;
+        auto cnt = utf8codepointsize(code);
+        std::cout << "0x" << std::hex << code << std::dec << " : " << cnt << " byte" <<
+            (cnt>1?"s":"") << "\n";
+        return 0;
+    }
+
    if (op_flags & OPT_t) {
        if (argc < 2)
            Usage();
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -295,6 +295,20 @@ void utf8truncate(std::string& s, int maxlen, int flags = 0,
 /** Compute length in characters of utf-8 string */
 size_t utf8len(const std::string& s);

+/** Return the number of bytes (1 to 4) that the UTF-8 encoding of a Unicode code point uses */
+inline int utf8codepointsize(uint32_t codepoint)
+{
+    if (codepoint <= 0x7F) {            // U+0000..U+007F: single byte (ASCII)
+        return 1;
+    } else if (codepoint <= 0x7FF) {    // U+0080..U+07FF
+        return 2;
+    } else if (codepoint <= 0xFFFF) {   // U+0800..U+FFFF: bound is inclusive, U+FFFF takes 3 bytes
+        return 3;
+    } else {                            // U+10000 and up; values above U+10FFFF are not checked
+        return 4;
+    }
+}
+
 /** @brief Check and possibly fix string by replacing badly encoded
 * characters with the standard question mark replacement character.
 *