From e3e270fe812f6648cf0f87be4e3c55f7dd73f8cb Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 2 Nov 2021 09:24:06 +0100 Subject: [PATCH] utf8iter: utility function giving bytes count for code --- src/testmains/trutf8iter.cpp | 17 ++++++++++++++++- src/utils/utf8iter.h | 14 ++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/testmains/trutf8iter.cpp b/src/testmains/trutf8iter.cpp index fcfb5180..8e83374b 100644 --- a/src/testmains/trutf8iter.cpp +++ b/src/testmains/trutf8iter.cpp @@ -48,6 +48,7 @@ static char usage [] = " converts infile to 32 bits unicode (processor order), for testing\n" " -v : print stuff as we go\n" "-t [-w] [-e] : test truncation\n" +"-c : str must be a single utf-8 char. Convert to code then show character bytes count\n" ; void Usage() { @@ -59,6 +60,7 @@ static int op_flags; #define OPT_t 0x4 #define OPT_w 0x8 #define OPT_e 0x10 +#define OPT_c 0x20 int trytruncate(std::string s, int maxlen) { @@ -86,13 +88,26 @@ int main(int argc, char **argv) switch (*(*argv)++) { case 'e': op_flags |= OPT_e;break; case 't': op_flags |= OPT_t;break; - case 'v': op_flags |= OPT_v; break; + case 'v': op_flags |= OPT_v;break; case 'w': op_flags |= OPT_w;break; + case 'c': op_flags |= OPT_c;break; default: Usage(); break; } argc--;argv++; } + if (op_flags & OPT_c) { + if (argc != 1) + Usage(); + std::string s = *argv++;argc--; + Utf8Iter uit(s); + auto code = *uit; + auto cnt = utf8codepointsize(code); + std::cout << "0x" << std::hex << code << std::dec << " : " << cnt << " byte" << + (cnt>1?"s":"") << "\n"; + return 0; + } + if (op_flags & OPT_t) { if (argc < 2) Usage(); diff --git a/src/utils/utf8iter.h b/src/utils/utf8iter.h index a8ed4c80..4dd8ce5b 100644 --- a/src/utils/utf8iter.h +++ b/src/utils/utf8iter.h @@ -295,6 +295,20 @@ void utf8truncate(std::string& s, int maxlen, int flags = 0, /** Compute length in characters of utf-8 string */ size_t utf8len(const std::string& s); +/** Return number of bytes for 
/**
 * Return the number of bytes used by the UTF-8 encoding of a Unicode
 * code point.
 *
 * Size boundaries (inclusive upper bounds):
 *   - up to 0x7F   : 1 byte
 *   - up to 0x7FF  : 2 bytes
 *   - up to 0xFFFF : 3 bytes
 *   - anything else: 4 bytes
 *
 * No validation is performed: surrogate values (0xD800-0xDFFF) are sized
 * like any other value in the 3-byte range, and values above 0x10FFFF
 * still yield 4.
 *
 * @param codepoint the code point value.
 * @return the byte count, between 1 and 4.
 */
inline int utf8codepointsize(uint32_t codepoint)
{
    if (codepoint <= 0x7F) {
        return 1;
    }
    if (codepoint <= 0x7FF) {
        return 2;
    }
    // The bound must be inclusive: U+FFFF itself encodes as the three
    // bytes EF BF BF, so only values from 0x10000 up need four bytes.
    if (codepoint <= 0xFFFF) {
        return 3;
    }
    return 4;
}