utf8iter: utility function giving the UTF-8 byte count for a code point

This commit is contained in:
Jean-Francois Dockes 2021-11-02 09:24:06 +01:00
parent fa790f52de
commit e3e270fe81
2 changed files with 30 additions and 1 deletions

View File

@ -48,6 +48,7 @@ static char usage [] =
" converts infile to 32 bits unicode (processor order), for testing\n"
" -v : print stuff as we go\n"
"-t [-w] [-e] <string> <maxlen> : test truncation\n"
"-c <str> : str must be a single utf-8 char. Convert to code then show character bytes count\n"
;
void Usage() {
@ -59,6 +60,7 @@ static int op_flags;
#define OPT_t 0x4
#define OPT_w 0x8
#define OPT_e 0x10
#define OPT_c 0x20
int trytruncate(std::string s, int maxlen)
{
@ -86,13 +88,26 @@ int main(int argc, char **argv)
switch (*(*argv)++) {
case 'e': op_flags |= OPT_e;break;
case 't': op_flags |= OPT_t;break;
case 'v': op_flags |= OPT_v; break;
case 'v': op_flags |= OPT_v;break;
case 'w': op_flags |= OPT_w;break;
case 'c': op_flags |= OPT_c;break;
default: Usage(); break;
}
argc--;argv++;
}
if (op_flags & OPT_c) {
if (argc != 1)
Usage();
std::string s = *argv++;argc--;
Utf8Iter uit(s);
auto code = *uit;
auto cnt = utf8codepointsize(code);
std::cout << "0x" << std::hex << code << std::dec << " : " << cnt << " byte" <<
(cnt>1?"s":"") << "\n";
return 0;
}
if (op_flags & OPT_t) {
if (argc < 2)
Usage();

View File

@ -295,6 +295,20 @@ void utf8truncate(std::string& s, int maxlen, int flags = 0,
/** Compute length in characters of utf-8 string */
size_t utf8len(const std::string& s);
/** Return the number of bytes needed to encode a Unicode code point in UTF-8.
 *
 * @param codepoint numeric value of the code point to size.
 * @return 1 for U+0000..U+007F, 2 for U+0080..U+07FF, 3 for U+0800..U+FFFF,
 *     and 4 for any larger value.
 *
 * No validation is performed: the result depends only on the magnitude of
 * the value, so surrogate values yield 3 and values above U+10FFFF yield 4.
 */
inline int utf8codepointsize(uint32_t codepoint)
{
    if (codepoint <= 0x7F) {
        return 1;
    } else if (codepoint <= 0x7FF) {
        return 2;
    } else if (codepoint <= 0xFFFF) {
        // Inclusive bound: U+FFFF itself is the 3-byte sequence EF BF BF.
        return 3;
    } else {
        return 4;
    }
}
/** @brief Check and possibly fix string by replacing badly encoded
* characters with the standard question mark replacement character.
*