utf8iter: utility function giving the UTF-8 byte count for a Unicode code point
This commit is contained in:
parent
fa790f52de
commit
e3e270fe81
@ -48,6 +48,7 @@ static char usage [] =
|
|||||||
" converts infile to 32 bits unicode (processor order), for testing\n"
|
" converts infile to 32 bits unicode (processor order), for testing\n"
|
||||||
" -v : print stuff as we go\n"
|
" -v : print stuff as we go\n"
|
||||||
"-t [-w] [-e] <string> <maxlen> : test truncation\n"
|
"-t [-w] [-e] <string> <maxlen> : test truncation\n"
|
||||||
|
"-c <str> : str must be a single utf-8 char. Convert to code then show character bytes count\n"
|
||||||
;
|
;
|
||||||
|
|
||||||
void Usage() {
|
void Usage() {
|
||||||
@ -59,6 +60,7 @@ static int op_flags;
|
|||||||
#define OPT_t 0x4
|
#define OPT_t 0x4
|
||||||
#define OPT_w 0x8
|
#define OPT_w 0x8
|
||||||
#define OPT_e 0x10
|
#define OPT_e 0x10
|
||||||
|
#define OPT_c 0x20
|
||||||
|
|
||||||
int trytruncate(std::string s, int maxlen)
|
int trytruncate(std::string s, int maxlen)
|
||||||
{
|
{
|
||||||
@ -86,13 +88,26 @@ int main(int argc, char **argv)
|
|||||||
switch (*(*argv)++) {
|
switch (*(*argv)++) {
|
||||||
case 'e': op_flags |= OPT_e;break;
|
case 'e': op_flags |= OPT_e;break;
|
||||||
case 't': op_flags |= OPT_t;break;
|
case 't': op_flags |= OPT_t;break;
|
||||||
case 'v': op_flags |= OPT_v; break;
|
case 'v': op_flags |= OPT_v;break;
|
||||||
case 'w': op_flags |= OPT_w;break;
|
case 'w': op_flags |= OPT_w;break;
|
||||||
|
case 'c': op_flags |= OPT_c;break;
|
||||||
default: Usage(); break;
|
default: Usage(); break;
|
||||||
}
|
}
|
||||||
argc--;argv++;
|
argc--;argv++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (op_flags & OPT_c) {
|
||||||
|
if (argc != 1)
|
||||||
|
Usage();
|
||||||
|
std::string s = *argv++;argc--;
|
||||||
|
Utf8Iter uit(s);
|
||||||
|
auto code = *uit;
|
||||||
|
auto cnt = utf8codepointsize(code);
|
||||||
|
std::cout << "0x" << std::hex << code << std::dec << " : " << cnt << " byte" <<
|
||||||
|
(cnt>1?"s":"") << "\n";
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (op_flags & OPT_t) {
|
if (op_flags & OPT_t) {
|
||||||
if (argc < 2)
|
if (argc < 2)
|
||||||
Usage();
|
Usage();
|
||||||
|
|||||||
@ -295,6 +295,20 @@ void utf8truncate(std::string& s, int maxlen, int flags = 0,
|
|||||||
/** Compute length in characters of utf-8 string */
|
/** Compute length in characters of utf-8 string */
|
||||||
size_t utf8len(const std::string& s);
|
size_t utf8len(const std::string& s);
|
||||||
|
|
||||||
|
/** Return the number of bytes needed to encode a Unicode code point in UTF-8.
 *
 * @param codepoint the Unicode code point value.
 * @return 1 for values up to 0x7F, 2 for values up to 0x7FF, 3 for values
 *   up to 0xFFFF, and 4 for anything larger. No validity check is performed:
 *   values above 0x10FFFF (outside the Unicode range) also yield 4.
 */
inline int utf8codepointsize(uint32_t codepoint)
{
    if (codepoint <= 0x7F) {
        return 1;
    } else if (codepoint <= 0x7FF) {
        return 2;
    } else if (codepoint <= 0xFFFF) {
        // Inclusive bound: U+FFFF itself is encoded on 3 bytes (EF BF BF).
        return 3;
    } else {
        return 4;
    }
}
|
||||||
|
|
||||||
/** @brief Check and possibly fix string by replacing badly encoded
|
/** @brief Check and possibly fix string by replacing badly encoded
|
||||||
* characters with the standard question mark replacement character.
|
* characters with the standard question mark replacement character.
|
||||||
*
|
*
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user