utf8truncate: max size was specified to be bytes but used as character count

This commit is contained in:
Jean-Francois Dockes 2020-10-03 15:20:14 +02:00
parent 5146fc75cf
commit 5fa7c73b7c
3 changed files with 50 additions and 20 deletions

View File

@ -44,10 +44,11 @@ void tryempty()
const char *thisprog;
static char usage [] =
"utf8iter [opts] infile outfile\n"
" converts infile to 32 bits unicode (processor order), for testing\n"
"-v : print stuff as we go\n"
;
"utf8iter [opts] infile outfile\n"
" converts infile to 32 bits unicode (processor order), for testing\n"
" -v : print stuff as we go\n"
"-t [-w] [-e] <string> <maxlen> : test truncation\n"
;
void Usage() {
fprintf(stderr, "%s:%s\n", thisprog, usage);
@ -55,9 +56,23 @@ void Usage() {
}
static int op_flags;
#define OPT_v 0x2
#define OPT_t 0x4
#define OPT_w 0x8
#define OPT_e 0x10
/* Command-line driver for the -t option: truncate a copy of the input
 * string with utf8truncate() and print what is left.
 *
 * s       string to truncate (taken by value, the copy is modified)
 * maxlen  size limit handed unchanged to utf8truncate()
 *
 * The -w and -e options recorded in op_flags select UTF8T_ATWORD and
 * UTF8T_ELLIPSIS respectively. Always returns 0.
 */
int trytruncate(std::string s, int maxlen)
{
    // Translate each command-line option bit into its truncation flag.
    const int atword = (op_flags & OPT_w) ? UTF8T_ATWORD : 0;
    const int addellipsis = (op_flags & OPT_e) ? UTF8T_ELLIPSIS : 0;

    utf8truncate(s, maxlen, atword | addellipsis);

    // Brackets make leading/trailing white space visible in the output.
    std::cout << "Truncation result:[" << s << "]\n";
    return 0;
}
FILE *infout = stdout;
int main(int argc, char **argv)
{
thisprog = argv[0];
@ -69,12 +84,23 @@ int main(int argc, char **argv)
Usage();
while (**argv)
switch (*(*argv)++) {
case 'e': op_flags |= OPT_e;break;
case 't': op_flags |= OPT_t;break;
case 'v': op_flags |= OPT_v; break;
case 'w': op_flags |= OPT_w;break;
default: Usage(); break;
}
argc--;argv++;
}
if (op_flags & OPT_t) {
if (argc < 2)
Usage();
std::string s = *argv++;argc--;
int maxlen = atoi(*argv++);argc--;
return trytruncate(s, maxlen);
}
string infile, outfile;
if (argc == 2) {
infile = *argv++;argc--;

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2017-2019 J.F.Dockes
/* Copyright (C) 2017-2020 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
@ -17,12 +17,13 @@
#include "utf8iter.h"
#include <algorithm>
#include <unordered_set>
#include <iostream>
using namespace std;
void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
void utf8truncate(string& s, int maxlen, int flags, const string& ellipsis,
const string& ws)
{
if (s.size() <= string::size_type(maxlen)) {
@ -39,11 +40,7 @@ void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
if (flags & UTF8T_ELLIPSIS) {
size_t ellen = utf8len(ellipsis);
if (maxlen > int(ellen)) {
maxlen -= ellen;
} else {
maxlen = 0;
}
maxlen = std::max(0, maxlen - int(ellen));
}
Utf8Iter iter(s);
@ -51,7 +48,7 @@ void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
string::size_type lastwspos = 0;
for (; !iter.eof(); iter++) {
unsigned int c = *iter;
if (iter.getCpos() < string::size_type(maxlen)) {
if (iter.getBpos() < string::size_type(maxlen)) {
pos = iter.getBpos() + iter.getBlen();
if ((flags & UTF8T_ATWORD) && wss.find(c) != wss.end()) {
lastwspos = pos;

View File

@ -281,11 +281,18 @@ private:
enum Utf8TruncateFlag {UTF8T_NONE, UTF8T_ATWORD, UTF8T_ELLIPSIS};
// maxlen is in utf-8 chars.
extern void utf8truncate(std::string& s, int maxlen, int flags = 0,
const std::string& ellipsis = "...",
const std::string& ws = " \t\n\r");
extern size_t utf8len(const std::string& s);
/** Truncate utf8 string, maintaining encoding integrity
* @param s input string to be modified in place
* @param maxlen maximum size after truncation in bytes
* @param flags Utf8TruncateFlag values OR'ed together: UTF8T_ATWORD to cut
*   at a word position, UTF8T_ELLIPSIS to add an ellipsis
* @param ellipsis ellipsis string used when UTF8T_ELLIPSIS is set; room for
*   it is taken out of maxlen
* @param ws characters presumably treated as word separators when
*   UTF8T_ATWORD is set (TODO confirm against the implementation)
*/
void utf8truncate(std::string& s, int maxlen, int flags = 0,
const std::string& ellipsis = "...",
const std::string& ws = " \t\n\r");
/** Compute length in characters of utf-8 string */
size_t utf8len(const std::string& s);
/** @brief Check and possibly fix string by replacing badly encoded
* characters with the standard question mark replacement character.
@ -297,7 +304,7 @@ extern size_t utf8len(const std::string& s);
* @return -1 for failure (fixit false or maxrepl reached).
* 0 or positive: replacement count.
*/
extern int utf8check(
int utf8check(
const std::string& in, std::string& out, bool fixit=false, int maxrepl=100);
#endif /* _UTF8ITER_H_INCLUDED_ */