utf8truncate: max size was specified to be bytes but used as character count
This commit is contained in:
parent
5146fc75cf
commit
5fa7c73b7c
@ -44,10 +44,11 @@ void tryempty()
|
||||
|
||||
const char *thisprog;
|
||||
static char usage [] =
|
||||
"utf8iter [opts] infile outfile\n"
|
||||
" converts infile to 32 bits unicode (processor order), for testing\n"
|
||||
"-v : print stuff as we go\n"
|
||||
;
|
||||
"utf8iter [opts] infile outfile\n"
|
||||
" converts infile to 32 bits unicode (processor order), for testing\n"
|
||||
" -v : print stuff as we go\n"
|
||||
"-t [-w] [-e] <string> <maxlen> : test truncation\n"
|
||||
;
|
||||
|
||||
void Usage() {
|
||||
fprintf(stderr, "%s:%s\n", thisprog, usage);
|
||||
@ -55,9 +56,23 @@ void Usage() {
|
||||
}
|
||||
static int op_flags;
|
||||
#define OPT_v 0x2
|
||||
#define OPT_t 0x4
|
||||
#define OPT_w 0x8
|
||||
#define OPT_e 0x10
|
||||
|
||||
int trytruncate(std::string s, int maxlen)
|
||||
{
|
||||
int flag = 0;
|
||||
if (op_flags & OPT_w)
|
||||
flag |= UTF8T_ATWORD;
|
||||
if (op_flags & OPT_e)
|
||||
flag |= UTF8T_ELLIPSIS;
|
||||
utf8truncate(s, maxlen, flag);
|
||||
std::cout << "Truncation result:[" << s << "]\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
FILE *infout = stdout;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
thisprog = argv[0];
|
||||
@ -69,12 +84,23 @@ int main(int argc, char **argv)
|
||||
Usage();
|
||||
while (**argv)
|
||||
switch (*(*argv)++) {
|
||||
case 'e': op_flags |= OPT_e;break;
|
||||
case 't': op_flags |= OPT_t;break;
|
||||
case 'v': op_flags |= OPT_v; break;
|
||||
|
||||
case 'w': op_flags |= OPT_w;break;
|
||||
default: Usage(); break;
|
||||
}
|
||||
argc--;argv++;
|
||||
}
|
||||
|
||||
if (op_flags & OPT_t) {
|
||||
if (argc < 2)
|
||||
Usage();
|
||||
std::string s = *argv++;argc--;
|
||||
int maxlen = atoi(*argv++);argc--;
|
||||
return trytruncate(s, maxlen);
|
||||
}
|
||||
|
||||
string infile, outfile;
|
||||
if (argc == 2) {
|
||||
infile = *argv++;argc--;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 2017-2019 J.F.Dockes
|
||||
/* Copyright (C) 2017-2020 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation; either version 2.1 of the License, or
|
||||
@ -17,12 +17,13 @@
|
||||
|
||||
#include "utf8iter.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <unordered_set>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
|
||||
void utf8truncate(string& s, int maxlen, int flags, const string& ellipsis,
|
||||
const string& ws)
|
||||
{
|
||||
if (s.size() <= string::size_type(maxlen)) {
|
||||
@ -39,11 +40,7 @@ void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
|
||||
|
||||
if (flags & UTF8T_ELLIPSIS) {
|
||||
size_t ellen = utf8len(ellipsis);
|
||||
if (maxlen > int(ellen)) {
|
||||
maxlen -= ellen;
|
||||
} else {
|
||||
maxlen = 0;
|
||||
}
|
||||
maxlen = std::max(0, maxlen - int(ellen));
|
||||
}
|
||||
|
||||
Utf8Iter iter(s);
|
||||
@ -51,7 +48,7 @@ void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
|
||||
string::size_type lastwspos = 0;
|
||||
for (; !iter.eof(); iter++) {
|
||||
unsigned int c = *iter;
|
||||
if (iter.getCpos() < string::size_type(maxlen)) {
|
||||
if (iter.getBpos() < string::size_type(maxlen)) {
|
||||
pos = iter.getBpos() + iter.getBlen();
|
||||
if ((flags & UTF8T_ATWORD) && wss.find(c) != wss.end()) {
|
||||
lastwspos = pos;
|
||||
|
||||
@ -281,11 +281,18 @@ private:
|
||||
|
||||
|
||||
enum Utf8TruncateFlag {UTF8T_NONE, UTF8T_ATWORD, UTF8T_ELLIPSIS};
|
||||
// maxlen is in utf-8 chars.
|
||||
extern void utf8truncate(std::string& s, int maxlen, int flags = 0,
|
||||
const std::string& ellipsis = "...",
|
||||
const std::string& ws = " \t\n\r");
|
||||
extern size_t utf8len(const std::string& s);
|
||||
|
||||
/** Truncate utf8 string, maintaining encoding integrity
 * @param s input string to be modified in place
 * @param maxlen maximum size after truncation in bytes
 * @param flags Bitwise OR of Utf8TruncateFlag values: UTF8T_ATWORD to cut
 *   at a word position, UTF8T_ELLIPSIS to add an ellipsis
 * @param ellipsis Ellipsis text used when UTF8T_ELLIPSIS is set. Room for
 *   it is deducted from maxlen before the string is cut.
 *   NOTE(review): the implementation measures this string with utf8len()
 *   (a character count) while maxlen is in bytes -- confirm the result
 *   stays within maxlen for a non-ASCII ellipsis.
 * @param ws Presumably the characters treated as word separators when
 *   UTF8T_ATWORD is set -- TODO confirm against the implementation.
 */
|
||||
void utf8truncate(std::string& s, int maxlen, int flags = 0,
|
||||
const std::string& ellipsis = "...",
|
||||
|
||||
const std::string& ws = " \t\n\r");
|
||||
|
||||
/** Compute length in characters of utf-8 string */
|
||||
size_t utf8len(const std::string& s);
|
||||
|
||||
/** @brief Check and possibly fix string by replacing badly encoded
|
||||
* characters with the standard question mark replacement character.
|
||||
@ -297,7 +304,7 @@ extern size_t utf8len(const std::string& s);
|
||||
* @return -1 for failure (fixit false or maxrepl reached).
|
||||
* 0 or positive: replacement count.
|
||||
*/
|
||||
extern int utf8check(
|
||||
int utf8check(
|
||||
const std::string& in, std::string& out, bool fixit=false, int maxrepl=100);
|
||||
|
||||
#endif /* _UTF8ITER_H_INCLUDED_ */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user