utf8truncate: max size was specified to be bytes but used as character count
This commit is contained in:
parent
5146fc75cf
commit
5fa7c73b7c
@ -44,10 +44,11 @@ void tryempty()
|
|||||||
|
|
||||||
const char *thisprog;
|
const char *thisprog;
|
||||||
static char usage [] =
|
static char usage [] =
|
||||||
"utf8iter [opts] infile outfile\n"
|
"utf8iter [opts] infile outfile\n"
|
||||||
" converts infile to 32 bits unicode (processor order), for testing\n"
|
" converts infile to 32 bits unicode (processor order), for testing\n"
|
||||||
"-v : print stuff as we go\n"
|
" -v : print stuff as we go\n"
|
||||||
;
|
"-t [-w] [-e] <string> <maxlen> : test truncation\n"
|
||||||
|
;
|
||||||
|
|
||||||
void Usage() {
|
void Usage() {
|
||||||
fprintf(stderr, "%s:%s\n", thisprog, usage);
|
fprintf(stderr, "%s:%s\n", thisprog, usage);
|
||||||
@ -55,9 +56,23 @@ void Usage() {
|
|||||||
}
|
}
|
||||||
static int op_flags;
|
static int op_flags;
|
||||||
#define OPT_v 0x2
|
#define OPT_v 0x2
|
||||||
|
#define OPT_t 0x4
|
||||||
|
#define OPT_w 0x8
|
||||||
|
#define OPT_e 0x10
|
||||||
|
|
||||||
|
// Command-line test helper for utf8truncate() (reached through the -t option).
// Converts the -w / -e option bits stored in op_flags into the matching
// UTF8T_ATWORD / UTF8T_ELLIPSIS truncation flags, truncates a copy of `s`
// and prints the result between brackets on stdout.
// @param s      string to truncate; taken by value, the caller's copy is untouched
// @param maxlen size limit, passed unchanged to utf8truncate()
//               (a byte count according to the utf8truncate declaration comment)
// @return always 0
int trytruncate(std::string s, int maxlen)
|
||||||
|
{
|
||||||
|
// Accumulates the utf8truncate() flag bits selected on the command line.
int flag = 0;
|
||||||
|
// -w: request cutting at a word position.
if (op_flags & OPT_w)
|
||||||
|
flag |= UTF8T_ATWORD;
|
||||||
|
// -e: request an ellipsis.
if (op_flags & OPT_e)
|
||||||
|
flag |= UTF8T_ELLIPSIS;
|
||||||
|
// Ellipsis and whitespace arguments are left to utf8truncate()'s defaults.
utf8truncate(s, maxlen, flag);
|
||||||
|
// Brackets make leading/trailing whitespace in the result visible.
std::cout << "Truncation result:[" << s << "]\n";
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
FILE *infout = stdout;
|
FILE *infout = stdout;
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
thisprog = argv[0];
|
thisprog = argv[0];
|
||||||
@ -69,12 +84,23 @@ int main(int argc, char **argv)
|
|||||||
Usage();
|
Usage();
|
||||||
while (**argv)
|
while (**argv)
|
||||||
switch (*(*argv)++) {
|
switch (*(*argv)++) {
|
||||||
|
case 'e': op_flags |= OPT_e;break;
|
||||||
|
case 't': op_flags |= OPT_t;break;
|
||||||
case 'v': op_flags |= OPT_v; break;
|
case 'v': op_flags |= OPT_v; break;
|
||||||
|
case 'w': op_flags |= OPT_w;break;
|
||||||
default: Usage(); break;
|
default: Usage(); break;
|
||||||
}
|
}
|
||||||
argc--;argv++;
|
argc--;argv++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (op_flags & OPT_t) {
|
||||||
|
if (argc < 2)
|
||||||
|
Usage();
|
||||||
|
std::string s = *argv++;argc--;
|
||||||
|
int maxlen = atoi(*argv++);argc--;
|
||||||
|
return trytruncate(s, maxlen);
|
||||||
|
}
|
||||||
|
|
||||||
string infile, outfile;
|
string infile, outfile;
|
||||||
if (argc == 2) {
|
if (argc == 2) {
|
||||||
infile = *argv++;argc--;
|
infile = *argv++;argc--;
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
/* Copyright (C) 2017-2019 J.F.Dockes
|
/* Copyright (C) 2017-2020 J.F.Dockes
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU Lesser General Public License as published by
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
* the Free Software Foundation; either version 2.1 of the License, or
|
* the Free Software Foundation; either version 2.1 of the License, or
|
||||||
@ -17,12 +17,13 @@
|
|||||||
|
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
|
void utf8truncate(string& s, int maxlen, int flags, const string& ellipsis,
|
||||||
const string& ws)
|
const string& ws)
|
||||||
{
|
{
|
||||||
if (s.size() <= string::size_type(maxlen)) {
|
if (s.size() <= string::size_type(maxlen)) {
|
||||||
@ -39,11 +40,7 @@ void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
|
|||||||
|
|
||||||
if (flags & UTF8T_ELLIPSIS) {
|
if (flags & UTF8T_ELLIPSIS) {
|
||||||
size_t ellen = utf8len(ellipsis);
|
size_t ellen = utf8len(ellipsis);
|
||||||
if (maxlen > int(ellen)) {
|
maxlen = std::max(0, maxlen - int(ellen));
|
||||||
maxlen -= ellen;
|
|
||||||
} else {
|
|
||||||
maxlen = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Utf8Iter iter(s);
|
Utf8Iter iter(s);
|
||||||
@ -51,7 +48,7 @@ void utf8truncate(std::string& s, int maxlen, int flags, const string& ellipsis,
|
|||||||
string::size_type lastwspos = 0;
|
string::size_type lastwspos = 0;
|
||||||
for (; !iter.eof(); iter++) {
|
for (; !iter.eof(); iter++) {
|
||||||
unsigned int c = *iter;
|
unsigned int c = *iter;
|
||||||
if (iter.getCpos() < string::size_type(maxlen)) {
|
if (iter.getBpos() < string::size_type(maxlen)) {
|
||||||
pos = iter.getBpos() + iter.getBlen();
|
pos = iter.getBpos() + iter.getBlen();
|
||||||
if ((flags & UTF8T_ATWORD) && wss.find(c) != wss.end()) {
|
if ((flags & UTF8T_ATWORD) && wss.find(c) != wss.end()) {
|
||||||
lastwspos = pos;
|
lastwspos = pos;
|
||||||
|
|||||||
@ -281,11 +281,18 @@ private:
|
|||||||
|
|
||||||
|
|
||||||
enum Utf8TruncateFlag {UTF8T_NONE, UTF8T_ATWORD, UTF8T_ELLIPSIS};
|
enum Utf8TruncateFlag {UTF8T_NONE, UTF8T_ATWORD, UTF8T_ELLIPSIS};
|
||||||
// maxlen is in utf-8 chars.
|
|
||||||
extern void utf8truncate(std::string& s, int maxlen, int flags = 0,
|
/** Truncate utf8 string, maintaining encoding integrity
|
||||||
const std::string& ellipsis = "...",
|
* @param s input string to be modified in place
|
||||||
const std::string& ws = " \t\n\r");
|
* @param maxlen maximum size after truncation in bytes
|
||||||
extern size_t utf8len(const std::string& s);
|
* @param flags bitwise OR of Utf8TruncateFlag values: UTF8T_ATWORD to cut at a word position, UTF8T_ELLIPSIS to add an ellipsis
|
||||||
|
*/
|
||||||
|
void utf8truncate(std::string& s, int maxlen, int flags = 0,
|
||||||
|
const std::string& ellipsis = "...",
|
||||||
|
const std::string& ws = " \t\n\r");
|
||||||
|
|
||||||
|
/** Compute length in characters of utf-8 string */
|
||||||
|
size_t utf8len(const std::string& s);
|
||||||
|
|
||||||
/** @brief Check and possibly fix string by replacing badly encoded
|
/** @brief Check and possibly fix string by replacing badly encoded
|
||||||
* characters with the standard question mark replacement character.
|
* characters with the standard question mark replacement character.
|
||||||
@ -297,7 +304,7 @@ extern size_t utf8len(const std::string& s);
|
|||||||
* @return -1 for failure (fixit false or maxrepl reached).
|
* @return -1 for failure (fixit false or maxrepl reached).
|
||||||
* 0 or positive: replacement count.
|
* 0 or positive: replacement count.
|
||||||
*/
|
*/
|
||||||
extern int utf8check(
|
int utf8check(
|
||||||
const std::string& in, std::string& out, bool fixit=false, int maxrepl=100);
|
const std::string& in, std::string& out, bool fixit=false, int maxrepl=100);
|
||||||
|
|
||||||
#endif /* _UTF8ITER_H_INCLUDED_ */
|
#endif /* _UTF8ITER_H_INCLUDED_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user