diff --git a/packaging/debian/buildppa.sh b/packaging/debian/buildppa.sh index 858caf34..4f078f79 100644 --- a/packaging/debian/buildppa.sh +++ b/packaging/debian/buildppa.sh @@ -6,7 +6,7 @@ PPA_KEYID=7808CE96D38B9201 -RCLVERS=1.27.2 +RCLVERS=1.27.3 SCOPEVERS=1.20.2.4 GSSPVERS=1.0.0 PPAVERS=2 @@ -51,7 +51,7 @@ debdir=debian # 19.10 eoan 2020-07 # 20.04LTS focal 2025-04 series="xenial bionic eoan focal" -series= +#series= if test "X$series" != X ; then check_recoll_orig @@ -137,7 +137,7 @@ done # 19.10 eoan 2020-07 # 20.04LTS focal 2025-04 series="xenial bionic eoan focal" -series=focal +series= debdir=debiangssp if test ! -d ${debdir}/ ; then diff --git a/src/qtgui/confgui/confgui.cpp b/src/qtgui/confgui/confgui.cpp index 27222440..e7d78f22 100644 --- a/src/qtgui/confgui/confgui.cpp +++ b/src/qtgui/confgui/confgui.cpp @@ -132,7 +132,7 @@ int ConfTabsW::addForeignPanel(ConfPanelWIF* w, const QString& title) m_widgets.push_back(w); QWidget *qw = dynamic_cast(w); if (qw == 0) { - qDebug() << "Can't cast panel to QWidget"; + qDebug() << "addForeignPanel: can't cast panel to QWidget"; abort(); } return tabWidget->addTab(qw, title); @@ -227,9 +227,11 @@ ConfParamW *ConfTabsW::findParamW(const QString& varname) } void ConfTabsW::endOfList(int tabindex) { - ConfPanelW *panel = (ConfPanelW*)tabWidget->widget(tabindex); - if (nullptr == panel) + ConfPanelW *panel = dynamic_cast(tabWidget->widget(tabindex)); + // panel may be null if this is a foreign panel (not a conftabsw) + if (nullptr == panel) { return; + } panel->endOfList(); } diff --git a/src/utils/log.cpp b/src/utils/log.cpp index cc923552..c48a15f1 100644 --- a/src/utils/log.cpp +++ b/src/utils/log.cpp @@ -19,6 +19,11 @@ #include #include +#include + +#ifdef _MSC_VER +#define localtime_r(A,B) localtime_s(B,A) +#endif using namespace std; @@ -54,6 +59,17 @@ bool Logger::reopen(const std::string& fn) return true; } +const char *Logger::datestring() +{ + time_t clk = time(0); + struct tm tmb; + localtime_r(&clk, &tmb); + if (strftime(m_datebuf, LOGGER_DATESIZE, m_datefmt.c_str(), &tmb)) { + return m_datebuf; + } else { + return ""; + } +} static Logger *theLog; Logger *Logger::getTheLog(const string& fn) diff --git a/src/utils/log.h b/src/utils/log.h index 7460623c..d4d885ef 100644 --- a/src/utils/log.h +++ b/src/utils/log.h @@ -18,11 +18,9 @@ #define _LOG_H_X_INCLUDED_ #include - #include #include #include -#include #ifndef LOGGER_THREADSAFE #define LOGGER_THREADSAFE 1 @@ -33,30 +31,74 @@ #endif // Can't use the symbolic Logger::LLXX names in preproc. 6 is LLDEB1 +// STATICVERBOSITY is the level above which logging statements are +// preproc'ed out (can't be dynamically turned on). #ifndef LOGGER_STATICVERBOSITY #define LOGGER_STATICVERBOSITY 5 #endif +#define LOGGER_DATESIZE 100 + +/** @brief This is a singleton class. The logger pointer is obtained + * when needed by calls to @ref getTheLog(), only the first of which + * actually creates the object and initializes the output. */ class Logger { public: /** Initialize logging to file name. Use "stderr" for stderr - output. Creates the singleton logger object */ + * output. Creates the singleton logger object. Only the first + * call changes the state, further ones just return the Logger + * pointer. */ static Logger *getTheLog(const std::string& fn); + /** Close and reopen the output file. For rotating the log: rename + * then reopen. */ bool reopen(const std::string& fn); - + + /** Retrieve the output stream in case you need to write directly + * to it. In a multithreaded program, you probably also need to obtain + * the mutex with @ref getmutex, and lock it. */ std::ostream& getstream() { return m_tocerr ? std::cerr : m_stream; } + + /** @brief Log level values. Messages at level above the current will + * not be printed. Messages at a level above + * LOGGER_STATICVERBOSITY will not even be compiled in. */ enum LogLevel {LLNON=0, LLFAT=1, LLERR=2, LLINF=3, LLDEB=4, LLDEB0=5, LLDEB1=6, LLDEB2=7}; + + /** @brief Set the log dynamic verbosity level */ void setLogLevel(LogLevel level) { m_loglevel = level; } - int getloglevel() { + /** @brief Set the log dynamic verbosity level */ + void setloglevel(LogLevel level) { + m_loglevel = level; + } + + /** @brief Retrieve the current log level */ + int getloglevel() const { return m_loglevel; } + /** @brief turn date logging on or off (default is off) */ + void logthedate(bool onoff) { + m_logdate = onoff; + } + + bool loggingdate() const { + return m_logdate; + } + + /** @brief Set the date format, as an strftime() format string. + * Default: "%Y%m%d-%H%M%S" . */ + void setdateformat(const std::string fmt) { + m_datefmt = fmt; + } + + /** Call with log locked */ + const char *datestring(); + #if LOGGER_THREADSAFE std::recursive_mutex& getmutex() { return m_mutex; @@ -65,13 +107,15 @@ public: private: bool m_tocerr{false}; + bool m_logdate{false}; int m_loglevel{LLERR}; + std::string m_datefmt{"%Y%m%d-%H%M%S"}; std::string m_fn; std::ofstream m_stream; #if LOGGER_THREADSAFE std::recursive_mutex m_mutex; #endif - + char m_datebuf[LOGGER_DATESIZE]; Logger(const std::string& fn); Logger(const Logger &); Logger& operator=(const Logger &); @@ -93,10 +137,14 @@ private: #define LOGGER_LEVEL (Logger::getTheLog("")->getloglevel() + \ LOGGER_LOCAL_LOGINC) -#define LOGGER_DOLOG(L,X) LOGGER_PRT << ":" << L << ":" << \ - __FILE__ << ":" << __LINE__ << "::" << X \ +#define LOGGER_DATE (Logger::getTheLog("")->loggingdate() ? \ + Logger::getTheLog("")->datestring() : "") + +#define LOGGER_DOLOG(L,X) LOGGER_PRT << LOGGER_DATE << ":" << L << ":" << \ + __FILE__ << ":" << __LINE__ << "::" << X \ << std::flush + #if LOGGER_STATICVERBOSITY >= 7 #define LOGDEB2(X) { \ if (LOGGER_LEVEL >= Logger::LLDEB2) { \ @@ -142,6 +190,10 @@ private: #endif #if LOGGER_STATICVERBOSITY >= 3 +/** Log a message at level INFO. Other macros exist for other levels (LOGFAT, + * LOGERR, LOGINF, LOGDEB, LOGDEB0... Use as: + * LOGINF("some text" << other stuff << ... << "\n"); + */ #define LOGINF(X) { \ if (LOGGER_LEVEL >= Logger::LLINF) { \ LOGGER_LOCK; \ diff --git a/src/utils/pathut.cpp b/src/utils/pathut.cpp index 6a900fdb..2ddb39a1 100644 --- a/src/utils/pathut.cpp +++ b/src/utils/pathut.cpp @@ -56,7 +56,6 @@ #include #include -#include #include #include #include @@ -88,12 +87,12 @@ #endif #define WIN32_LEAN_AND_MEAN #define NOGDI -#define MAXPATHLEN PATH_MAX - #include #include - #include +#include +#include +#include #if !defined(S_IFLNK) #define S_IFLNK 0 @@ -104,11 +103,13 @@ #ifndef S_ISREG # define S_ISREG(ST_MODE) (((ST_MODE) & _S_IFMT) == _S_IFREG) #endif - -#include - -#include -#include +#define MAXPATHLEN PATH_MAX +#ifndef PATH_MAX +#define PATH_MAX MAX_PATH +#endif +#ifndef R_OK +#define R_OK 4 +#endif #define STAT _wstati64 #define LSTAT _wstati64 @@ -130,7 +131,9 @@ // For getpid #include #define getpid _getpid -#endif + +#define PATHUT_SSIZE_T int +#endif // _MSC_VER #else /* !_WIN32 -> */ @@ -159,8 +162,19 @@ #endif /* !_WIN32 */ +#ifdef _MSC_VER +#include +#else // !_MSC_VER +#include +#endif // _MSC_VER + using namespace std; +#ifndef PATHUT_SSIZE_T +#define PATHUT_SSIZE_T ssize_t +#endif + + #ifdef _WIN32 std::string wchartoutf8(const wchar_t *in, size_t len) @@ -1401,7 +1415,7 @@ int Pidfile::write_pid() char pidstr[20]; sprintf(pidstr, "%u", int(getpid())); lseek(m_fd, 0, 0); - if (::write(m_fd, pidstr, strlen(pidstr)) != (ssize_t)strlen(pidstr)) { + if (::write(m_fd, pidstr, strlen(pidstr)) != (PATHUT_SSIZE_T)strlen(pidstr)) { m_reason = "write failed"; return -1; } diff --git a/src/utils/pathut.h b/src/utils/pathut.h index cf447e3e..cb390473 100644 --- a/src/utils/pathut.h +++ b/src/utils/pathut.h @@ -143,11 +143,7 @@ extern bool path_unlink(const std::string& path); * extension. On other OSes, just builds the fstream. We'd need to * find a way to make this work with g++. It would be easier in this * case to use a FILE (_openw(), then fdopen()), but conftree really - * depends on std::iostream. One possible workaround for g++ would be - * to use shortpaths (which we already use to pass file names to - * xapian and aspell). Most of the problems are caused by the home - * directory name being non-ASCII, so returning a short path in - * path_home() would probably solve everything (but not pretty). + * depends on std::iostream. * * @param path an utf-8 file path. * @param mode is an std::fstream mode (ios::in etc.) */ diff --git a/src/utils/smallut.h b/src/utils/smallut.h index 7e142945..afe18a4a 100644 --- a/src/utils/smallut.h +++ b/src/utils/smallut.h @@ -66,7 +66,7 @@ extern void stringtolower(std::string& io); extern std::string stringtolower(const std::string& io); extern void stringtoupper(std::string& io); extern std::string stringtoupper(const std::string& io); -extern bool beginswith(const std::string& big, const std::string& small); +extern bool beginswith(const std::string& bg, const std::string& sml); // Is one string the end part of the other ? extern int stringisuffcmp(const std::string& s1, const std::string& s2); diff --git a/src/utils/utf8iter.cpp b/src/utils/utf8iter.cpp index 1d6361cd..64e4661d 100644 --- a/src/utils/utf8iter.cpp +++ b/src/utils/utf8iter.cpp @@ -16,21 +16,114 @@ */ #include "utf8iter.h" -#include -using std::string; +#include +#include -void utf8truncate(std::string& s, int maxlen) +using namespace std; + +void utf8truncate(std::string& s, int maxlen, int flags, string ellipsis, + string ws) { if (s.size() <= string::size_type(maxlen)) { return; } + unordered_set wss; + if (flags & UTF8T_ATWORD) { + Utf8Iter iter(ws); + for (; !iter.eof(); iter++) { + unsigned int c = *iter; + wss.insert(c); + } + } + + if (flags & UTF8T_ELLIPSIS) { + size_t ellen = utf8len(ellipsis); + if (maxlen > int(ellen)) { + maxlen -= ellen; + } else { + maxlen = 0; + } + } + Utf8Iter iter(s); string::size_type pos = 0; - while (iter++ != string::npos) - if (iter.getBpos() < string::size_type(maxlen)) { - pos = iter.getBpos(); + string::size_type lastwspos = 0; + for (; !iter.eof(); iter++) { + unsigned int c = *iter; + if (iter.getCpos() < string::size_type(maxlen)) { + pos = iter.getBpos() + iter.getBlen(); + if ((flags & UTF8T_ATWORD) && wss.find(c) != wss.end()) { + lastwspos = pos; + } + } else { + break; } + } - s.erase(pos); + if (flags & UTF8T_ATWORD) { + s.erase(lastwspos); + for (;;) { + Utf8Iter iter(s); + unsigned int c = 0; + for (; !iter.eof(); iter++) { + c = *iter; + pos = iter.getBpos(); + } + if (wss.find(c) == wss.end()) { + break; + } + s.erase(pos); + } + } else { + s.erase(pos); + } + + if (flags & UTF8T_ELLIPSIS) { + s += ellipsis; + } +} + +size_t utf8len(const string& s) +{ + size_t len = 0; + Utf8Iter iter(s); + while (iter++ != string::npos) { + len++; + } + return len; +} + +static const std::string replchar{"\xef\xbf\xbd"}; + +// Check utf-8 encoding, replacing errors with the ? char above +int utf8check(const std::string& in, std::string& out, bool fixit, int maxrepl) +{ + int cnt = 0; + Utf8Iter it(in); + for (;!it.eof(); it++) { + if (it.error()) { + if (!fixit) { + return -1; + } + out += replchar; + ++cnt; + for (; cnt < maxrepl; cnt++) { + it.retryfurther(); + if (it.eof()) + return cnt; + if (!it.error()) + break; + out += replchar; + } + if (it.error()) { + return -1; + } + } + // We have reached a good char and eof is false + if (fixit) { + it.appendchartostring(out); + } + } + return cnt; } diff --git a/src/utils/utf8iter.h b/src/utils/utf8iter.h index 0925c999..0d7e6391 100644 --- a/src/utils/utf8iter.h +++ b/src/utils/utf8iter.h @@ -32,110 +32,120 @@ class Utf8Iter { public: Utf8Iter(const std::string &in) - : m_sp(&in) { - update_cl(); + : m_sp(&in) { + update_cl(); } const std::string& buffer() const { return *m_sp; } - + void rewind() { - m_cl = 0; - m_pos = 0; - m_charpos = 0; - update_cl(); + m_cl = 0; + m_pos = 0; + m_charpos = 0; + update_cl(); } + void retryfurther() { + if (eof()) + return; + m_pos++; + if (eof()) { + return; + } + update_cl(); + } + /** "Direct" access. Awfully inefficient as we skip from start or current * position at best. This can only be useful for a lookahead from the * current position */ unsigned int operator[](std::string::size_type charpos) const { - std::string::size_type mypos = 0; - unsigned int mycp = 0; - if (charpos >= m_charpos) { - mypos = m_pos; - mycp = m_charpos; - } - int l; - while (mypos < m_sp->length() && mycp != charpos) { - l = get_cl(mypos); - if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l)) + std::string::size_type mypos = 0; + unsigned int mycp = 0; + if (charpos >= m_charpos) { + mypos = m_pos; + mycp = m_charpos; + } + int l; + while (mypos < m_sp->length() && mycp != charpos) { + l = get_cl(mypos); + if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l)) + return (unsigned int)-1; + mypos += l; + ++mycp; + } + if (mypos < m_sp->length() && mycp == charpos) { + l = get_cl(mypos); + if (poslok(mypos, l) && checkvalidat(mypos, l)) + return getvalueat(mypos, l); + } return (unsigned int)-1; - mypos += l; - ++mycp; - } - if (mypos < m_sp->length() && mycp == charpos) { - l = get_cl(mypos); - if (poslok(mypos, l) && checkvalidat(mypos, l)) - return getvalueat(mypos, l); - } - return (unsigned int)-1; } /** Increment current position to next utf-8 char */ std::string::size_type operator++(int) { - // Note: m_cl may be zero at eof if user's test not right - // this shouldn't crash the program until actual data access + // Note: m_cl may be zero at eof if user's test not right + // this shouldn't crash the program until actual data access #ifdef UTF8ITER_CHECK - assert(m_cl != 0); + assert(m_cl != 0); #endif - if (m_cl == 0) - return std::string::npos; + if (m_cl == 0) + return std::string::npos; - m_pos += m_cl; - m_charpos++; - update_cl(); - return m_pos; + m_pos += m_cl; + m_charpos++; + update_cl(); + return m_pos; } /** operator* returns the ucs4 value as a machine integer*/ unsigned int operator*() { #ifdef UTF8ITER_CHECK - assert(m_cl > 0); + assert(m_cl > 0); #endif - return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl); + return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl); } /** Append current utf-8 possibly multi-byte character to string param. - This needs to be fast. No error checking. */ + This needs to be fast. No error checking. */ unsigned int appendchartostring(std::string &out) const { #ifdef UTF8ITER_CHECK - assert(m_cl != 0); + assert(m_cl != 0); #endif - out.append(&(*m_sp)[m_pos], m_cl); - return m_cl; + out.append(&(*m_sp)[m_pos], m_cl); + return m_cl; } /** Return current character as string */ operator std::string() { #ifdef UTF8ITER_CHECK - assert(m_cl != 0); + assert(m_cl != 0); #endif - return m_cl > 0 ? m_sp->substr(m_pos, m_cl) : std::string(); + return m_cl > 0 ? m_sp->substr(m_pos, m_cl) : std::string(); } bool eof() const { - return m_pos == m_sp->length(); + return m_pos == m_sp->length(); } bool error() const { - return m_cl == 0; + return m_cl == 0; } /** Return current byte offset in input string */ std::string::size_type getBpos() const { - return m_pos; + return m_pos; } /** Return current character length */ std::string::size_type getBlen() const { - return m_cl; + return m_cl; } /** Return current unicode character offset in input string */ std::string::size_type getCpos() const { - return m_charpos; + return m_charpos; } private: @@ -151,128 +161,143 @@ private: // Check position and cl against string length bool poslok(std::string::size_type p, int l) const { -#ifdef UTF8ITER_CHECK - assert(p != std::string::npos && l > 0 && p + l <= m_sp->length()); -#endif - return p != std::string::npos && l > 0 && p + l <= m_sp->length(); + return p != std::string::npos && l > 0 && p + l <= m_sp->length(); } // Update current char length in object state, check // for errors inline void update_cl() { - m_cl = 0; - if (m_pos >= m_sp->length()) - return; - m_cl = get_cl(m_pos); - if (!poslok(m_pos, m_cl)) { - // Used to set eof here for safety, but this is bad because it - // basically prevents the caller to discriminate error and eof. - // m_pos = m_sp->length(); m_cl = 0; - return; - } - if (!checkvalidat(m_pos, m_cl)) { - m_cl = 0; - } + if (m_pos >= m_sp->length()) + return; + m_cl = get_cl(m_pos); + if (!poslok(m_pos, m_cl)) { + // Used to set eof here for safety, but this is bad because it + // basically prevents the caller to discriminate error and eof. + // m_pos = m_sp->length(); + m_cl = 0; + return; + } + if (!checkvalidat(m_pos, m_cl)) { + m_cl = 0; + } } inline bool checkvalidat(std::string::size_type p, int l) const { - switch (l) { - case 1: - return (unsigned char)(*m_sp)[p] < 128; - case 2: - return (((unsigned char)(*m_sp)[p]) & 224) == 192 - && (((unsigned char)(*m_sp)[p+1]) & 192) == 128; - case 3: - return (((unsigned char)(*m_sp)[p]) & 240) == 224 - && (((unsigned char)(*m_sp)[p+1]) & 192) == 128 - && (((unsigned char)(*m_sp)[p+2]) & 192) == 128 - ; - case 4: - return (((unsigned char)(*m_sp)[p]) & 248) == 240 - && (((unsigned char)(*m_sp)[p+1]) & 192) == 128 - && (((unsigned char)(*m_sp)[p+2]) & 192) == 128 - && (((unsigned char)(*m_sp)[p+3]) & 192) == 128 - ; - default: - return false; - } + switch (l) { + case 1: + return (unsigned char)(*m_sp)[p] < 128; + case 2: + return (((unsigned char)(*m_sp)[p]) & 224) == 192 + && (((unsigned char)(*m_sp)[p+1]) & 192) == 128; + case 3: + return (((unsigned char)(*m_sp)[p]) & 240) == 224 + && (((unsigned char)(*m_sp)[p+1]) & 192) == 128 + && (((unsigned char)(*m_sp)[p+2]) & 192) == 128 + ; + case 4: + return (((unsigned char)(*m_sp)[p]) & 248) == 240 + && (((unsigned char)(*m_sp)[p+1]) & 192) == 128 + && (((unsigned char)(*m_sp)[p+2]) & 192) == 128 + && (((unsigned char)(*m_sp)[p+3]) & 192) == 128 + ; + default: + return false; + } } // Get character byte length at specified position. Returns 0 for error. inline int get_cl(std::string::size_type p) const { - unsigned int z = (unsigned char)(*m_sp)[p]; - if (z <= 127) { - return 1; - } else if ((z & 224) == 192) { - return 2; - } else if ((z & 240) == 224) { - return 3; - } else if ((z & 248) == 240) { - return 4; - } + unsigned int z = (unsigned char)(*m_sp)[p]; + if (z <= 127) { + return 1; + } else if ((z & 224) == 192) { + return 2; + } else if ((z & 240) == 224) { + return 3; + } else if ((z & 248) == 240) { + return 4; + } #ifdef UTF8ITER_CHECK - assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 || - (z & 248) == 240); + assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 || + (z & 248) == 240); #endif - return 0; + return 0; } // Compute value at given position. No error checking. inline unsigned int getvalueat(std::string::size_type p, int l) const { - switch (l) { - case 1: + switch (l) { + case 1: #ifdef UTF8ITER_CHECK - assert((unsigned char)(*m_sp)[p] < 128); + assert((unsigned char)(*m_sp)[p] < 128); #endif - return (unsigned char)(*m_sp)[p]; - case 2: + return (unsigned char)(*m_sp)[p]; + case 2: #ifdef UTF8ITER_CHECK - assert( - ((unsigned char)(*m_sp)[p] & 224) == 192 - && ((unsigned char)(*m_sp)[p+1] & 192) == 128 - ); + assert( + ((unsigned char)(*m_sp)[p] & 224) == 192 + && ((unsigned char)(*m_sp)[p+1] & 192) == 128 + ); #endif - return ((unsigned char)(*m_sp)[p] - 192) * 64 + - (unsigned char)(*m_sp)[p+1] - 128 ; - case 3: + return ((unsigned char)(*m_sp)[p] - 192) * 64 + + (unsigned char)(*m_sp)[p+1] - 128 ; + case 3: #ifdef UTF8ITER_CHECK - assert( - (((unsigned char)(*m_sp)[p]) & 240) == 224 - && (((unsigned char)(*m_sp)[p+1]) & 192) == 128 - && (((unsigned char)(*m_sp)[p+2]) & 192) == 128 - ); + assert( + (((unsigned char)(*m_sp)[p]) & 240) == 224 + && (((unsigned char)(*m_sp)[p+1]) & 192) == 128 + && (((unsigned char)(*m_sp)[p+2]) & 192) == 128 + ); #endif - return ((unsigned char)(*m_sp)[p] - 224) * 4096 + - ((unsigned char)(*m_sp)[p+1] - 128) * 64 + - (unsigned char)(*m_sp)[p+2] - 128; - case 4: + return ((unsigned char)(*m_sp)[p] - 224) * 4096 + + ((unsigned char)(*m_sp)[p+1] - 128) * 64 + + (unsigned char)(*m_sp)[p+2] - 128; + case 4: #ifdef UTF8ITER_CHECK - assert( - (((unsigned char)(*m_sp)[p]) & 248) == 240 - && (((unsigned char)(*m_sp)[p+1]) & 192) == 128 - && (((unsigned char)(*m_sp)[p+2]) & 192) == 128 - && (((unsigned char)(*m_sp)[p+3]) & 192) == 128 - ); + assert( + (((unsigned char)(*m_sp)[p]) & 248) == 240 + && (((unsigned char)(*m_sp)[p+1]) & 192) == 128 + && (((unsigned char)(*m_sp)[p+2]) & 192) == 128 + && (((unsigned char)(*m_sp)[p+3]) & 192) == 128 + ); #endif - return ((unsigned char)(*m_sp)[p]-240)*262144 + - ((unsigned char)(*m_sp)[p+1]-128)*4096 + - ((unsigned char)(*m_sp)[p+2]-128)*64 + - (unsigned char)(*m_sp)[p+3]-128; + return ((unsigned char)(*m_sp)[p]-240)*262144 + + ((unsigned char)(*m_sp)[p+1]-128)*4096 + + ((unsigned char)(*m_sp)[p+2]-128)*64 + + (unsigned char)(*m_sp)[p+3]-128; - default: + default: #ifdef UTF8ITER_CHECK - assert(l <= 4); + assert(l <= 4); #endif - return (unsigned int)-1; - } + return (unsigned int)-1; + } } }; -extern void utf8truncate(std::string& s, int maxlen); +enum Utf8TruncateFlag {UTF8T_NONE, UTF8T_ATWORD, UTF8T_ELLIPSIS}; +// maxlen is in utf-8 chars. +extern void utf8truncate(std::string& s, int maxlen, int flags = 0, + std::string ellipsis = "...", + std::string ws = " \t\n\r"); +extern size_t utf8len(const std::string& s); + +/** @brief Check and possibly fix string by replacing badly encoded + * characters with the standard question mark replacement character. + * + * @param in the string to check + * @param[out] if fixit is true, the fixed output string + * @param fixit if true, copy a fixed string to out + * @param maxrepl maximum replacements before we bail out + * @return -1 for failure (fixit false or maxrepl reached). + * 0 or positive: replacement count. + */ +extern int utf8check( + const std::string& in, std::string& out, bool fixit=false, int maxrepl=100); #endif /* _UTF8ITER_H_INCLUDED_ */