shared

2020-07-13 10:46:05 +02:00 · 2020-07-13 10:46:05 +02:00 · 476a3ba743
commit 476a3ba743
parent 4508b6b064
9 changed files with 370 additions and 172 deletions
--- a/packaging/debian/buildppa.sh
+++ b/packaging/debian/buildppa.sh
@ -6,7 +6,7 @@

 PPA_KEYID=7808CE96D38B9201

-RCLVERS=1.27.2
+RCLVERS=1.27.3
 SCOPEVERS=1.20.2.4
 GSSPVERS=1.0.0
 PPAVERS=2
@ -51,7 +51,7 @@ debdir=debian
 # 19.10    eoan   2020-07
 # 20.04LTS focal  2025-04
 series="xenial bionic eoan focal"
-series=
+#series=

 if test "X$series" != X ; then
    check_recoll_orig
@ -137,7 +137,7 @@ done
 # 19.10    eoan   2020-07
 # 20.04LTS focal  2025-04
 series="xenial bionic eoan focal"
-series=focal
+series=

 debdir=debiangssp
 if test ! -d ${debdir}/ ; then
--- a/src/qtgui/confgui/confgui.cpp
+++ b/src/qtgui/confgui/confgui.cpp
@ -132,7 +132,7 @@ int ConfTabsW::addForeignPanel(ConfPanelWIF* w, const QString& title)
    m_widgets.push_back(w);
    QWidget *qw = dynamic_cast<QWidget *>(w);
    if (qw == 0) {
-        qDebug() << "Can't cast panel to QWidget";
+        qDebug() << "addForeignPanel: can't cast panel to QWidget";
        abort();
    }
    return tabWidget->addTab(qw, title);
@ -227,9 +227,11 @@ ConfParamW *ConfTabsW::findParamW(const QString& varname)
 }
 void ConfTabsW::endOfList(int tabindex)
 {
-    ConfPanelW *panel = (ConfPanelW*)tabWidget->widget(tabindex);
-    if (nullptr == panel) 
+    ConfPanelW *panel = dynamic_cast<ConfPanelW*>(tabWidget->widget(tabindex));
+    // panel may be null if this tab holds a foreign panel (not a ConfPanelW)
+    if (nullptr == panel) {
        return;
+    }
    panel->endOfList();
 }

--- a/src/utils/log.cpp
+++ b/src/utils/log.cpp
@ -19,6 +19,11 @@

 #include <errno.h>
 #include <fstream>
+#include <time.h>
+
+#ifdef _MSC_VER
+#define localtime_r(A,B) localtime_s(B,A)
+#endif

 using namespace std;

@ -54,6 +59,17 @@ bool Logger::reopen(const std::string& fn)
    return true;
 }

+// Format the current local time into m_datebuf using the m_datefmt
+// strftime() format. Returns "" when strftime() reports failure (0).
+const char *Logger::datestring()
+{
+    const time_t now = time(0);
+    struct tm parts;
+    localtime_r(&now, &parts);
+    const size_t written =
+        strftime(m_datebuf, LOGGER_DATESIZE, m_datefmt.c_str(), &parts);
+    return written != 0 ? m_datebuf : "";
+}
 static Logger *theLog;

 Logger *Logger::getTheLog(const string& fn)
--- a/src/utils/log.h
+++ b/src/utils/log.h
@ -18,11 +18,9 @@
 #define _LOG_H_X_INCLUDED_

 #include <string.h>
-
 #include <fstream> 
 #include <iostream>
 #include <string>
-#include <mutex>

 #ifndef LOGGER_THREADSAFE
 #define LOGGER_THREADSAFE 1
@ -33,30 +31,74 @@
 #endif

 // Can't use the symbolic Logger::LLXX names in preproc. 6 is LLDEB1
+// STATICVERBOSITY is the level above which logging statements are
+// preproc'ed out (can't be dynamically turned on).
 #ifndef LOGGER_STATICVERBOSITY
 #define LOGGER_STATICVERBOSITY 5
 #endif

+#define LOGGER_DATESIZE 100
+
+/** @brief This is a singleton class. The logger pointer is obtained
+ * when needed by calls to @ref getTheLog(), only the first of which
+ * actually creates the object and initializes the output. */
 class Logger {
 public:
    /** Initialize logging to file name. Use "stderr" for stderr
-        output. Creates the singleton logger object */
+     * output. Creates the singleton logger object. Only the first
+     * call changes the state, further ones just return the Logger
+     * pointer. */
    static Logger *getTheLog(const std::string& fn);

+    /** Close and reopen the output file. For rotating the log: rename
+     * then reopen. */
    bool reopen(const std::string& fn);
-    
+
+    /** Retrieve the output stream in case you need to write directly
+     * to it. In a multithreaded program, you probably also need to obtain
+     * the mutex with @ref getmutex, and lock it. */
    std::ostream& getstream() {
        return m_tocerr ? std::cerr : m_stream;
    }
+
+    /** @brief Log level values. Messages at level above the current will
+     * not be printed. Messages at a level above
+     * LOGGER_STATICVERBOSITY will not even be compiled in. */
    enum LogLevel {LLNON=0, LLFAT=1, LLERR=2, LLINF=3, LLDEB=4,
                   LLDEB0=5, LLDEB1=6, LLDEB2=7};
+
+    /** @brief Set the log dynamic verbosity level */
    void setLogLevel(LogLevel level) {
        m_loglevel = level;
    }
-    int getloglevel() {
+    /** @brief Set the log dynamic verbosity level */
+    void setloglevel(LogLevel level) {
+        m_loglevel = level;
+    }
+
+    /** @brief Retrieve the current log level */
+    int getloglevel() const {
        return m_loglevel;
    }

+    /** @brief turn date logging on or off (default is off) */
+    void logthedate(bool onoff) {
+        m_logdate = onoff;
+    }
+
+    bool loggingdate() const {
+        return m_logdate;
+    }
+    
+    /** @brief Set the date format, as a strftime() format string.
+     * Default: "%Y%m%d-%H%M%S" .
+     * @param fmt the format string; it is copied into the logger. */
+    void setdateformat(const std::string& fmt) {
+        m_datefmt = fmt;
+    }
+
+    /** Call with log locked */
+    const char *datestring();
+    
 #if LOGGER_THREADSAFE
    std::recursive_mutex& getmutex() {
        return m_mutex;
@ -65,13 +107,15 @@ public:
    
 private:
    bool m_tocerr{false};
+    bool m_logdate{false};
    int m_loglevel{LLERR};
+    std::string m_datefmt{"%Y%m%d-%H%M%S"};
    std::string m_fn;
    std::ofstream m_stream;
 #if LOGGER_THREADSAFE
    std::recursive_mutex m_mutex;
 #endif
-
+    char m_datebuf[LOGGER_DATESIZE];
    Logger(const std::string& fn);
    Logger(const Logger &);
    Logger& operator=(const Logger &);
@ -93,10 +137,14 @@ private:
 #define LOGGER_LEVEL (Logger::getTheLog("")->getloglevel() +    \
                      LOGGER_LOCAL_LOGINC)

-#define LOGGER_DOLOG(L,X) LOGGER_PRT << ":" << L << ":" <<              \
-                                                                      __FILE__ << ":" << __LINE__ << "::" << X \
+#define LOGGER_DATE (Logger::getTheLog("")->loggingdate() ? \
+                     Logger::getTheLog("")->datestring() : "")
+
+#define LOGGER_DOLOG(L,X) LOGGER_PRT << LOGGER_DATE << ":" << L << ":" << \
+                             __FILE__ << ":" << __LINE__ << "::" << X \
    << std::flush

+
 #if LOGGER_STATICVERBOSITY >= 7
 #define LOGDEB2(X) {                            \
        if (LOGGER_LEVEL >= Logger::LLDEB2) {   \
@ -142,6 +190,10 @@ private:
 #endif

 #if LOGGER_STATICVERBOSITY >= 3
+/** Log a message at level INFO. Other macros exist for other levels (LOGFAT,
+ * LOGERR, LOGINF, LOGDEB, LOGDEB0...). Use as:
+ * LOGINF("some text" << other stuff << ... << "\n");
+ */
 #define LOGINF(X) {                             \
        if (LOGGER_LEVEL >= Logger::LLINF) {    \
            LOGGER_LOCK;                        \
--- a/src/utils/pathut.cpp
+++ b/src/utils/pathut.cpp
@ -56,7 +56,6 @@

 #include <cstdlib>
 #include <cstring>
-#include <dirent.h>
 #include <errno.h>
 #include <fstream>
 #include <iostream>
@ -88,12 +87,12 @@
 #endif
 #define WIN32_LEAN_AND_MEAN
 #define NOGDI
-#define MAXPATHLEN PATH_MAX
-
 #include <windows.h>
 #include <io.h>
-
 #include <sys/stat.h>
+#include <direct.h>
+#include <Shlobj.h>
+#include <Stringapiset.h>

 #if !defined(S_IFLNK)
 #define S_IFLNK 0
@ -104,11 +103,13 @@
 #ifndef S_ISREG
 # define S_ISREG(ST_MODE) (((ST_MODE) & _S_IFMT) == _S_IFREG)
 #endif
-
-#include <direct.h>
-
-#include <Shlobj.h>
-#include <Stringapiset.h>
+#define MAXPATHLEN PATH_MAX
+#ifndef PATH_MAX
+#define PATH_MAX MAX_PATH
+#endif
+#ifndef R_OK
+#define R_OK 4
+#endif

 #define STAT _wstati64
 #define LSTAT _wstati64
@ -130,7 +131,9 @@
 // For getpid
 #include <process.h>
 #define getpid _getpid
-#endif
+
+#define PATHUT_SSIZE_T int
+#endif // _MSC_VER

 #else /* !_WIN32 -> */

@ -159,8 +162,19 @@

 #endif /* !_WIN32 */

+#ifdef _MSC_VER
+#include <msvc_dirent.h>
+#else // !_MSC_VER
+#include <dirent.h>
+#endif // _MSC_VER
+
 using namespace std;

+#ifndef PATHUT_SSIZE_T
+#define PATHUT_SSIZE_T ssize_t
+#endif
+
+
 #ifdef _WIN32

 std::string wchartoutf8(const wchar_t *in, size_t len)
@ -1401,7 +1415,7 @@ int Pidfile::write_pid()
    char pidstr[20];
    sprintf(pidstr, "%u", int(getpid()));
    lseek(m_fd, 0, 0);
-    if (::write(m_fd, pidstr, strlen(pidstr)) != (ssize_t)strlen(pidstr)) {
+    if (::write(m_fd, pidstr, strlen(pidstr)) != (PATHUT_SSIZE_T)strlen(pidstr)) {
        m_reason = "write failed";
        return -1;
    }
--- a/src/utils/pathut.h
+++ b/src/utils/pathut.h
@ -143,11 +143,7 @@ extern bool path_unlink(const std::string& path);
 * extension. On other OSes, just builds the fstream.  We'd need to
 * find a way to make this work with g++. It would be easier in this
 * case to use a FILE (_openw(), then fdopen()), but conftree really
- * depends on std::iostream. One possible workaround for g++ would be
- * to use shortpaths (which we already use to pass file names to
- * xapian and aspell). Most of the problems are caused by the home
- * directory name being non-ASCII, so returning a short path in
- * path_home() would probably solve everything (but not pretty).
+ * depends on std::iostream. 
 *
 * @param path an utf-8 file path.
 * @param mode is an std::fstream mode (ios::in etc.) */
--- a/src/utils/smallut.h
+++ b/src/utils/smallut.h
@ -66,7 +66,7 @@ extern void stringtolower(std::string& io);
 extern std::string stringtolower(const std::string& io);
 extern void stringtoupper(std::string& io);
 extern std::string stringtoupper(const std::string& io);
-extern bool beginswith(const std::string& big, const std::string& small);
+extern bool beginswith(const std::string& bg, const std::string& sml);

 // Is one string the end part of the other ?
 extern int stringisuffcmp(const std::string& s1, const std::string& s2);
--- a/src/utils/utf8iter.cpp
+++ b/src/utils/utf8iter.cpp
@ -16,21 +16,114 @@
 */

 #include "utf8iter.h"
-#include <string>

-using std::string;
+#include <unordered_set>
+#include <iostream>

-void utf8truncate(std::string& s, int maxlen)
+using namespace std;
+
+// Shorten s in place to at most maxlen UTF-8 characters (not bytes).
+// UTF8T_ATWORD: cut after the last character belonging to ws that lies
+// within the limit, then strip trailing ws characters.
+// UTF8T_ELLIPSIS: append ellipsis; its length is subtracted from maxlen
+// first (maxlen becomes 0 if the ellipsis does not fit).
+// Scanning stops at the first undecodable byte, so the kept part of s
+// never extends past invalid input.
+void utf8truncate(std::string& s, int maxlen, int flags, string ellipsis,
+                  string ws)
 {
    if (s.size() <= string::size_type(maxlen)) {
        return;
    }
+    unordered_set<int> wss;
+    if (flags & UTF8T_ATWORD) {
+        Utf8Iter iter(ws);
+        // Also stop on error(): operator++ does not advance on an
+        // undecodable byte, so testing eof() alone would never terminate.
+        for (; !iter.eof() && !iter.error(); iter++) {
+            unsigned int c = *iter;
+            wss.insert(c);
+        }
+    }
+
+    if (flags & UTF8T_ELLIPSIS) {
+        size_t ellen = utf8len(ellipsis);
+        if (maxlen > int(ellen)) {
+            maxlen -= ellen;
+        } else {
+            maxlen = 0;
+        }
+    }
+
    Utf8Iter iter(s);
    string::size_type pos = 0;
-    while (iter++ != string::npos)
-        if (iter.getBpos() < string::size_type(maxlen)) {
-            pos = iter.getBpos();
+    string::size_type lastwspos = 0;
+    // pos tracks the byte offset just past the last character kept;
+    // lastwspos the offset just past the last ws character seen.
+    // Stop at end of input or at the first undecodable byte (see above).
+    for (; !iter.eof() && !iter.error(); iter++) {
+        unsigned int c = *iter;
+        if (iter.getCpos() < string::size_type(maxlen)) {
+            pos = iter.getBpos() + iter.getBlen();
+            if ((flags & UTF8T_ATWORD) && wss.find(c) != wss.end()) {
+                lastwspos = pos;
+            }
+        } else {
+            break;
        }
+    }

-    s.erase(pos);
+    if (flags & UTF8T_ATWORD) {
+        s.erase(lastwspos);
+        // Strip trailing ws characters one at a time: find the last
+        // character, drop it if it is in ws, and repeat.
+        for (;;) {
+            Utf8Iter iter(s);
+            unsigned int c = 0;
+            for (; !iter.eof(); iter++) {
+                c = *iter;
+                pos = iter.getBpos();
+            }
+            if (wss.find(c) == wss.end()) {
+                break;
+            }
+            s.erase(pos);
+        }
+    } else {
+        s.erase(pos);
+    }
+
+    if (flags & UTF8T_ELLIPSIS) {
+        s += ellipsis;
+    }
+}
+
+// Count the UTF-8 characters in s. Counting stops at the first
+// undecodable sequence, so only the valid prefix is counted.
+size_t utf8len(const string& s)
+{
+    size_t len = 0;
+    // Test the iterator state before incrementing instead of incrementing
+    // until operator++ returns npos: the latter calls operator++ with
+    // m_cl == 0, which fails its assertion when UTF8ITER_CHECK is defined.
+    for (Utf8Iter iter(s); !iter.eof() && !iter.error(); iter++) {
+        len++;
+    }
+    return len;
+}
+
+static const std::string replchar{"\xef\xbf\xbd"};
+
+// Check utf-8 encoding, replacing errors with the U+FFFD replacement char above
+int utf8check(const std::string& in, std::string& out, bool fixit, int maxrepl)
+{
+    int cnt = 0; // replacements made so far
+    Utf8Iter it(in);
+    for (;!it.eof(); it++) {
+        if (it.error()) {
+            if (!fixit) {
+                return -1; // check-only mode: the first bad sequence is a failure
+            }
+            out += replchar;
+            ++cnt;
+            for (; cnt < maxrepl; cnt++) { // resynchronize one byte at a time
+                it.retryfurther();
+                if (it.eof())
+                    return cnt;
+                if (!it.error())
+                    break;
+                out += replchar;
+            }
+            if (it.error()) {
+                return -1; // still bad here: the maxrepl budget is used up
+            }
+        }
+        // We have reached a good char and eof is false
+        if (fixit) {
+            it.appendchartostring(out);
+        }
+    }
+    return cnt;
 }
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -32,110 +32,120 @@
 class Utf8Iter {
 public:
    Utf8Iter(const std::string &in) 
-    : m_sp(&in) {
-    update_cl();
+        : m_sp(&in) {
+        update_cl();
    }

    const std::string& buffer() const {
        return *m_sp;
    }
-
+    
    void rewind() {
-    m_cl = 0; 
-    m_pos = 0; 
-    m_charpos = 0; 
-    update_cl();
+        m_cl = 0; 
+        m_pos = 0; 
+        m_charpos = 0; 
+        update_cl();
    }

+    /** Step one byte forward and re-evaluate the character at the new
+     * position (used to resynchronize after a decoding error). Does
+     * nothing at end of input; if the step reaches the end, the current
+     * character length is left untouched. */
+    void retryfurther() {
+        if (!eof()) {
+            ++m_pos;
+            if (!eof()) {
+                update_cl();
+            }
+        }
+    }
+    
    /** "Direct" access. Awfully inefficient as we skip from start or current
     * position at best. This can only be useful for a lookahead from the
     * current position */
    unsigned int operator[](std::string::size_type charpos) const {
-    std::string::size_type mypos = 0;
-    unsigned int mycp = 0;
-    if (charpos >= m_charpos) {
-        mypos = m_pos;
-        mycp = m_charpos;
-    }
-    int l;
-    while (mypos < m_sp->length() && mycp != charpos) {
-        l = get_cl(mypos);
-        if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
+        std::string::size_type mypos = 0;
+        unsigned int mycp = 0;
+        if (charpos >= m_charpos) {
+            mypos = m_pos;
+            mycp = m_charpos;
+        }
+        int l;
+        while (mypos < m_sp->length() && mycp != charpos) {
+            l = get_cl(mypos);
+            if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
+                return (unsigned int)-1;
+            mypos += l;
+            ++mycp;
+        }
+        if (mypos < m_sp->length() && mycp == charpos) {
+            l = get_cl(mypos);
+            if (poslok(mypos, l) && checkvalidat(mypos, l))
+                return getvalueat(mypos, l);
+        }
        return (unsigned int)-1;
-        mypos += l;
-        ++mycp;
-    }
-    if (mypos < m_sp->length() && mycp == charpos) {
-        l = get_cl(mypos);
-        if (poslok(mypos, l) && checkvalidat(mypos, l))
-        return getvalueat(mypos, l);
-    }
-    return (unsigned int)-1;
    }

    /** Increment current position to next utf-8 char */
    std::string::size_type operator++(int) {
-    // Note: m_cl may be zero at eof if user's test not right
-    // this shouldn't crash the program until actual data access
+        // Note: m_cl may be zero at eof if user's test not right
+        // this shouldn't crash the program until actual data access
 #ifdef UTF8ITER_CHECK
-    assert(m_cl != 0);
+        assert(m_cl != 0);
 #endif
-    if (m_cl == 0)
-        return std::string::npos;
+        if (m_cl == 0)
+            return std::string::npos;

-    m_pos += m_cl;
-    m_charpos++;
-    update_cl();
-    return m_pos;
+        m_pos += m_cl;
+        m_charpos++;
+        update_cl();
+        return m_pos;
    }

    /** operator* returns the ucs4 value as a machine integer*/
    unsigned int operator*() {
 #ifdef UTF8ITER_CHECK
-    assert(m_cl > 0);
+        assert(m_cl > 0);
 #endif
-    return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl);
+        return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl);
    }

    /** Append current utf-8 possibly multi-byte character to string param.
-    This needs to be fast. No error checking. */
+        This needs to be fast. No error checking. */
    unsigned int appendchartostring(std::string &out) const {
 #ifdef UTF8ITER_CHECK
-    assert(m_cl != 0);
+        assert(m_cl != 0);
 #endif
-    out.append(&(*m_sp)[m_pos], m_cl);
-    return m_cl;
+        out.append(&(*m_sp)[m_pos], m_cl);
+        return m_cl;
    }

    /** Return current character as string */
    operator std::string() {
 #ifdef UTF8ITER_CHECK
-    assert(m_cl != 0);
+        assert(m_cl != 0);
 #endif
-    return m_cl > 0 ? m_sp->substr(m_pos, m_cl) : std::string();
+        return m_cl > 0 ? m_sp->substr(m_pos, m_cl) : std::string();
    }

    bool eof() const {
-    return m_pos == m_sp->length();
+        return m_pos == m_sp->length();
    }

    bool error() const {
-    return m_cl == 0;
+        return m_cl == 0;
    }

    /** Return current byte offset in input string */
    std::string::size_type getBpos() const {
-    return m_pos;
+        return m_pos;
    }

    /** Return current character length */
    std::string::size_type getBlen() const {
-    return m_cl;
+        return m_cl;
    }

    /** Return current unicode character offset in input string */
    std::string::size_type getCpos() const {
-    return m_charpos;
+        return m_charpos;
    }

 private:
@ -151,128 +161,143 @@ private:

    // Check position and cl against string length
    bool poslok(std::string::size_type p, int l) const {
-#ifdef UTF8ITER_CHECK
-    assert(p != std::string::npos && l > 0 && p + l <= m_sp->length());
-#endif
-    return p != std::string::npos && l > 0 && p + l <= m_sp->length();
+        return p != std::string::npos && l > 0 && p + l <= m_sp->length();
    }

    // Update current char length in object state, check
    // for errors
    inline void update_cl() {
-    m_cl = 0;
-    if (m_pos >= m_sp->length())
-        return;
-    m_cl = get_cl(m_pos);
-    if (!poslok(m_pos, m_cl)) {
-        // Used to set eof here for safety, but this is bad because it
-        // basically prevents the caller to discriminate error and eof.
-        //        m_pos = m_sp->length();
        m_cl = 0;
-        return;
-    }
-    if (!checkvalidat(m_pos, m_cl)) {
-        m_cl = 0;
-    }
+        if (m_pos >= m_sp->length())
+            return;
+        m_cl = get_cl(m_pos);
+        if (!poslok(m_pos, m_cl)) {
+            // Used to set eof here for safety, but this is bad because it
+            // basically prevents the caller to discriminate error and eof.
+            //        m_pos = m_sp->length();
+            m_cl = 0;
+            return;
+        }
+        if (!checkvalidat(m_pos, m_cl)) {
+            m_cl = 0;
+        }
    }

    inline bool checkvalidat(std::string::size_type p, int l) const {
-    switch (l) {
-    case 1: 
-        return (unsigned char)(*m_sp)[p] < 128;
-    case 2: 
-        return (((unsigned char)(*m_sp)[p]) & 224) == 192
-        && (((unsigned char)(*m_sp)[p+1]) & 192) == 128;
-    case 3: 
-        return (((unsigned char)(*m_sp)[p]) & 240) == 224
-           && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
-           && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
-           ;
-    case 4: 
-        return (((unsigned char)(*m_sp)[p]) & 248) == 240
-           && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
-           && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
-           && (((unsigned char)(*m_sp)[p+3]) & 192) ==  128
-        ;
-    default:
-        return false;
-    }
+        switch (l) {
+        case 1: 
+            return (unsigned char)(*m_sp)[p] < 128;
+        case 2: 
+            return (((unsigned char)(*m_sp)[p]) & 224) == 192
+                                               && (((unsigned char)(*m_sp)[p+1]) & 192) == 128;
+        case 3: 
+            return (((unsigned char)(*m_sp)[p]) & 240) == 224
+                                               && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
+                                               && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
+                                               ;
+        case 4: 
+            return (((unsigned char)(*m_sp)[p]) & 248) == 240
+                                               && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
+                                               && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
+                                               && (((unsigned char)(*m_sp)[p+3]) & 192) ==  128
+                                               ;
+        default:
+            return false;
+        }
    }

    // Get character byte length at specified position. Returns 0 for error.
    inline int get_cl(std::string::size_type p) const {
-    unsigned int z = (unsigned char)(*m_sp)[p];
-    if (z <= 127) {
-        return 1;
-    } else if ((z & 224) == 192) {
-        return 2;
-    } else if ((z & 240) == 224) {
-        return 3;
-    } else if ((z & 248) == 240) {
-        return 4;
-    }
+        unsigned int z = (unsigned char)(*m_sp)[p];
+        if (z <= 127) {
+            return 1;
+        } else if ((z & 224) == 192) {
+            return 2;
+        } else if ((z & 240) == 224) {
+            return 3;
+        } else if ((z & 248) == 240) {
+            return 4;
+        }
 #ifdef UTF8ITER_CHECK
-    assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
-           (z & 248) == 240);
+        assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
+               (z & 248) == 240);
 #endif
-    return 0;
+        return 0;
    }

    // Compute value at given position. No error checking.
    inline unsigned int getvalueat(std::string::size_type p, int l) const {
-    switch (l) {
-    case 1: 
+        switch (l) {
+        case 1: 
 #ifdef UTF8ITER_CHECK
-        assert((unsigned char)(*m_sp)[p] < 128);
+            assert((unsigned char)(*m_sp)[p] < 128);
 #endif
-        return (unsigned char)(*m_sp)[p];
-    case 2: 
+            return (unsigned char)(*m_sp)[p];
+        case 2: 
 #ifdef UTF8ITER_CHECK
-        assert(
-           ((unsigned char)(*m_sp)[p] & 224) == 192
-           && ((unsigned char)(*m_sp)[p+1] & 192) ==  128
-           );
+            assert(
+                ((unsigned char)(*m_sp)[p] & 224) == 192
+                && ((unsigned char)(*m_sp)[p+1] & 192) ==  128
+                );
 #endif
-        return ((unsigned char)(*m_sp)[p] - 192) * 64 + 
-        (unsigned char)(*m_sp)[p+1] - 128 ;
-    case 3: 
+            return ((unsigned char)(*m_sp)[p] - 192) * 64 + 
+                (unsigned char)(*m_sp)[p+1] - 128 ;
+        case 3: 
 #ifdef UTF8ITER_CHECK
-        assert(
-           (((unsigned char)(*m_sp)[p]) & 240) == 224
-           && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
-           && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
-           );
+            assert(
+                (((unsigned char)(*m_sp)[p]) & 240) == 224
+                && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
+                && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
+                );
 #endif

-        return ((unsigned char)(*m_sp)[p] - 224) * 4096 + 
-        ((unsigned char)(*m_sp)[p+1] - 128) * 64 + 
-        (unsigned char)(*m_sp)[p+2] - 128;
-    case 4: 
+            return ((unsigned char)(*m_sp)[p] - 224) * 4096 + 
+                ((unsigned char)(*m_sp)[p+1] - 128) * 64 + 
+                (unsigned char)(*m_sp)[p+2] - 128;
+        case 4: 
 #ifdef UTF8ITER_CHECK
-        assert(
-           (((unsigned char)(*m_sp)[p]) & 248) == 240
-           && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
-           && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
-           && (((unsigned char)(*m_sp)[p+3]) & 192) ==  128
-           );
+            assert(
+                (((unsigned char)(*m_sp)[p]) & 248) == 240
+                && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
+                && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
+                && (((unsigned char)(*m_sp)[p+3]) & 192) ==  128
+                );
 #endif

-        return ((unsigned char)(*m_sp)[p]-240)*262144 + 
-        ((unsigned char)(*m_sp)[p+1]-128)*4096 + 
-        ((unsigned char)(*m_sp)[p+2]-128)*64 + 
-        (unsigned char)(*m_sp)[p+3]-128;
+            return ((unsigned char)(*m_sp)[p]-240)*262144 + 
+                ((unsigned char)(*m_sp)[p+1]-128)*4096 + 
+                ((unsigned char)(*m_sp)[p+2]-128)*64 + 
+                (unsigned char)(*m_sp)[p+3]-128;

-    default:
+        default:
 #ifdef UTF8ITER_CHECK
-        assert(l <= 4);
+            assert(l <= 4);
 #endif
-        return (unsigned int)-1;
-    }
+            return (unsigned int)-1;
+        }
    }

 };


-extern void utf8truncate(std::string& s, int maxlen);
+enum Utf8TruncateFlag {UTF8T_NONE, UTF8T_ATWORD, UTF8T_ELLIPSIS};
+// maxlen is in utf-8 chars.
+extern void utf8truncate(std::string& s, int maxlen, int flags = 0,
+                         std::string ellipsis = "...",
+                         std::string ws = " \t\n\r");
+extern size_t utf8len(const std::string& s);
+
+/** @brief Check and possibly fix string by replacing badly encoded
+ * characters with the standard question mark replacement character.
+ *
+ * @param in the string to check
+ * @param[out] out if fixit is true, the fixed string is appended to it
+ * @param fixit if true, copy a fixed string to out
+ * @param maxrepl maximum replacements before we bail out
+ * @return -1 for failure (fixit false or maxrepl reached). 
+ *   0 or positive: replacement count.
+ */
+extern int utf8check(
+    const std::string& in, std::string& out, bool fixit=false, int maxrepl=100);

 #endif /* _UTF8ITER_H_INCLUDED_ */