take care of splitting user string with respect to unicode white space, not only ascii

2008-12-05 11:09:31 +00:00 · 2008-12-05 11:09:31 +00:00 · 3414963810
commit 3414963810
parent 07dc3e0dd0
8 changed files with 179 additions and 32 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.36 2007-12-13 06:58:21 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.37 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -59,6 +59,7 @@ enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
 static int charclasses[256];

 static set<unsigned int> unicign;
+static set<unsigned int> visiblewhite;
 static void setcharclasses()
 {
    static int init = 0;
@ -91,10 +92,15 @@ static void setcharclasses()
    for (i = 0; i  < strlen(special); i++)
 	charclasses[int(special[i])] = special[i];

-    for (i = 0; i < sizeof(uniign); i++) 
+    for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
 	unicign.insert(uniign[i]);
+    }
    unicign.insert((unsigned int)-1);

+    for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
+	visiblewhite.insert(avsbwht[i]);
+    }
+
    init = 1;
 }

@ -533,6 +539,102 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
    return cb.wcnt;
 }

+/**
+ * Check whether a string contains at least one "visible white space"
+ * character, i.e. a member of the visiblewhite set that setcharclasses()
+ * fills from the avsbwht table.
+ *
+ * @param in  string to scan; it is iterated with Utf8Iter.
+ * @return true as soon as one such character is found. false if none is
+ *         found, or if the iterator yields (unsigned int)-1, which is
+ *         logged as a UTF-8 scanning error.
+ */
+bool TextSplit::hasVisibleWhite(const string &in)
+{
+    // Fill the static lookup sets (visiblewhite among them) before use.
+    setcharclasses();
+    Utf8Iter it(in);
+    for (; !it.eof(); it++) {
+	unsigned int c = *it;
+	LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
+	// (unsigned int)-1 is treated as a decoding error: give up.
+	if (c == (unsigned int)-1) {
+	    LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
+	    return false;
+	}
+	if (visiblewhite.find(c) != visiblewhite.end())
+	    return true;
+    }
+    return false;
+}
+
+/**
+ * Split a string into tokens at white space, with support for
+ * double-quoted substrings.
+ *
+ * Any character found in the visiblewhite set is handled like an ASCII
+ * space. Outside quotes, white space separates tokens, a backslash is an
+ * ordinary character, and a double quote in the middle of a token is kept
+ * as an ordinary character. Inside a "..." area white space is kept, and
+ * a backslash makes the next character literal (so \" and \\ can be part
+ * of the token).
+ *
+ * @param s       input string, iterated with Utf8Iter.
+ * @param tokens  output container (needs clear() and push_back()); it is
+ *                emptied first, then receives the tokens in order.
+ * @return false if the iterator yields (unsigned int)-1 (logged as a UTF-8
+ *         scanning error) or if the input ends inside a quoted area;
+ *         true otherwise.
+ */
+template <class T> bool u8stringToStrings(const string &s, T &tokens)
+{
+    setcharclasses();
+    Utf8Iter it(s);
+
+    string current;
+    tokens.clear();
+    enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
+    states state = SPACE;
+    for (; !it.eof(); it++) {
+	unsigned int c = *it;
+	// Fold every visible white space character to an ASCII space so that
+	// the switch below only has to test ASCII values. Characters are
+	// appended through the iterator at push_char, not from c, so the
+	// folding does not alter what ends up in the tokens.
+	if (visiblewhite.find(c) != visiblewhite.end())
+	    c = ' ';
+	LOGDEB3(("TextSplit::stringToStrings: 0x%04x\n", c));
+	if (c == (unsigned int)-1) {
+	    LOGERR(("TextSplit::stringToStrings: error while "
+		    "scanning UTF-8 string\n"));
+	    return false;
+	}
+
+	switch (c) {
+	    case '"':
+	    switch(state) {
+	    case SPACE: state = INQUOTE; continue;
+	    case TOKEN: goto push_char;
+	    case ESCAPE: state = INQUOTE; goto push_char;
+	    case INQUOTE: tokens.push_back(current);current.clear();
+		state = SPACE; continue;
+	    }
+	    break;
+	    case '\\':
+	    switch(state) {
+	    case SPACE:
+	    case TOKEN: state=TOKEN; goto push_char;
+	    case INQUOTE: state = ESCAPE; continue;
+	    case ESCAPE: state = INQUOTE; goto push_char;
+	    }
+	    break;
+
+	    case ' ':
+	    case '\t':
+	    case '\n':
+	    case '\r':
+	    switch(state) {
+	      case SPACE: continue;
+	      case TOKEN: tokens.push_back(current); current.clear();
+		state = SPACE; continue;
+	    case INQUOTE: goto push_char;
+	    // An escaped white space is kept and ends the escape, like any
+	    // other escaped character. Without the state reset, the next
+	    // character (e.g. the closing quote) would still be taken as
+	    // escaped and the quoted area would never be closed.
+	    case ESCAPE: state = INQUOTE; goto push_char;
+	    }
+	    break;
+
+	    default:
+	    switch(state) {
+	      case ESCAPE: state = INQUOTE; break;
+	      case SPACE:  state = TOKEN;  break;
+	      case TOKEN:
+	      case INQUOTE: break;
+	    }
+	push_char:
+	    it.appendchartostring(current);
+	}
+    }
+
+    // End of string. Process residue, and possible error (unfinished quote)
+    switch(state) {
+    case SPACE: break;
+    case TOKEN: tokens.push_back(current); break;
+    case INQUOTE:
+    case ESCAPE: return false;
+    }
+    return true;
+}
+
+/**
+ * Split s into tokens stored in a list<string>. This only instantiates
+ * u8stringToStrings() for list<string> and returns its result.
+ */
+bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
+{
+    return u8stringToStrings<list<string> >(s, tokens);
+}
+
 #else  // TEST driver ->

 #include <unistd.h>
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -16,11 +16,13 @@
 */
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_
-/* @(#$Id: textsplit.h,v 1.20 2007-10-04 12:21:52 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: textsplit.h,v 1.21 2008-12-05 11:09:31 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
+#include <list>
 #ifndef NO_NAMESPACES
 using std::string;
+using std::list;
 #endif

 /**
@ -74,15 +76,27 @@ public:
    {
    }

-    /**
-     * Split text, emit words and positions.
-     */
+    /** Split text, emit words and positions. */
    bool text_to_words(const string &in);

-    // Utility functions : these does not need the user to setup a callback 
+    // Utility functions: these do not need the user to set up a callback
    // etc.
+
+    /** Count words in string, as the splitter would generate them */
    static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);

+    /** Check if this is visibly not a single block of text */
+    static bool hasVisibleWhite(const string &in);
+
+    /** Split text span into strings, at white space, allowing for substrings
+     * quoted with " . Escaping with \ works as usual inside the quoted areas.
+     * This has to be kept separate from smallut.cpp's stringToStrings, which
+     * basically works only if whitespace is ascii, and which processes 
+     * non-utf-8 input (iso-8859 config files work ok). This hopefully
+     * handles all Unicode whitespace, but needs correct utf-8 input
+     */
+    static bool stringToStrings(const string &s, list<string> &tokens);
+    
 private:
    Flags         m_flags;
    TextSplitCB  *m_cb;
--- a/src/common/uproplist.h
+++ b/src/common/uproplist.h
@ -16,7 +16,9 @@
 */
 #ifndef _PROPLIST_H_INCLUDED_
 #define _PROPLIST_H_INCLUDED_
-/* @(#$Id: uproplist.h,v 1.2 2006-01-30 11:15:27 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: uproplist.h,v 1.3 2008-12-05 11:09:31 dockes Exp $  (C) 2004 J.F.Dockes */
+
+
 /* 
 * A subset of Unicode chars that we consider whitespace when we split text in
 * words. 
@ -36,19 +38,22 @@
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 # For documentation, see UCD.html
 */
-
 static const unsigned int uniign[] = {
-    0x0085, /*    ; White_Space # Cc       <control-0085>*/
-    0x00A0, /*  ; White_Space # Zs       NO-BREAK SPACE*/
-    0x00A1, /* misc signs, bullet etc... */
-    0x00A2,
-    0x00A3,
-    0x00A4,
-    0x00A5,
-    0x00A6,
-    0x00A9, /*  copyright sign */
-    0x00AA,
-    0x00AE, /*  registered sign */
+    0x0085, /* NEXT LINE NEL;Cc */
+    0x00A0, /* NO-BREAK SPACE; Zs */
+    0x00A1, /* INVERTED EXCLAMATION MARK;Po */
+    0x00A2, /* CENT SIGN;Sc */
+    0x00A3, /* POUND SIGN;Sc; */
+    0x00A4, /* CURRENCY SIGN;Sc; */
+    0x00A5, /* YEN SIGN;Sc; */
+    0x00A6, /* BROKEN BAR;So */
+    0x00A7, /* SECTION SIGN;So; */
+    0x00A8, /* DIAERESIS;Sk; */
+    0x00A9, /* COPYRIGHT SIGN;So */
+    0x00AA, /* FEMININE ORDINAL INDICATOR;Ll */
+    0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK;Pi */
+    0x00AC, /* NOT SIGN;Sm */
+    0x00AE, /* registered sign */
    0x1680, /*  ; White_Space # Zs       OGHAM SPACE MARK*/
    0x180E, /*  ; White_Space # Zs       MONGOLIAN VOWEL SEPARATOR*/
    0x2000, /*  ; White_Space # Zs  [11] EN QUAD..HAIR SPACE*/
@ -181,4 +186,30 @@ static const unsigned int uniign[] = {
    0xFF61, /*  ; STerm # Po       HALFWIDTH IDEOGRAPHIC FULL STOP*/
 };

-#endif /*PLIST_H_INCLUDED_ */
+/* Characters that visibly break a block of text, making it obvious that
+ * quoting is needed if a phrase search is wanted. Each entry is a
+ * character code; the trailing comment gives its Unicode name (and, for
+ * most entries, further fields copied from the Unicode data). */
+static const unsigned int avsbwht[] = {
+    0x0009, /* CHARACTER TABULATION */
+    0x000A, /* LINE FEED */
+    0x000D, /* CARRIAGE RETURN */
+    0x0020, /* SPACE;Zs;0;WS */
+    0x00A0, /* NO-BREAK SPACE;Zs;0;CS */
+    0x1680, /* OGHAM SPACE MARK;Zs;0;WS */
+    0x180E, /* MONGOLIAN VOWEL SEPARATOR;Zs;0;WS */
+    0x2000, /* EN QUAD;Zs;0;WS */
+    0x2001, /* EM QUAD;Zs;0;WS */
+    0x2002, /* EN SPACE;Zs;0;WS */
+    0x2003, /* EM SPACE;Zs;0;WS */
+    0x2004, /* THREE-PER-EM SPACE;Zs;0;WS */
+    0x2005, /* FOUR-PER-EM SPACE;Zs;0;WS */
+    0x2006, /* SIX-PER-EM SPACE;Zs;0;WS */
+    0x2007, /* FIGURE SPACE;Zs;0;WS */
+    0x2008, /* PUNCTUATION SPACE;Zs;0;WS */
+    0x2009, /* THIN SPACE;Zs;0;WS */
+    0x200A, /* HAIR SPACE;Zs;0;WS */
+    0x202F, /* NARROW NO-BREAK SPACE;Zs;0;CS */
+    0x205F, /* MEDIUM MATHEMATICAL SPACE;Zs;0;WS */
+    0x3000, /* IDEOGRAPHIC SPACE;Zs;0;WS */
+};
+
+#endif // _PROPLIST_H_INCLUDED_
--- a/src/kde/kioslave/recoll/kio_recoll.cpp
+++ b/src/kde/kioslave/recoll/kio_recoll.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.22 2008-12-04 11:49:59 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.23 2008-12-05 11:09:31 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -293,7 +293,7 @@ bool RecollProtocol::doSearch(const QueryDesc& qd)
 	    // If there is no white space inside the query, then the user
 	    // certainly means it as a phrase.
 	    bool isreallyaphrase = false;
-	    if (qs.find_first_of(" \t") == string::npos)
+	    if (!TextSplit::hasVisibleWhite(qs))
 		isreallyaphrase = true;
 	    clp = isreallyaphrase ? 
 		new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, qs, 0) :
--- a/src/qtgui/ssearch_w.cpp
+++ b/src/qtgui/ssearch_w.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.25 2008-10-13 11:46:06 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.26 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -107,7 +107,7 @@ void SSearch::startSimpleSearch()
 	// If there is no white space inside the query, then the user
 	// certainly means it as a phrase.
 	bool isreallyaphrase = false;
-	if (u8.find_first_of(" \t\r\n") == string::npos)
+	if (!TextSplit::hasVisibleWhite(u8))
 	    isreallyaphrase = true;

 	// Maybe add automatic phrase ? For ALL and ANY, and not if
--- a/src/query/recollq.cpp
+++ b/src/query/recollq.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: recollq.cpp,v 1.20 2008-10-13 08:23:36 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: recollq.cpp,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -203,7 +203,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
 	    // If there is no white space inside the query, then the user
 	    // certainly means it as a phrase.
 	    bool isreallyaphrase = false;
-	    if (qs.find_first_of(" \t") == string::npos)
+	    if (!TextSplit::hasVisibleWhite(qs))
 		isreallyaphrase = true;
 	    clp = isreallyaphrase ? 
 		new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, qs, 0) :
--- a/src/query/wasatorcl.cpp
+++ b/src/query/wasatorcl.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.17 2008-11-18 13:25:48 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.18 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -102,7 +102,7 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
 		break;
 	    } 

-	    if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
+	    if (TextSplit::hasVisibleWhite((*it)->m_value)) {
 		int slack = (mods & WasaQuery::WQM_PHRASESLACK) ? 10 : 0;
 		Rcl::SClType tp = Rcl::SCLT_PHRASE;
 		if (mods & WasaQuery::WQM_PROX) {
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.26 2008-10-14 07:50:13 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.27 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -380,7 +380,7 @@ bool StringToXapianQ::processUserString(const string &iq,
    // depends on separators: [paul@dom.net] would still be a word
    // (span), but [about:me] will probably be handled as a phrase.
    list<string> phrases;
-    stringToStrings(iq, phrases);
+    TextSplit::stringToStrings(iq, phrases);

    // Process each element: textsplit into terms, handle stem/wildcard 
    // expansion and transform into an appropriate Xapian::Query
@ -568,7 +568,7 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
    *qp = Xapian::Query();

    list<string> patterns;
-    stringToStrings(m_text, patterns);
+    TextSplit::stringToStrings(m_text, patterns);
    list<string> names;
    for (list<string>::iterator it = patterns.begin();
 	 it != patterns.end(); it++) {