From 34149638104c079a07ee718b38e9b67399712bf5 Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 5 Dec 2008 11:09:31 +0000 Subject: [PATCH] take care of splitting user string with respect to unicode white space, not only ascii --- src/common/textsplit.cpp | 106 ++++++++++++++++++++++++- src/common/textsplit.h | 24 ++++-- src/common/uproplist.h | 59 ++++++++++---- src/kde/kioslave/recoll/kio_recoll.cpp | 4 +- src/qtgui/ssearch_w.cpp | 4 +- src/query/recollq.cpp | 4 +- src/query/wasatorcl.cpp | 4 +- src/rcldb/searchdata.cpp | 6 +- 8 files changed, 179 insertions(+), 32 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 177f749e..4826ff99 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.36 2007-12-13 06:58:21 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.37 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -59,6 +59,7 @@ enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259}; static int charclasses[256]; static set unicign; +static set visiblewhite; static void setcharclasses() { static int init = 0; @@ -91,10 +92,15 @@ static void setcharclasses() for (i = 0; i < strlen(special); i++) charclasses[int(special[i])] = special[i]; - for (i = 0; i < sizeof(uniign); i++) + for (i = 0; i < sizeof(uniign) / sizeof(int); i++) { unicign.insert(uniign[i]); + } unicign.insert((unsigned int)-1); + for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) { + visiblewhite.insert(avsbwht[i]); + } + init = 1; } @@ -533,6 +539,102 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs) return cb.wcnt; } +bool TextSplit::hasVisibleWhite(const string &in) +{ + setcharclasses(); + Utf8Iter it(in); + for (; !it.eof(); it++) { + unsigned int c = *it; + LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c)); + if (c == 
(unsigned int)-1) { + LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n")); + return false; + } + if (visiblewhite.find(c) != visiblewhite.end()) + return true; + } + return false; +} + +template bool u8stringToStrings(const string &s, T &tokens) +{ + setcharclasses(); + Utf8Iter it(s); + + string current; + tokens.clear(); + enum states {SPACE, TOKEN, INQUOTE, ESCAPE}; + states state = SPACE; + for (; !it.eof(); it++) { + unsigned int c = *it; + if (visiblewhite.find(c) != visiblewhite.end()) + c = ' '; + LOGDEB3(("TextSplit::stringToStrings: 0x%04x\n", c)); + if (c == (unsigned int)-1) { + LOGERR(("TextSplit::stringToStrings: error while " + "scanning UTF-8 string\n")); + return false; + } + + switch (c) { + case '"': + switch(state) { + case SPACE: state = INQUOTE; continue; + case TOKEN: goto push_char; + case ESCAPE: state = INQUOTE; goto push_char; + case INQUOTE: tokens.push_back(current);current.clear(); + state = SPACE; continue; + } + break; + case '\\': + switch(state) { + case SPACE: + case TOKEN: state=TOKEN; goto push_char; + case INQUOTE: state = ESCAPE; continue; + case ESCAPE: state = INQUOTE; goto push_char; + } + break; + + case ' ': + case '\t': + case '\n': + case '\r': + switch(state) { + case SPACE: continue; + case TOKEN: tokens.push_back(current); current.clear(); + state = SPACE; continue; + case INQUOTE: + case ESCAPE: goto push_char; + } + break; + + default: + switch(state) { + case ESCAPE: state = INQUOTE; break; + case SPACE: state = TOKEN; break; + case TOKEN: + case INQUOTE: break; + } + push_char: + it.appendchartostring(current); + } + } + + // End of string. 
Process residue, and possible error (unfinished quote) + switch(state) { + case SPACE: break; + case TOKEN: tokens.push_back(current); break; + case INQUOTE: + case ESCAPE: return false; + } + return true; +} + +bool TextSplit::stringToStrings(const string &s, list<string> &tokens) +{ + return u8stringToStrings<list<string> >(s, tokens); +} + #else // TEST driver -> #include diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 21079bb9..4c3b1ab2 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -16,11 +16,13 @@ */ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.20 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */ #include <string> +#include <list> #ifndef NO_NAMESPACES using std::string; +using std::list; #endif /** @@ -74,15 +76,27 @@ public: { } - /** - * Split text, emit words and positions. - */ + /** Split text, emit words and positions. */ bool text_to_words(const string &in); - // Utility functions : these does not need the user to setup a callback + //Utility functions : these does not need the user to setup a callback // etc. + + /** Count words in string, as the splitter would generate them */ static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS); + /** Check if this is visibly not a single block of text */ + static bool hasVisibleWhite(const string &in); + + /** Split text span into strings, at white space, allowing for substrings + * quoted with " . Escaping with \ works as usual inside the quoted areas. + * This has to be kept separate from smallut.cpp's stringsToStrings, which + * basically works only if whitespace is ascii, and which processes + * non-utf-8 input (iso-8859 config files work ok). 
This hopefully + * handles all Unicode whitespace, but needs correct utf-8 input + */ + static bool stringToStrings(const string &s, list &tokens); + private: Flags m_flags; TextSplitCB *m_cb; diff --git a/src/common/uproplist.h b/src/common/uproplist.h index cad898e1..f3c312f5 100644 --- a/src/common/uproplist.h +++ b/src/common/uproplist.h @@ -16,7 +16,9 @@ */ #ifndef _PROPLIST_H_INCLUDED_ #define _PROPLIST_H_INCLUDED_ -/* @(#$Id: uproplist.h,v 1.2 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: uproplist.h,v 1.3 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */ + + /* * A subset of Unicode chars that we consider whitespace when we split text in * words. @@ -36,19 +38,22 @@ # For terms of use, see http://www.unicode.org/terms_of_use.html # For documentation, see UCD.html */ - static const unsigned int uniign[] = { - 0x0085, /* ; White_Space # Cc */ - 0x00A0, /* ; White_Space # Zs NO-BREAK SPACE*/ - 0x00A1, /* misc signs, bullet etc... */ - 0x00A2, - 0x00A3, - 0x00A4, - 0x00A5, - 0x00A6, - 0x00A9, /* copyright sign */ - 0x00AA, - 0x00AE, /* registered sign */ + 0x0085, /* NEXT LINE NEL;Cc */ + 0x00A0, /* NO-BREAK SPACE; Zs */ + 0x00A1, /* INVERTED EXCLAMATION MARK;Po */ + 0x00A2, /* CENT SIGN;Sc */ + 0x00A3, /* POUND SIGN;Sc; */ + 0x00A4, /* CURRENCY SIGN;Sc; */ + 0x00A5, /* YEN SIGN;Sc; */ + 0x00A6, /* BROKEN BAR;So */ + 0x00A7, /* SECTION SIGN;So; */ + 0x00A8, /* DIAERESIS;Sk; */ + 0x00A9, /* COPYRIGHT SIGN;So */ + 0x00AA, /* FEMININE ORDINAL INDICATOR;Ll */ + 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK;Pi */ + 0x00AC, /* NOT SIGN;Sm */ + 0x00AE, /* registered sign */ 0x1680, /* ; White_Space # Zs OGHAM SPACE MARK*/ 0x180E, /* ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR*/ 0x2000, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ @@ -181,4 +186,30 @@ static const unsigned int uniign[] = { 0xFF61, /* ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/ }; -#endif /*PLIST_H_INCLUDED_ */ +/* Things that would visibly break a block of 
text, rendering obvious the need + * of quotation if a phrase search is wanted */ +static const unsigned int avsbwht[] = { + 0x0009, /* CHARACTER TABULATION */ + 0x000A, /* LINE FEED */ + 0x000D, /* CARRIAGE RETURN */ + 0x0020, /* SPACE;Zs;0;WS */ + 0x00A0, /* NO-BREAK SPACE;Zs;0;CS */ + 0x1680, /* OGHAM SPACE MARK;Zs;0;WS */ + 0x180E, /* MONGOLIAN VOWEL SEPARATOR;Zs;0;WS */ + 0x2000, /* EN QUAD;Zs;0;WS */ + 0x2001, /* EM QUAD;Zs;0;WS */ + 0x2002, /* EN SPACE;Zs;0;WS */ + 0x2003, /* EM SPACE;Zs;0;WS */ + 0x2004, /* THREE-PER-EM SPACE;Zs;0;WS */ + 0x2005, /* FOUR-PER-EM SPACE;Zs;0;WS */ + 0x2006, /* SIX-PER-EM SPACE;Zs;0;WS */ + 0x2007, /* FIGURE SPACE;Zs;0;WS */ + 0x2008, /* PUNCTUATION SPACE;Zs;0;WS */ + 0x2009, /* THIN SPACE;Zs;0;WS */ + 0x200A, /* HAIR SPACE;Zs;0;WS */ + 0x202F, /* NARROW NO-BREAK SPACE;Zs;0;CS */ + 0x205F, /* MEDIUM MATHEMATICAL SPACE;Zs;0;WS */ + 0x3000, /* IDEOGRAPHIC SPACE;Zs;0;WS */ +}; + +#endif // _PROPLIST_H_INCLUDED_ diff --git a/src/kde/kioslave/recoll/kio_recoll.cpp b/src/kde/kioslave/recoll/kio_recoll.cpp index 8c26cc99..de5ea7a1 100644 --- a/src/kde/kioslave/recoll/kio_recoll.cpp +++ b/src/kde/kioslave/recoll/kio_recoll.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.22 2008-12-04 11:49:59 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.23 2008-12-05 11:09:31 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -293,7 +293,7 @@ bool RecollProtocol::doSearch(const QueryDesc& qd) // If there is no white space inside the query, then the user // certainly means it as a phrase. bool isreallyaphrase = false; - if (qs.find_first_of(" \t") == string::npos) + if (!TextSplit::hasVisibleWhite(qs)) isreallyaphrase = true; clp = isreallyaphrase ? 
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, qs, 0) : diff --git a/src/qtgui/ssearch_w.cpp b/src/qtgui/ssearch_w.cpp index 4a542abe..032dfef2 100644 --- a/src/qtgui/ssearch_w.cpp +++ b/src/qtgui/ssearch_w.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.25 2008-10-13 11:46:06 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.26 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -107,7 +107,7 @@ void SSearch::startSimpleSearch() // If there is no white space inside the query, then the user // certainly means it as a phrase. bool isreallyaphrase = false; - if (u8.find_first_of(" \t\r\n") == string::npos) + if (!TextSplit::hasVisibleWhite(u8)) isreallyaphrase = true; // Maybe add automatic phrase ? For ALL and ANY, and not if diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp index 400973ea..2515dec0 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: recollq.cpp,v 1.20 2008-10-13 08:23:36 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: recollq.cpp,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -203,7 +203,7 @@ int recollq(RclConfig **cfp, int argc, char **argv) // If there is no white space inside the query, then the user // certainly means it as a phrase. bool isreallyaphrase = false; - if (qs.find_first_of(" \t") == string::npos) + if (!TextSplit::hasVisibleWhite(qs)) isreallyaphrase = true; clp = isreallyaphrase ? 
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, qs, 0) : diff --git a/src/query/wasatorcl.cpp b/src/query/wasatorcl.cpp index c5e92b05..be5930d3 100644 --- a/src/query/wasatorcl.cpp +++ b/src/query/wasatorcl.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.17 2008-11-18 13:25:48 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.18 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -102,7 +102,7 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa) break; } - if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) { + if (TextSplit::hasVisibleWhite((*it)->m_value)) { int slack = (mods & WasaQuery::WQM_PHRASESLACK) ? 10 : 0; Rcl::SClType tp = Rcl::SCLT_PHRASE; if (mods & WasaQuery::WQM_PROX) { diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 1df4d867..4f5605bd 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.26 2008-10-14 07:50:13 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.27 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -380,7 +380,7 @@ bool StringToXapianQ::processUserString(const string &iq, // depends on separators: [paul@dom.net] would still be a word // (span), but [about:me] will probably be handled as a phrase. 
list<string> phrases; - stringToStrings(iq, phrases); + TextSplit::stringToStrings(iq, phrases); // Process each element: textsplit into terms, handle stem/wildcard // expansion and transform into an appropriate Xapian::Query @@ -568,7 +568,7 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, *qp = Xapian::Query(); list<string> patterns; - stringToStrings(m_text, patterns); + TextSplit::stringToStrings(m_text, patterns); list<string> names; for (list<string>::iterator it = patterns.begin(); it != patterns.end(); it++) {