take care of splitting user string with respect to unicode white space, not only ascii

This commit is contained in:
dockes 2008-12-05 11:09:31 +00:00
parent 07dc3e0dd0
commit 3414963810
8 changed files with 179 additions and 32 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.36 2007-12-13 06:58:21 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.37 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -59,6 +59,7 @@ enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
static int charclasses[256];
static set<unsigned int> unicign;
static set<unsigned int> visiblewhite;
static void setcharclasses()
{
static int init = 0;
@ -91,10 +92,15 @@ static void setcharclasses()
for (i = 0; i < strlen(special); i++)
charclasses[int(special[i])] = special[i];
for (i = 0; i < sizeof(uniign); i++)
for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
unicign.insert(uniign[i]);
}
unicign.insert((unsigned int)-1);
for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
visiblewhite.insert(avsbwht[i]);
}
init = 1;
}
@ -533,6 +539,102 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
return cb.wcnt;
}
/**
 * Tell whether a UTF-8 string contains at least one code point that belongs
 * to the visiblewhite set.
 *
 * @param in text to scan; it is walked code point by code point with Utf8Iter.
 * @return true as soon as a member of visiblewhite is met; false if none is
 *         found, or if the iterator yields (unsigned int)-1, which is
 *         logged as a UTF-8 scanning error.
 */
bool TextSplit::hasVisibleWhite(const string &in)
{
    // The visiblewhite set is filled in by setcharclasses().
    setcharclasses();

    Utf8Iter it(in);
    while (!it.eof()) {
        const unsigned int code = *it;
        LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", code));

        // The iterator signals a decoding problem with an all-ones value.
        if (code == (unsigned int)-1) {
            LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
            return false;
        }

        if (visiblewhite.count(code) != 0)
            return true;

        it++;
    }
    return false;
}
/**
 * Split a UTF-8 string into tokens at white space, with support for
 * double-quoted substrings.
 *
 * Every code point found in the visiblewhite set is treated like an ASCII
 * space. Outside quotes, white space separates tokens and a backslash is an
 * ordinary character. Inside quotes, white space is kept and a backslash
 * escapes the next character (the backslash itself is dropped). A quote met
 * in the middle of an unquoted token is kept literally.
 *
 * @param s      input text, walked code point by code point with Utf8Iter.
 * @param tokens output container (needs clear() and push_back(string)); it
 *               is cleared first. On failure it may hold the tokens that
 *               were completed before the error.
 * @return false on a UTF-8 scanning error or if the input ends inside a
 *         quoted section (including right after a backslash); true
 *         otherwise.
 */
template <class T> bool u8stringToStrings(const string &s, T &tokens)
{
    // The visiblewhite set is filled in by setcharclasses().
    setcharclasses();
    Utf8Iter it(s);

    string current;
    tokens.clear();
    enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
    states state = SPACE;

    for (; !it.eof(); it++) {
        unsigned int c = *it;
        // Fold all "visible white space" code points onto an ASCII space so
        // that the dispatch below only needs to know one separator.
        if (visiblewhite.find(c) != visiblewhite.end())
            c = ' ';
        LOGDEB3(("TextSplit::stringToStrings: 0x%04x\n", c));
        if (c == (unsigned int)-1) {
            LOGERR(("TextSplit::stringToStrings: error while "
                    "scanning UTF-8 string\n"));
            return false;
        }

        // Set when the character at the iterator position must be added to
        // the token being built.
        bool append = false;

        switch (c) {
        case '"':
            switch (state) {
            case SPACE:
                // Opening quote: starts a quoted token, not stored.
                state = INQUOTE;
                break;
            case TOKEN:
                // Quote inside a bare token is an ordinary character.
                append = true;
                break;
            case ESCAPE:
                // Escaped quote inside a quoted section.
                state = INQUOTE;
                append = true;
                break;
            case INQUOTE:
                // Closing quote: the token is complete (it may be empty).
                tokens.push_back(current);
                current.clear();
                state = SPACE;
                break;
            }
            break;

        case '\\':
            switch (state) {
            case SPACE:
            case TOKEN:
                // No escaping outside quotes: keep the backslash.
                state = TOKEN;
                append = true;
                break;
            case INQUOTE:
                state = ESCAPE;
                break;
            case ESCAPE:
                // Escaped backslash.
                state = INQUOTE;
                append = true;
                break;
            }
            break;

        case ' ':
        case '\t':
        case '\n':
        case '\r':
            switch (state) {
            case SPACE:
                break;
            case TOKEN:
                tokens.push_back(current);
                current.clear();
                state = SPACE;
                break;
            case INQUOTE:
                append = true;
                break;
            case ESCAPE:
                // Escaped white space ends the escape like any other
                // escaped character. Without returning to INQUOTE here the
                // following character (e.g. the closing quote) would be
                // taken as escaped too.
                state = INQUOTE;
                append = true;
                break;
            }
            break;

        default:
            switch (state) {
            case ESCAPE: state = INQUOTE; break;
            case SPACE:  state = TOKEN;   break;
            case TOKEN:
            case INQUOTE: break;
            }
            append = true;
            break;
        }

        if (append) {
            // NOTE(review): presumably appends the original UTF-8 bytes of
            // the current code point, not the folded space - confirm in
            // Utf8Iter.
            it.appendchartostring(current);
        }
    }

    // End of string. Process residue, and possible error (unfinished quote)
    switch (state) {
    case SPACE:
        break;
    case TOKEN:
        tokens.push_back(current);
        break;
    case INQUOTE:
    case ESCAPE:
        return false;
    }
    return true;
}
/**
 * Split s into tokens stored in a list<string>.
 *
 * Forwards to the u8stringToStrings() template instantiated for
 * list<string> and returns its result unchanged.
 */
bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
{
    const bool ok = u8stringToStrings<list<string> >(s, tokens);
    return ok;
}
#else // TEST driver ->
#include <unistd.h>

View File

@ -16,11 +16,13 @@
*/
#ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.20 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: textsplit.h,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
#ifndef NO_NAMESPACES
using std::string;
using std::list;
#endif
/**
@ -74,15 +76,27 @@ public:
{
}
/**
* Split text, emit words and positions.
*/
/** Split text, emit words and positions. */
bool text_to_words(const string &in);
// Utility functions : these does not need the user to setup a callback
// Utility functions: these do not need the user to set up a callback
// etc.
/** Count words in string, as the splitter would generate them */
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
/** Check if this is visibly not a single block of text */
static bool hasVisibleWhite(const string &in);
/** Split text span into strings, at white space, allowing for substrings
* quoted with " . Escaping with \ works as usual inside the quoted areas.
* This has to be kept separate from smallut.cpp's stringToStrings, which
* basically works only if whitespace is ascii, and which processes
* non-utf-8 input (iso-8859 config files work ok). This hopefully
* handles all Unicode whitespace, but needs correct utf-8 input
*/
static bool stringToStrings(const string &s, list<string> &tokens);
private:
Flags m_flags;
TextSplitCB *m_cb;

View File

@ -16,7 +16,9 @@
*/
#ifndef _PROPLIST_H_INCLUDED_
#define _PROPLIST_H_INCLUDED_
/* @(#$Id: uproplist.h,v 1.2 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: uproplist.h,v 1.3 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */
/*
* A subset of Unicode chars that we consider whitespace when we split text in
* words.
@ -36,19 +38,22 @@
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see UCD.html
*/
static const unsigned int uniign[] = {
0x0085, /* ; White_Space # Cc <control-0085>*/
0x00A0, /* ; White_Space # Zs NO-BREAK SPACE*/
0x00A1, /* misc signs, bullet etc... */
0x00A2,
0x00A3,
0x00A4,
0x00A5,
0x00A6,
0x00A9, /* copyright sign */
0x00AA,
0x00AE, /* registered sign */
0x0085, /* NEXT LINE NEL;Cc */
0x00A0, /* NO-BREAK SPACE; Zs */
0x00A1, /* INVERTED EXCLAMATION MARK;Po */
0x00A2, /* CENT SIGN;Sc */
0x00A3, /* POUND SIGN;Sc; */
0x00A4, /* CURRENCY SIGN;Sc; */
0x00A5, /* YEN SIGN;Sc; */
0x00A6, /* BROKEN BAR;So */
0x00A7, /* SECTION SIGN;So; */
0x00A8, /* DIAERESIS;Sk; */
0x00A9, /* COPYRIGHT SIGN;So */
0x00AA, /* FEMININE ORDINAL INDICATOR;Ll */
0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK;Pi */
0x00AC, /* NOT SIGN;Sm */
0x00AE, /* registered sign */
0x1680, /* ; White_Space # Zs OGHAM SPACE MARK*/
0x180E, /* ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR*/
0x2000, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
@ -181,4 +186,30 @@ static const unsigned int uniign[] = {
0xFF61, /* ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
};
#endif /*PLIST_H_INCLUDED_ */
/* Code points that would visibly break a block of text, making it obvious
 * that quoting is needed if a phrase search is wanted.
 * setcharclasses() in textsplit.cpp copies this table into the visiblewhite
 * set, which hasVisibleWhite() and the user string splitter consult.
 * NOTE(review): the trailing fields in the per-entry comments (e.g.
 * "Zs;0;WS") look like they were copied from UnicodeData.txt - confirm. */
static const unsigned int avsbwht[] = {
0x0009, /* CHARACTER TABULATION */
0x000A, /* LINE FEED */
0x000D, /* CARRIAGE RETURN */
0x0020, /* SPACE;Zs;0;WS */
0x00A0, /* NO-BREAK SPACE;Zs;0;CS */
0x1680, /* OGHAM SPACE MARK;Zs;0;WS */
0x180E, /* MONGOLIAN VOWEL SEPARATOR;Zs;0;WS */
0x2000, /* EN QUAD;Zs;0;WS */
0x2001, /* EM QUAD;Zs;0;WS */
0x2002, /* EN SPACE;Zs;0;WS */
0x2003, /* EM SPACE;Zs;0;WS */
0x2004, /* THREE-PER-EM SPACE;Zs;0;WS */
0x2005, /* FOUR-PER-EM SPACE;Zs;0;WS */
0x2006, /* SIX-PER-EM SPACE;Zs;0;WS */
0x2007, /* FIGURE SPACE;Zs;0;WS */
0x2008, /* PUNCTUATION SPACE;Zs;0;WS */
0x2009, /* THIN SPACE;Zs;0;WS */
0x200A, /* HAIR SPACE;Zs;0;WS */
0x202F, /* NARROW NO-BREAK SPACE;Zs;0;CS */
0x205F, /* MEDIUM MATHEMATICAL SPACE;Zs;0;WS */
0x3000, /* IDEOGRAPHIC SPACE;Zs;0;WS */
};
#endif // _PROPLIST_H_INCLUDED_

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.22 2008-12-04 11:49:59 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.23 2008-12-05 11:09:31 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -293,7 +293,7 @@ bool RecollProtocol::doSearch(const QueryDesc& qd)
// If there is no white space inside the query, then the user
// certainly means it as a phrase.
bool isreallyaphrase = false;
if (qs.find_first_of(" \t") == string::npos)
if (!TextSplit::hasVisibleWhite(qs))
isreallyaphrase = true;
clp = isreallyaphrase ?
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, qs, 0) :

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.25 2008-10-13 11:46:06 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.26 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -107,7 +107,7 @@ void SSearch::startSimpleSearch()
// If there is no white space inside the query, then the user
// certainly means it as a phrase.
bool isreallyaphrase = false;
if (u8.find_first_of(" \t\r\n") == string::npos)
if (!TextSplit::hasVisibleWhite(u8))
isreallyaphrase = true;
// Maybe add automatic phrase ? For ALL and ANY, and not if

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: recollq.cpp,v 1.20 2008-10-13 08:23:36 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: recollq.cpp,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -203,7 +203,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
// If there is no white space inside the query, then the user
// certainly means it as a phrase.
bool isreallyaphrase = false;
if (qs.find_first_of(" \t") == string::npos)
if (!TextSplit::hasVisibleWhite(qs))
isreallyaphrase = true;
clp = isreallyaphrase ?
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, qs, 0) :

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.17 2008-11-18 13:25:48 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.18 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -102,7 +102,7 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
break;
}
if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
if (TextSplit::hasVisibleWhite((*it)->m_value)) {
int slack = (mods & WasaQuery::WQM_PHRASESLACK) ? 10 : 0;
Rcl::SClType tp = Rcl::SCLT_PHRASE;
if (mods & WasaQuery::WQM_PROX) {

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.26 2008-10-14 07:50:13 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.27 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -380,7 +380,7 @@ bool StringToXapianQ::processUserString(const string &iq,
// depends on separators: [paul@dom.net] would still be a word
// (span), but [about:me] will probably be handled as a phrase.
list<string> phrases;
stringToStrings(iq, phrases);
TextSplit::stringToStrings(iq, phrases);
// Process each element: textsplit into terms, handle stem/wildcard
// expansion and transform into an appropriate Xapian::Query
@ -568,7 +568,7 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
*qp = Xapian::Query();
list<string> patterns;
stringToStrings(m_text, patterns);
TextSplit::stringToStrings(m_text, patterns);
list<string> names;
for (list<string>::iterator it = patterns.begin();
it != patterns.end(); it++) {