take care of splitting user string with respect to unicode white space, not only ascii
This commit is contained in:
parent
07dc3e0dd0
commit
3414963810
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.36 2007-12-13 06:58:21 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.37 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -59,6 +59,7 @@ enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
|
||||
static int charclasses[256];
|
||||
|
||||
static set<unsigned int> unicign;
|
||||
static set<unsigned int> visiblewhite;
|
||||
static void setcharclasses()
|
||||
{
|
||||
static int init = 0;
|
||||
@ -91,10 +92,15 @@ static void setcharclasses()
|
||||
for (i = 0; i < strlen(special); i++)
|
||||
charclasses[int(special[i])] = special[i];
|
||||
|
||||
for (i = 0; i < sizeof(uniign); i++)
|
||||
for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
|
||||
unicign.insert(uniign[i]);
|
||||
}
|
||||
unicign.insert((unsigned int)-1);
|
||||
|
||||
for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
|
||||
visiblewhite.insert(avsbwht[i]);
|
||||
}
|
||||
|
||||
init = 1;
|
||||
}
|
||||
|
||||
@ -533,6 +539,102 @@ int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
|
||||
return cb.wcnt;
|
||||
}
|
||||
|
||||
bool TextSplit::hasVisibleWhite(const string &in)
|
||||
{
|
||||
setcharclasses();
|
||||
Utf8Iter it(in);
|
||||
for (; !it.eof(); it++) {
|
||||
unsigned int c = *it;
|
||||
LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
|
||||
if (c == (unsigned int)-1) {
|
||||
LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
|
||||
return false;
|
||||
}
|
||||
if (visiblewhite.find(c) != visiblewhite.end())
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <class T> bool u8stringToStrings(const string &s, T &tokens)
|
||||
{
|
||||
setcharclasses();
|
||||
Utf8Iter it(s);
|
||||
|
||||
string current;
|
||||
tokens.clear();
|
||||
enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
|
||||
states state = SPACE;
|
||||
for (; !it.eof(); it++) {
|
||||
unsigned int c = *it;
|
||||
if (visiblewhite.find(c) != visiblewhite.end())
|
||||
c = ' ';
|
||||
LOGDEB3(("TextSplit::stringToStrings: 0x%04x\n", c));
|
||||
if (c == (unsigned int)-1) {
|
||||
LOGERR(("TextSplit::stringToStrings: error while "
|
||||
"scanning UTF-8 string\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (c) {
|
||||
case '"':
|
||||
switch(state) {
|
||||
case SPACE: state = INQUOTE; continue;
|
||||
case TOKEN: goto push_char;
|
||||
case ESCAPE: state = INQUOTE; goto push_char;
|
||||
case INQUOTE: tokens.push_back(current);current.clear();
|
||||
state = SPACE; continue;
|
||||
}
|
||||
break;
|
||||
case '\\':
|
||||
switch(state) {
|
||||
case SPACE:
|
||||
case TOKEN: state=TOKEN; goto push_char;
|
||||
case INQUOTE: state = ESCAPE; continue;
|
||||
case ESCAPE: state = INQUOTE; goto push_char;
|
||||
}
|
||||
break;
|
||||
|
||||
case ' ':
|
||||
case '\t':
|
||||
case '\n':
|
||||
case '\r':
|
||||
switch(state) {
|
||||
case SPACE: continue;
|
||||
case TOKEN: tokens.push_back(current); current.clear();
|
||||
state = SPACE; continue;
|
||||
case INQUOTE:
|
||||
case ESCAPE: goto push_char;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
switch(state) {
|
||||
case ESCAPE: state = INQUOTE; break;
|
||||
case SPACE: state = TOKEN; break;
|
||||
case TOKEN:
|
||||
case INQUOTE: break;
|
||||
}
|
||||
push_char:
|
||||
it.appendchartostring(current);
|
||||
}
|
||||
}
|
||||
|
||||
// End of string. Process residue, and possible error (unfinished quote)
|
||||
switch(state) {
|
||||
case SPACE: break;
|
||||
case TOKEN: tokens.push_back(current); break;
|
||||
case INQUOTE:
|
||||
case ESCAPE: return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TextSplit::stringToStrings(const string &s, list<string> &tokens)
|
||||
{
|
||||
return u8stringToStrings<list<string> >(s, tokens);
|
||||
}
|
||||
|
||||
#else // TEST driver ->
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
@ -16,11 +16,13 @@
|
||||
*/
|
||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||
#define _TEXTSPLIT_H_INCLUDED_
|
||||
/* @(#$Id: textsplit.h,v 1.20 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: textsplit.h,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
using std::list;
|
||||
#endif
|
||||
|
||||
/**
|
||||
@ -74,15 +76,27 @@ public:
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* Split text, emit words and positions.
|
||||
*/
|
||||
/** Split text, emit words and positions. */
|
||||
bool text_to_words(const string &in);
|
||||
|
||||
// Utility functions : these does not need the user to setup a callback
|
||||
// Utility functions: these do not need the user to set up a callback
|
||||
// etc.
|
||||
|
||||
/** Count words in string, as the splitter would generate them */
|
||||
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
||||
|
||||
/** Check if this is visibly not a single block of text */
|
||||
static bool hasVisibleWhite(const string &in);
|
||||
|
||||
/** Split text span into strings, at white space, allowing for substrings
|
||||
* quoted with " . Escaping with \ works as usual inside the quoted areas.
|
||||
* This has to be kept separate from smallut.cpp's stringToStrings, which
|
||||
* basically works only if whitespace is ascii, and which processes
|
||||
* non-utf-8 input (iso-8859 config files work ok). This hopefully
|
||||
* handles all Unicode whitespace, but needs correct utf-8 input
|
||||
*/
|
||||
static bool stringToStrings(const string &s, list<string> &tokens);
|
||||
|
||||
private:
|
||||
Flags m_flags;
|
||||
TextSplitCB *m_cb;
|
||||
|
||||
@ -16,7 +16,9 @@
|
||||
*/
|
||||
#ifndef _PROPLIST_H_INCLUDED_
|
||||
#define _PROPLIST_H_INCLUDED_
|
||||
/* @(#$Id: uproplist.h,v 1.2 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: uproplist.h,v 1.3 2008-12-05 11:09:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
|
||||
/*
|
||||
* A subset of Unicode chars that we consider whitespace when we split text in
|
||||
* words.
|
||||
@ -36,19 +38,22 @@
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
# For documentation, see UCD.html
|
||||
*/
|
||||
|
||||
static const unsigned int uniign[] = {
|
||||
0x0085, /* ; White_Space # Cc <control-0085>*/
|
||||
0x00A0, /* ; White_Space # Zs NO-BREAK SPACE*/
|
||||
0x00A1, /* misc signs, bullet etc... */
|
||||
0x00A2,
|
||||
0x00A3,
|
||||
0x00A4,
|
||||
0x00A5,
|
||||
0x00A6,
|
||||
0x00A9, /* copyright sign */
|
||||
0x00AA,
|
||||
0x00AE, /* registered sign */
|
||||
0x0085, /* NEXT LINE NEL;Cc */
|
||||
0x00A0, /* NO-BREAK SPACE; Zs */
|
||||
0x00A1, /* INVERTED EXCLAMATION MARK;Po */
|
||||
0x00A2, /* CENT SIGN;Sc */
|
||||
0x00A3, /* POUND SIGN;Sc; */
|
||||
0x00A4, /* CURRENCY SIGN;Sc; */
|
||||
0x00A5, /* YEN SIGN;Sc; */
|
||||
0x00A6, /* BROKEN BAR;So */
|
||||
0x00A7, /* SECTION SIGN;So; */
|
||||
0x00A8, /* DIAERESIS;Sk; */
|
||||
0x00A9, /* COPYRIGHT SIGN;So */
|
||||
0x00AA, /* FEMININE ORDINAL INDICATOR;Ll */
|
||||
0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK;Pi */
|
||||
0x00AC, /* NOT SIGN;Sm */
|
||||
0x00AE, /* registered sign */
|
||||
0x1680, /* ; White_Space # Zs OGHAM SPACE MARK*/
|
||||
0x180E, /* ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR*/
|
||||
0x2000, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/
|
||||
@ -181,4 +186,30 @@ static const unsigned int uniign[] = {
|
||||
0xFF61, /* ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/
|
||||
};
|
||||
|
||||
#endif /*PLIST_H_INCLUDED_ */
|
||||
/* Code points that visibly break up a block of text, making it obvious that
 * quoting is needed if a phrase search is wanted. */
static const unsigned int avsbwht[] = {
    /* CHARACTER TABULATION, LINE FEED, CARRIAGE RETURN, SPACE */
    0x0009, 0x000A, 0x000D, 0x0020,
    /* NO-BREAK SPACE, OGHAM SPACE MARK, MONGOLIAN VOWEL SEPARATOR */
    0x00A0, 0x1680, 0x180E,
    /* EN QUAD, EM QUAD, EN SPACE, EM SPACE */
    0x2000, 0x2001, 0x2002, 0x2003,
    /* THREE-PER-EM SPACE, FOUR-PER-EM SPACE, SIX-PER-EM SPACE */
    0x2004, 0x2005, 0x2006,
    /* FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE, HAIR SPACE */
    0x2007, 0x2008, 0x2009, 0x200A,
    /* NARROW NO-BREAK SPACE, MEDIUM MATHEMATICAL SPACE, IDEOGRAPHIC SPACE */
    0x202F, 0x205F, 0x3000,
};
|
||||
|
||||
#endif // _PROPLIST_H_INCLUDED_
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.22 2008-12-04 11:49:59 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: kio_recoll.cpp,v 1.23 2008-12-05 11:09:31 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -293,7 +293,7 @@ bool RecollProtocol::doSearch(const QueryDesc& qd)
|
||||
// If there is no white space inside the query, then the user
|
||||
// certainly means it as a phrase.
|
||||
bool isreallyaphrase = false;
|
||||
if (qs.find_first_of(" \t") == string::npos)
|
||||
if (!TextSplit::hasVisibleWhite(qs))
|
||||
isreallyaphrase = true;
|
||||
clp = isreallyaphrase ?
|
||||
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, qs, 0) :
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.25 2008-10-13 11:46:06 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: ssearch_w.cpp,v 1.26 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -107,7 +107,7 @@ void SSearch::startSimpleSearch()
|
||||
// If there is no white space inside the query, then the user
|
||||
// certainly means it as a phrase.
|
||||
bool isreallyaphrase = false;
|
||||
if (u8.find_first_of(" \t\r\n") == string::npos)
|
||||
if (!TextSplit::hasVisibleWhite(u8))
|
||||
isreallyaphrase = true;
|
||||
|
||||
// Maybe add automatic phrase ? For ALL and ANY, and not if
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: recollq.cpp,v 1.20 2008-10-13 08:23:36 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: recollq.cpp,v 1.21 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -203,7 +203,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
|
||||
// If there is no white space inside the query, then the user
|
||||
// certainly means it as a phrase.
|
||||
bool isreallyaphrase = false;
|
||||
if (qs.find_first_of(" \t") == string::npos)
|
||||
if (!TextSplit::hasVisibleWhite(qs))
|
||||
isreallyaphrase = true;
|
||||
clp = isreallyaphrase ?
|
||||
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, qs, 0) :
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.17 2008-11-18 13:25:48 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.18 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -102,7 +102,7 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
|
||||
break;
|
||||
}
|
||||
|
||||
if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
|
||||
if (TextSplit::hasVisibleWhite((*it)->m_value)) {
|
||||
int slack = (mods & WasaQuery::WQM_PHRASESLACK) ? 10 : 0;
|
||||
Rcl::SClType tp = Rcl::SCLT_PHRASE;
|
||||
if (mods & WasaQuery::WQM_PROX) {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.26 2008-10-14 07:50:13 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.27 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -380,7 +380,7 @@ bool StringToXapianQ::processUserString(const string &iq,
|
||||
// depends on separators: [paul@dom.net] would still be a word
|
||||
// (span), but [about:me] will probably be handled as a phrase.
|
||||
list<string> phrases;
|
||||
stringToStrings(iq, phrases);
|
||||
TextSplit::stringToStrings(iq, phrases);
|
||||
|
||||
// Process each element: textsplit into terms, handle stem/wildcard
|
||||
// expansion and transform into an appropriate Xapian::Query
|
||||
@ -568,7 +568,7 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
||||
*qp = Xapian::Query();
|
||||
|
||||
list<string> patterns;
|
||||
stringToStrings(m_text, patterns);
|
||||
TextSplit::stringToStrings(m_text, patterns);
|
||||
list<string> names;
|
||||
for (list<string>::iterator it = patterns.begin();
|
||||
it != patterns.end(); it++) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user