handle wildcards in search terms

This commit is contained in:
dockes 2007-01-18 12:09:58 +00:00
parent 2ca3f087ab
commit d12021b22c
3 changed files with 74 additions and 26 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.27 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.28 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -52,7 +52,7 @@ using namespace std;
// The array is actually a remnant of the original version which did no utf8 // The array is actually a remnant of the original version which did no utf8
// It could be reduced to 128, because real (over 128) utf8 chars are now // It could be reduced to 128, because real (over 128) utf8 chars are now
// handled with a set holding all the separator values. // handled with a set holding all the separator values.
enum CharClass {LETTER=256, SPACE=257, DIGIT=258}; enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
static int charclasses[256]; static int charclasses[256];
static set<unsigned int> unicign; static set<unsigned int> unicign;
@ -76,10 +76,14 @@ static void setcharclasses()
for (i = 0; i < strlen(blankspace); i++) for (i = 0; i < strlen(blankspace); i++)
charclasses[int(blankspace[i])] = SPACE; charclasses[int(blankspace[i])] = SPACE;
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;*`?"; char seps[] = "!\"$%&()/<=>[\\]^{|}~:;`";
for (i = 0; i < strlen(seps); i++) for (i = 0; i < strlen(seps); i++)
charclasses[int(seps[i])] = SPACE; charclasses[int(seps[i])] = SPACE;
char wild[] = "*?";
for (i = 0; i < strlen(wild); i++)
charclasses[int(wild[i])] = WILD;
char special[] = ".@+-,#'\n\r"; char special[] = ".@+-,#'\n\r";
for (i = 0; i < strlen(special); i++) for (i = 0; i < strlen(special); i++)
charclasses[int(special[i])] = special[i]; charclasses[int(special[i])] = special[i];
@ -244,6 +248,12 @@ bool TextSplit::text_to_words(const string &in)
number = false; number = false;
} }
break; break;
case WILD:
if (m_flags & TXTS_KEEPWILD)
goto NORMALCHAR;
else
goto SPACE;
break;
case '-': case '-':
case '+': case '+':
if (wordLen == 0) { if (wordLen == 0) {
@ -338,6 +348,7 @@ bool TextSplit::text_to_words(const string &in)
break; break;
default: default:
NORMALCHAR:
wordLen += it.appendchartostring(span); wordLen += it.appendchartostring(span);
break; break;
} }
@ -426,6 +437,7 @@ static string usage =
" -S: no output\n" " -S: no output\n"
" -s: only spans\n" " -s: only spans\n"
" -w: only words\n" " -w: only words\n"
" -k: preserve wildcards (?*)\n"
" -c: just count words\n" " -c: just count words\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n"
" \n\n" " \n\n"
@ -443,6 +455,7 @@ static int op_flags;
#define OPT_w 0x2 #define OPT_w 0x2
#define OPT_S 0x4 #define OPT_S 0x4
#define OPT_c 0x8 #define OPT_c 0x8
#define OPT_k 0x10
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
@ -457,6 +470,7 @@ int main(int argc, char **argv)
while (**argv) while (**argv)
switch (*(*argv)++) { switch (*(*argv)++) {
case 'c': op_flags |= OPT_c; break; case 'c': op_flags |= OPT_c; break;
case 'k': op_flags |= OPT_k; break;
case 's': op_flags |= OPT_s; break; case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; break; case 'S': op_flags |= OPT_S; break;
case 'w': op_flags |= OPT_w; break; case 'w': op_flags |= OPT_w; break;
@ -477,6 +491,8 @@ int main(int argc, char **argv)
flags = TextSplit::TXTS_ONLYSPANS; flags = TextSplit::TXTS_ONLYSPANS;
else if (op_flags&OPT_w) else if (op_flags&OPT_w)
flags = TextSplit::TXTS_NOSPANS; flags = TextSplit::TXTS_NOSPANS;
if (op_flags & OPT_k)
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
string data; string data;
if (argc == 1) { if (argc == 1) {

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _TEXTSPLIT_H_INCLUDED_ #ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.15 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: textsplit.h,v 1.16 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
@ -44,7 +44,8 @@ public:
*/ */
class TextSplit { class TextSplit {
public: public:
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2}; enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2,
TXTS_KEEPWILD = 4};
/** /**
* Constructor: just store callback object * Constructor: just store callback object
*/ */

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes"; static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.9 2007-01-18 12:09:58 dockes Exp $ (C) 2006 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -152,7 +152,6 @@ public:
: m_db(db), m_stemlang(stmlng) : m_db(db), m_stemlang(stmlng)
{ } { }
bool translate(const string &iq, bool translate(const string &iq,
const string &prefix, const string &prefix,
string &ermsg, string &ermsg,
@ -168,7 +167,8 @@ public:
} }
private: private:
void maybeStemExp(bool dont, const string& term, list<string>& exp); void maybeStemExp(bool dont, const string& term, list<string>& exp,
string& sterm);
Db& m_db; Db& m_db;
const string& m_stemlang; const string& m_stemlang;
@ -178,22 +178,37 @@ private:
vector<vector<string> > m_groups; vector<vector<string> > m_groups;
}; };
/** Make term dumb and possibly expand it into its stem siblings. */ /** Unaccent and lowercase term, possibly expand stem and wildcards
*
* @param nostemexp don't perform stem expansion. This is mainly used to
* prevent stem expansion inside phrases. 2 other factors can turn
* stem expansion off: a null stemlang, resulting from a global user
* preference, or a capitalized term.
* @param term input single word
* @param exp output expansion list
* @param sterm output lower-cased+unaccented version of the input term
* (only if stem expansion actually occurred, else empty)
*/
void StringToXapianQ::maybeStemExp(bool nostemexp, void StringToXapianQ::maybeStemExp(bool nostemexp,
const string& term, const string& term,
list<string>& exp) list<string>& exp,
string &sterm)
{ {
LOGDEB2(("maybeStemExp: term [%s] stemlang [%s] nostemexp %d\n", LOGDEB2(("maybeStemExp: term [%s] stemlang [%s] nostemexp %d\n",
term.c_str(), m_stemlang.c_str(), nostemexp)); term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase();
if (term.empty()) { if (term.empty()) {
exp.clear(); exp.clear();
return; return;
} }
// term1 is lowercase and without diacritics
string term1; string term1;
dumb_string(term, term1); dumb_string(term, term1);
if (m_stemlang.empty()) bool haswild = term.find_first_of("*?") != string::npos;
// No stemming if there are wildcards or prevented globally.
if (haswild || m_stemlang.empty())
nostemexp = true; nostemexp = true;
if (!nostemexp) { if (!nostemexp) {
@ -201,7 +216,7 @@ void StringToXapianQ::maybeStemExp(bool nostemexp,
// case we do not want to do stem expansion. Note that // case we do not want to do stem expansion. Note that
// the test is convoluted and possibly problematic // the test is convoluted and possibly problematic
string noacterm,noaclowterm; string noacterm, noaclowterm;
if (unacmaybefold(term, noacterm, "UTF-8", false) && if (unacmaybefold(term, noacterm, "UTF-8", false) &&
unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) { unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
Utf8Iter it1(noacterm); Utf8Iter it1(noacterm);
@ -209,14 +224,20 @@ void StringToXapianQ::maybeStemExp(bool nostemexp,
if (*it1 != *it2) if (*it1 != *it2)
nostemexp = true; nostemexp = true;
} }
LOGDEB1(("Term: %s stem expansion: %s\n", term.c_str()));
} }
if (nostemexp) { if (nostemexp && !haswild) {
exp = list<string>(1, term1); // Neither stemming nor wildcard expansion: just the word
exp.push_front(term1);
exp.resize(1);
} else { } else {
list<TermMatchEntry> l; list<TermMatchEntry> l;
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l); if (haswild) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);
} else {
sterm = term1;
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
}
for (list<TermMatchEntry>::const_iterator it = l.begin(); for (list<TermMatchEntry>::const_iterator it = l.begin();
it != l.end(); it++) { it != l.end(); it++) {
exp.push_back(it->term); exp.push_back(it->term);
@ -285,7 +306,6 @@ bool StringToXapianQ::translate(const string &iq,
list<Xapian::Query> &pqueries, list<Xapian::Query> &pqueries,
int slack, bool useNear) int slack, bool useNear)
{ {
string qstring = iq;
LOGDEB2(("StringToXapianQ:: query string: [%s]\n", iq.c_str())); LOGDEB2(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
ermsg.erase(); ermsg.erase();
m_terms.clear(); m_terms.clear();
@ -293,7 +313,7 @@ bool StringToXapianQ::translate(const string &iq,
// Split into words and phrases (word1 word2 "this is a phrase"): // Split into words and phrases (word1 word2 "this is a phrase"):
list<string> phrases; list<string> phrases;
stringToStrings(qstring, phrases); stringToStrings(iq, phrases);
// Then process each word/phrase: split into terms and transform // Then process each word/phrase: split into terms and transform
// into appropriate Xapian Query // into appropriate Xapian Query
@ -307,9 +327,13 @@ bool StringToXapianQ::translate(const string &iq,
// a span would fail if we didn't adjust the proximity to // a span would fail if we didn't adjust the proximity to
// account for the additional span term which is complicated. // account for the additional span term which is complicated.
wsQData splitDataS, splitDataW; wsQData splitDataS, splitDataW;
TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS); TextSplit splitterS(&splitDataS, (TextSplit::Flags)
(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD));
splitterS.text_to_words(*it); splitterS.text_to_words(*it);
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS); TextSplit splitterW(&splitDataW, (TextSplit::Flags)
(TextSplit::TXTS_NOSPANS |
TextSplit::TXTS_KEEPWILD));
splitterW.text_to_words(*it); splitterW.text_to_words(*it);
wsQData *splitData = &splitDataS; wsQData *splitData = &splitDataS;
if (splitDataS.terms.size() > 1 && if (splitDataS.terms.size() > 1 &&
@ -324,12 +348,19 @@ bool StringToXapianQ::translate(const string &iq,
{ {
string term = splitData->terms.front(); string term = splitData->terms.front();
list<string> exp; list<string> exp;
maybeStemExp(false, term, exp); string sterm;
maybeStemExp(false, term, exp, sterm);
m_terms.insert(m_terms.end(), exp.begin(), exp.end()); m_terms.insert(m_terms.end(), exp.begin(), exp.end());
// Push either term or OR of stem-expanded set // Push either term or OR of stem-expanded set
addPrefix(exp, prefix); addPrefix(exp, prefix);
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, Xapian::Query xq(Xapian::Query::OP_OR,
exp.begin(), exp.end())); exp.begin(), exp.end());
// Give a relevance boost to the original term
if (!sterm.empty()) {
xq = Xapian::Query(Xapian::Query::OP_OR,
xq, Xapian::Query(sterm, 10));
}
pqueries.push_back(xq);
} }
break; break;
@ -347,9 +378,9 @@ bool StringToXapianQ::translate(const string &iq,
bool nostemexp = bool nostemexp =
(op == Xapian::Query::OP_PHRASE || hadmultiple) ? (op == Xapian::Query::OP_PHRASE || hadmultiple) ?
true : false; true : false;
string sterm;
list<string>exp; list<string>exp;
maybeStemExp(nostemexp, *it, exp); maybeStemExp(nostemexp, *it, exp, sterm);
groups.push_back(vector<string>(exp.begin(), exp.end())); groups.push_back(vector<string>(exp.begin(), exp.end()));
addPrefix(exp, prefix); addPrefix(exp, prefix);