handle wildcards in search terms

This commit is contained in:
dockes 2007-01-18 12:09:58 +00:00
parent 2ca3f087ab
commit d12021b22c
3 changed files with 74 additions and 26 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.27 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.28 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -52,7 +52,7 @@ using namespace std;
// The array is actually a remnant of the original version which did no utf8
// It could be reduced to 128, because real (over 128) utf8 chars are now
// handled with a set holding all the separator values.
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
static int charclasses[256];
static set<unsigned int> unicign;
@ -76,10 +76,14 @@ static void setcharclasses()
for (i = 0; i < strlen(blankspace); i++)
charclasses[int(blankspace[i])] = SPACE;
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;*`?";
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;`";
for (i = 0; i < strlen(seps); i++)
charclasses[int(seps[i])] = SPACE;
char wild[] = "*?";
for (i = 0; i < strlen(wild); i++)
charclasses[int(wild[i])] = WILD;
char special[] = ".@+-,#'\n\r";
for (i = 0; i < strlen(special); i++)
charclasses[int(special[i])] = special[i];
@ -244,6 +248,12 @@ bool TextSplit::text_to_words(const string &in)
number = false;
}
break;
case WILD:
if (m_flags & TXTS_KEEPWILD)
goto NORMALCHAR;
else
goto SPACE;
break;
case '-':
case '+':
if (wordLen == 0) {
@ -338,6 +348,7 @@ bool TextSplit::text_to_words(const string &in)
break;
default:
NORMALCHAR:
wordLen += it.appendchartostring(span);
break;
}
@ -426,6 +437,7 @@ static string usage =
" -S: no output\n"
" -s: only spans\n"
" -w: only words\n"
" -k: preserve wildcards (?*)\n"
" -c: just count words\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
" \n\n"
@ -443,6 +455,7 @@ static int op_flags;
#define OPT_w 0x2
#define OPT_S 0x4
#define OPT_c 0x8
#define OPT_k 0x10
int main(int argc, char **argv)
{
@ -457,6 +470,7 @@ int main(int argc, char **argv)
while (**argv)
switch (*(*argv)++) {
case 'c': op_flags |= OPT_c; break;
case 'k': op_flags |= OPT_k; break;
case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; break;
case 'w': op_flags |= OPT_w; break;
@ -477,6 +491,8 @@ int main(int argc, char **argv)
flags = TextSplit::TXTS_ONLYSPANS;
else if (op_flags&OPT_w)
flags = TextSplit::TXTS_NOSPANS;
if (op_flags & OPT_k)
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
string data;
if (argc == 1) {

View File

@ -16,7 +16,7 @@
*/
#ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.15 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: textsplit.h,v 1.16 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#ifndef NO_NAMESPACES
@ -44,7 +44,8 @@ public:
*/
class TextSplit {
public:
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2};
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2,
TXTS_KEEPWILD = 4};
/**
* Constructor: just store callback object
*/

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.9 2007-01-18 12:09:58 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -152,7 +152,6 @@ public:
: m_db(db), m_stemlang(stmlng)
{ }
bool translate(const string &iq,
const string &prefix,
string &ermsg,
@ -168,7 +167,8 @@ public:
}
private:
void maybeStemExp(bool dont, const string& term, list<string>& exp);
void maybeStemExp(bool dont, const string& term, list<string>& exp,
string& sterm);
Db& m_db;
const string& m_stemlang;
@ -178,22 +178,37 @@ private:
vector<vector<string> > m_groups;
};
/** Make term dumb and possibly expand it into its stem siblings. */
/** Unaccent and lowercase term, possibly expand stem and wildcards
*
* @param nostemexp don't perform stem expansion. This is mainly used to
* prevent stem expansion inside phrases. 3 other factors can turn
* stem expansion off: an empty stemlang, resulting from a global user
* preference, a capitalized term, or wildcard characters (* or ?) in the term.
* @param term input single word
* @param exp output expansion list
* @param sterm output lower-cased+unaccented version of the input term
* (only if stem expansion actually occurred, else empty)
*/
void StringToXapianQ::maybeStemExp(bool nostemexp,
const string& term,
list<string>& exp)
list<string>& exp,
string &sterm)
{
LOGDEB2(("maybeStemExp: term [%s] stemlang [%s] nostemexp %d\n",
term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase();
if (term.empty()) {
exp.clear();
return;
}
// term1 is lowercase and without diacritics
string term1;
dumb_string(term, term1);
if (m_stemlang.empty())
bool haswild = term.find_first_of("*?") != string::npos;
// No stemming if there are wildcards or prevented globally.
if (haswild || m_stemlang.empty())
nostemexp = true;
if (!nostemexp) {
@ -201,7 +216,7 @@ void StringToXapianQ::maybeStemExp(bool nostemexp,
// case we do not want to do stem expansion. Note that
// the test is convoluted and possibly problematic
string noacterm,noaclowterm;
string noacterm, noaclowterm;
if (unacmaybefold(term, noacterm, "UTF-8", false) &&
unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
Utf8Iter it1(noacterm);
@ -209,14 +224,20 @@ void StringToXapianQ::maybeStemExp(bool nostemexp,
if (*it1 != *it2)
nostemexp = true;
}
LOGDEB1(("Term: %s stem expansion: %s\n", term.c_str()));
}
if (nostemexp) {
exp = list<string>(1, term1);
if (nostemexp && !haswild) {
// Neither stemming nor wildcard expansion: just the word
exp.push_front(term1);
exp.resize(1);
} else {
list<TermMatchEntry> l;
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
if (haswild) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);
} else {
sterm = term1;
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
}
for (list<TermMatchEntry>::const_iterator it = l.begin();
it != l.end(); it++) {
exp.push_back(it->term);
@ -285,7 +306,6 @@ bool StringToXapianQ::translate(const string &iq,
list<Xapian::Query> &pqueries,
int slack, bool useNear)
{
string qstring = iq;
LOGDEB2(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
ermsg.erase();
m_terms.clear();
@ -293,7 +313,7 @@ bool StringToXapianQ::translate(const string &iq,
// Split into words and phrases (word1 word2 "this is a phrase"):
list<string> phrases;
stringToStrings(qstring, phrases);
stringToStrings(iq, phrases);
// Then process each word/phrase: split into terms and transform
// into appropriate Xapian Query
@ -307,9 +327,13 @@ bool StringToXapianQ::translate(const string &iq,
// a span would fail if we didn't adjust the proximity to
// account for the additional span term which is complicated.
wsQData splitDataS, splitDataW;
TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS);
TextSplit splitterS(&splitDataS, (TextSplit::Flags)
(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD));
splitterS.text_to_words(*it);
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
TextSplit splitterW(&splitDataW, (TextSplit::Flags)
(TextSplit::TXTS_NOSPANS |
TextSplit::TXTS_KEEPWILD));
splitterW.text_to_words(*it);
wsQData *splitData = &splitDataS;
if (splitDataS.terms.size() > 1 &&
@ -324,12 +348,19 @@ bool StringToXapianQ::translate(const string &iq,
{
string term = splitData->terms.front();
list<string> exp;
maybeStemExp(false, term, exp);
string sterm;
maybeStemExp(false, term, exp, sterm);
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
// Push either term or OR of stem-expanded set
addPrefix(exp, prefix);
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end()));
Xapian::Query xq(Xapian::Query::OP_OR,
exp.begin(), exp.end());
// Give a relevance boost to the original term
if (!sterm.empty()) {
xq = Xapian::Query(Xapian::Query::OP_OR,
xq, Xapian::Query(sterm, 10));
}
pqueries.push_back(xq);
}
break;
@ -347,9 +378,9 @@ bool StringToXapianQ::translate(const string &iq,
bool nostemexp =
(op == Xapian::Query::OP_PHRASE || hadmultiple) ?
true : false;
string sterm;
list<string>exp;
maybeStemExp(nostemexp, *it, exp);
maybeStemExp(nostemexp, *it, exp, sterm);
groups.push_back(vector<string>(exp.begin(), exp.end()));
addPrefix(exp, prefix);