From d12021b22cb40d3c40234653a0d4c6756edb9a39 Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 18 Jan 2007 12:09:58 +0000 Subject: [PATCH] handle wildcards in search terms --- src/common/textsplit.cpp | 22 ++++++++++-- src/common/textsplit.h | 5 +-- src/rcldb/searchdata.cpp | 73 ++++++++++++++++++++++++++++------------ 3 files changed, 74 insertions(+), 26 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 24615dc1..076d220b 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.27 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.28 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -52,7 +52,7 @@ using namespace std; // The array is actually a remnant of the original version which did no utf8 // It could be reduced to 128, because real (over 128) utf8 chars are now // handled with a set holding all the separator values. 
-enum CharClass {LETTER=256, SPACE=257, DIGIT=258}; +enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259}; static int charclasses[256]; static set unicign; @@ -76,10 +76,14 @@ static void setcharclasses() for (i = 0; i < strlen(blankspace); i++) charclasses[int(blankspace[i])] = SPACE; - char seps[] = "!\"$%&()/<=>[\\]^{|}~:;*`?"; + char seps[] = "!\"$%&()/<=>[\\]^{|}~:;`"; for (i = 0; i < strlen(seps); i++) charclasses[int(seps[i])] = SPACE; + char wild[] = "*?"; + for (i = 0; i < strlen(wild); i++) + charclasses[int(wild[i])] = WILD; + char special[] = ".@+-,#'\n\r"; for (i = 0; i < strlen(special); i++) charclasses[int(special[i])] = special[i]; @@ -244,6 +248,12 @@ bool TextSplit::text_to_words(const string &in) number = false; } break; + case WILD: + if (m_flags & TXTS_KEEPWILD) + goto NORMALCHAR; + else + goto SPACE; + break; case '-': case '+': if (wordLen == 0) { @@ -338,6 +348,7 @@ bool TextSplit::text_to_words(const string &in) break; default: + NORMALCHAR: wordLen += it.appendchartostring(span); break; } @@ -426,6 +437,7 @@ static string usage = " -S: no output\n" " -s: only spans\n" " -w: only words\n" + " -k: preserve wildcards (?*)\n" " -c: just count words\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n" " \n\n" @@ -443,6 +455,7 @@ static int op_flags; #define OPT_w 0x2 #define OPT_S 0x4 #define OPT_c 0x8 +#define OPT_k 0x10 int main(int argc, char **argv) { @@ -457,6 +470,7 @@ int main(int argc, char **argv) while (**argv) switch (*(*argv)++) { case 'c': op_flags |= OPT_c; break; + case 'k': op_flags |= OPT_k; break; case 's': op_flags |= OPT_s; break; case 'S': op_flags |= OPT_S; break; case 'w': op_flags |= OPT_w; break; @@ -477,6 +491,8 @@ int main(int argc, char **argv) flags = TextSplit::TXTS_ONLYSPANS; else if (op_flags&OPT_w) flags = TextSplit::TXTS_NOSPANS; + if (op_flags & OPT_k) + flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); string data; if (argc == 1) { diff --git a/src/common/textsplit.h 
b/src/common/textsplit.h index 50bcabaf..b3da5dc3 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -16,7 +16,7 @@ */ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.15 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.16 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes */ #include #ifndef NO_NAMESPACES @@ -44,7 +44,8 @@ public: */ class TextSplit { public: - enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2}; + enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2, + TXTS_KEEPWILD = 4}; /** * Constructor: just store callback object */ diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index d51bce3d..8daae5e4 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.9 2007-01-18 12:09:58 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -152,7 +152,6 @@ public: : m_db(db), m_stemlang(stmlng) { } - bool translate(const string &iq, const string &prefix, string &ermsg, @@ -168,7 +167,8 @@ public: } private: - void maybeStemExp(bool dont, const string& term, list& exp); + void maybeStemExp(bool dont, const string& term, list& exp, + string& sterm); Db& m_db; const string& m_stemlang; @@ -178,22 +178,37 @@ private: vector > m_groups; }; -/** Make term dumb and possibly expand it into its stem siblings. */ +/** Unaccent and lowercase term, possibly expand stem and wildcards + * + * @param nostemexp don't perform stem expansion. This is mainly used to + * prevent stem expansion inside phrases. 2 other factors can turn + * stem expansion off: a null stemlang, resulting from a global user + * preference, or a capitalized term. 
+ * @param term input single word + * @param exp output expansion list + * @param sterm output lower-cased+unaccented version of the input term + * (only if stem expansion actually occurred, else empty) + */ void StringToXapianQ::maybeStemExp(bool nostemexp, const string& term, - list& exp) + list& exp, + string &sterm) { LOGDEB2(("maybeStemExp: term [%s] stemlang [%s] nostemexp %d\n", term.c_str(), m_stemlang.c_str(), nostemexp)); + sterm.erase(); if (term.empty()) { exp.clear(); return; } - + // term1 is lowercase and without diacritics string term1; dumb_string(term, term1); - if (m_stemlang.empty()) + bool haswild = term.find_first_of("*?") != string::npos; + + // No stemming if there are wildcards or prevented globally. + if (haswild || m_stemlang.empty()) nostemexp = true; if (!nostemexp) { @@ -201,7 +216,7 @@ void StringToXapianQ::maybeStemExp(bool nostemexp, // case we do not want to do stem expansion. Note that // the test is convoluted and possibly problematic - string noacterm,noaclowterm; + string noacterm, noaclowterm; if (unacmaybefold(term, noacterm, "UTF-8", false) && unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) { Utf8Iter it1(noacterm); @@ -209,14 +224,20 @@ void StringToXapianQ::maybeStemExp(bool nostemexp, if (*it1 != *it2) nostemexp = true; } - LOGDEB1(("Term: %s stem expansion: %s\n", term.c_str())); } - if (nostemexp) { - exp = list(1, term1); + if (nostemexp && !haswild) { + // Neither stemming nor wildcard expansion: just the word + exp.push_front(term1); + exp.resize(1); } else { list l; - m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l); + if (haswild) { + m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l); + } else { + sterm = term1; + m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l); + } for (list::const_iterator it = l.begin(); it != l.end(); it++) { exp.push_back(it->term); @@ -285,7 +306,6 @@ bool StringToXapianQ::translate(const string &iq, list &pqueries, int slack, bool useNear) { - string qstring = iq; 
LOGDEB2(("StringToXapianQ:: query string: [%s]\n", iq.c_str())); ermsg.erase(); m_terms.clear(); @@ -293,7 +313,7 @@ bool StringToXapianQ::translate(const string &iq, // Split into words and phrases (word1 word2 "this is a phrase"): list phrases; - stringToStrings(qstring, phrases); + stringToStrings(iq, phrases); // Then process each word/phrase: split into terms and transform // into appropriate Xapian Query @@ -307,9 +327,13 @@ bool StringToXapianQ::translate(const string &iq, // a span would fail if we didn't adjust the proximity to // account for the additional span term which is complicated. wsQData splitDataS, splitDataW; - TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS); + TextSplit splitterS(&splitDataS, (TextSplit::Flags) + (TextSplit::TXTS_ONLYSPANS | + TextSplit::TXTS_KEEPWILD)); splitterS.text_to_words(*it); - TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS); + TextSplit splitterW(&splitDataW, (TextSplit::Flags) + (TextSplit::TXTS_NOSPANS | + TextSplit::TXTS_KEEPWILD)); splitterW.text_to_words(*it); wsQData *splitData = &splitDataS; if (splitDataS.terms.size() > 1 && @@ -324,12 +348,19 @@ bool StringToXapianQ::translate(const string &iq, { string term = splitData->terms.front(); list exp; - maybeStemExp(false, term, exp); + string sterm; + maybeStemExp(false, term, exp, sterm); m_terms.insert(m_terms.end(), exp.begin(), exp.end()); // Push either term or OR of stem-expanded set addPrefix(exp, prefix); - pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, - exp.begin(), exp.end())); + Xapian::Query xq(Xapian::Query::OP_OR, + exp.begin(), exp.end()); + // Give a relevance boost to the original term + if (!sterm.empty()) { + xq = Xapian::Query(Xapian::Query::OP_OR, + xq, Xapian::Query(sterm, 10)); + } + pqueries.push_back(xq); } break; @@ -347,9 +378,9 @@ bool StringToXapianQ::translate(const string &iq, bool nostemexp = (op == Xapian::Query::OP_PHRASE || hadmultiple) ? 
true : false; - + string sterm; listexp; - maybeStemExp(nostemexp, *it, exp); + maybeStemExp(nostemexp, *it, exp, sterm); groups.push_back(vector(exp.begin(), exp.end())); addPrefix(exp, prefix);