handle wildcards in search terms
This commit is contained in:
parent
2ca3f087ab
commit
d12021b22c
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.27 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.28 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -52,7 +52,7 @@ using namespace std;
|
|||||||
// The array is actually a remnant of the original version which did no utf8
|
// The array is actually a remnant of the original version which did no utf8
|
||||||
// It could be reduced to 128, because real (over 128) utf8 chars are now
|
// It could be reduced to 128, because real (over 128) utf8 chars are now
|
||||||
// handled with a set holding all the separator values.
|
// handled with a set holding all the separator values.
|
||||||
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
|
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
|
||||||
static int charclasses[256];
|
static int charclasses[256];
|
||||||
|
|
||||||
static set<unsigned int> unicign;
|
static set<unsigned int> unicign;
|
||||||
@ -76,10 +76,14 @@ static void setcharclasses()
|
|||||||
for (i = 0; i < strlen(blankspace); i++)
|
for (i = 0; i < strlen(blankspace); i++)
|
||||||
charclasses[int(blankspace[i])] = SPACE;
|
charclasses[int(blankspace[i])] = SPACE;
|
||||||
|
|
||||||
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;*`?";
|
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;`";
|
||||||
for (i = 0; i < strlen(seps); i++)
|
for (i = 0; i < strlen(seps); i++)
|
||||||
charclasses[int(seps[i])] = SPACE;
|
charclasses[int(seps[i])] = SPACE;
|
||||||
|
|
||||||
|
char wild[] = "*?";
|
||||||
|
for (i = 0; i < strlen(wild); i++)
|
||||||
|
charclasses[int(wild[i])] = WILD;
|
||||||
|
|
||||||
char special[] = ".@+-,#'\n\r";
|
char special[] = ".@+-,#'\n\r";
|
||||||
for (i = 0; i < strlen(special); i++)
|
for (i = 0; i < strlen(special); i++)
|
||||||
charclasses[int(special[i])] = special[i];
|
charclasses[int(special[i])] = special[i];
|
||||||
@ -244,6 +248,12 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
number = false;
|
number = false;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case WILD:
|
||||||
|
if (m_flags & TXTS_KEEPWILD)
|
||||||
|
goto NORMALCHAR;
|
||||||
|
else
|
||||||
|
goto SPACE;
|
||||||
|
break;
|
||||||
case '-':
|
case '-':
|
||||||
case '+':
|
case '+':
|
||||||
if (wordLen == 0) {
|
if (wordLen == 0) {
|
||||||
@ -338,6 +348,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
NORMALCHAR:
|
||||||
wordLen += it.appendchartostring(span);
|
wordLen += it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -426,6 +437,7 @@ static string usage =
|
|||||||
" -S: no output\n"
|
" -S: no output\n"
|
||||||
" -s: only spans\n"
|
" -s: only spans\n"
|
||||||
" -w: only words\n"
|
" -w: only words\n"
|
||||||
|
" -k: preserve wildcards (?*)\n"
|
||||||
" -c: just count words\n"
|
" -c: just count words\n"
|
||||||
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
||||||
" \n\n"
|
" \n\n"
|
||||||
@ -443,6 +455,7 @@ static int op_flags;
|
|||||||
#define OPT_w 0x2
|
#define OPT_w 0x2
|
||||||
#define OPT_S 0x4
|
#define OPT_S 0x4
|
||||||
#define OPT_c 0x8
|
#define OPT_c 0x8
|
||||||
|
#define OPT_k 0x10
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
@ -457,6 +470,7 @@ int main(int argc, char **argv)
|
|||||||
while (**argv)
|
while (**argv)
|
||||||
switch (*(*argv)++) {
|
switch (*(*argv)++) {
|
||||||
case 'c': op_flags |= OPT_c; break;
|
case 'c': op_flags |= OPT_c; break;
|
||||||
|
case 'k': op_flags |= OPT_k; break;
|
||||||
case 's': op_flags |= OPT_s; break;
|
case 's': op_flags |= OPT_s; break;
|
||||||
case 'S': op_flags |= OPT_S; break;
|
case 'S': op_flags |= OPT_S; break;
|
||||||
case 'w': op_flags |= OPT_w; break;
|
case 'w': op_flags |= OPT_w; break;
|
||||||
@ -477,6 +491,8 @@ int main(int argc, char **argv)
|
|||||||
flags = TextSplit::TXTS_ONLYSPANS;
|
flags = TextSplit::TXTS_ONLYSPANS;
|
||||||
else if (op_flags&OPT_w)
|
else if (op_flags&OPT_w)
|
||||||
flags = TextSplit::TXTS_NOSPANS;
|
flags = TextSplit::TXTS_NOSPANS;
|
||||||
|
if (op_flags & OPT_k)
|
||||||
|
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
|
||||||
|
|
||||||
string data;
|
string data;
|
||||||
if (argc == 1) {
|
if (argc == 1) {
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||||
#define _TEXTSPLIT_H_INCLUDED_
|
#define _TEXTSPLIT_H_INCLUDED_
|
||||||
/* @(#$Id: textsplit.h,v 1.15 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: textsplit.h,v 1.16 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
@ -44,7 +44,8 @@ public:
|
|||||||
*/
|
*/
|
||||||
class TextSplit {
|
class TextSplit {
|
||||||
public:
|
public:
|
||||||
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2};
|
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2,
|
||||||
|
TXTS_KEEPWILD = 4};
|
||||||
/**
|
/**
|
||||||
* Constructor: just store callback object
|
* Constructor: just store callback object
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.9 2007-01-18 12:09:58 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -152,7 +152,6 @@ public:
|
|||||||
: m_db(db), m_stemlang(stmlng)
|
: m_db(db), m_stemlang(stmlng)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
|
|
||||||
bool translate(const string &iq,
|
bool translate(const string &iq,
|
||||||
const string &prefix,
|
const string &prefix,
|
||||||
string &ermsg,
|
string &ermsg,
|
||||||
@ -168,7 +167,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void maybeStemExp(bool dont, const string& term, list<string>& exp);
|
void maybeStemExp(bool dont, const string& term, list<string>& exp,
|
||||||
|
string& sterm);
|
||||||
|
|
||||||
Db& m_db;
|
Db& m_db;
|
||||||
const string& m_stemlang;
|
const string& m_stemlang;
|
||||||
@ -178,22 +178,37 @@ private:
|
|||||||
vector<vector<string> > m_groups;
|
vector<vector<string> > m_groups;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Make term dumb and possibly expand it into its stem siblings. */
|
/** Unaccent and lowercase term, possibly expand stem and wildcards
|
||||||
|
*
|
||||||
|
* @param nostemexp don't perform stem expansion. This is mainly used to
|
||||||
|
* prevent stem expansion inside phrases. 2 other factors can turn
|
||||||
|
* stem expansion off: a null stemlang, resulting from a global user
|
||||||
|
* preference, or a capitalized term.
|
||||||
|
* @param term input single word
|
||||||
|
* @param exp output expansion list
|
||||||
|
* @param sterm output lower-cased+unaccented version of the input term
|
||||||
|
* (only if stem expansion actually occurred, else empty)
|
||||||
|
*/
|
||||||
void StringToXapianQ::maybeStemExp(bool nostemexp,
|
void StringToXapianQ::maybeStemExp(bool nostemexp,
|
||||||
const string& term,
|
const string& term,
|
||||||
list<string>& exp)
|
list<string>& exp,
|
||||||
|
string &sterm)
|
||||||
{
|
{
|
||||||
LOGDEB2(("maybeStemExp: term [%s] stemlang [%s] nostemexp %d\n",
|
LOGDEB2(("maybeStemExp: term [%s] stemlang [%s] nostemexp %d\n",
|
||||||
term.c_str(), m_stemlang.c_str(), nostemexp));
|
term.c_str(), m_stemlang.c_str(), nostemexp));
|
||||||
|
sterm.erase();
|
||||||
if (term.empty()) {
|
if (term.empty()) {
|
||||||
exp.clear();
|
exp.clear();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
// term1 is lowercase and without diacritics
|
||||||
string term1;
|
string term1;
|
||||||
dumb_string(term, term1);
|
dumb_string(term, term1);
|
||||||
|
|
||||||
if (m_stemlang.empty())
|
bool haswild = term.find_first_of("*?") != string::npos;
|
||||||
|
|
||||||
|
// No stemming if there are wildcards or prevented globally.
|
||||||
|
if (haswild || m_stemlang.empty())
|
||||||
nostemexp = true;
|
nostemexp = true;
|
||||||
|
|
||||||
if (!nostemexp) {
|
if (!nostemexp) {
|
||||||
@ -201,7 +216,7 @@ void StringToXapianQ::maybeStemExp(bool nostemexp,
|
|||||||
// case we do not want to do stem expansion. Note that
|
// case we do not want to do stem expansion. Note that
|
||||||
// the test is convoluted and possibly problematic
|
// the test is convoluted and possibly problematic
|
||||||
|
|
||||||
string noacterm,noaclowterm;
|
string noacterm, noaclowterm;
|
||||||
if (unacmaybefold(term, noacterm, "UTF-8", false) &&
|
if (unacmaybefold(term, noacterm, "UTF-8", false) &&
|
||||||
unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
||||||
Utf8Iter it1(noacterm);
|
Utf8Iter it1(noacterm);
|
||||||
@ -209,14 +224,20 @@ void StringToXapianQ::maybeStemExp(bool nostemexp,
|
|||||||
if (*it1 != *it2)
|
if (*it1 != *it2)
|
||||||
nostemexp = true;
|
nostemexp = true;
|
||||||
}
|
}
|
||||||
LOGDEB1(("Term: %s stem expansion: %s\n", term.c_str()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nostemexp) {
|
if (nostemexp && !haswild) {
|
||||||
exp = list<string>(1, term1);
|
// Neither stemming nor wildcard expansion: just the word
|
||||||
|
exp.push_front(term1);
|
||||||
|
exp.resize(1);
|
||||||
} else {
|
} else {
|
||||||
list<TermMatchEntry> l;
|
list<TermMatchEntry> l;
|
||||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
|
if (haswild) {
|
||||||
|
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);
|
||||||
|
} else {
|
||||||
|
sterm = term1;
|
||||||
|
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
|
||||||
|
}
|
||||||
for (list<TermMatchEntry>::const_iterator it = l.begin();
|
for (list<TermMatchEntry>::const_iterator it = l.begin();
|
||||||
it != l.end(); it++) {
|
it != l.end(); it++) {
|
||||||
exp.push_back(it->term);
|
exp.push_back(it->term);
|
||||||
@ -285,7 +306,6 @@ bool StringToXapianQ::translate(const string &iq,
|
|||||||
list<Xapian::Query> &pqueries,
|
list<Xapian::Query> &pqueries,
|
||||||
int slack, bool useNear)
|
int slack, bool useNear)
|
||||||
{
|
{
|
||||||
string qstring = iq;
|
|
||||||
LOGDEB2(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
LOGDEB2(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
||||||
ermsg.erase();
|
ermsg.erase();
|
||||||
m_terms.clear();
|
m_terms.clear();
|
||||||
@ -293,7 +313,7 @@ bool StringToXapianQ::translate(const string &iq,
|
|||||||
|
|
||||||
// Split into words and phrases (word1 word2 "this is a phrase"):
|
// Split into words and phrases (word1 word2 "this is a phrase"):
|
||||||
list<string> phrases;
|
list<string> phrases;
|
||||||
stringToStrings(qstring, phrases);
|
stringToStrings(iq, phrases);
|
||||||
|
|
||||||
// Then process each word/phrase: split into terms and transform
|
// Then process each word/phrase: split into terms and transform
|
||||||
// into appropriate Xapian Query
|
// into appropriate Xapian Query
|
||||||
@ -307,9 +327,13 @@ bool StringToXapianQ::translate(const string &iq,
|
|||||||
// a span would fail if we didn't adjust the proximity to
|
// a span would fail if we didn't adjust the proximity to
|
||||||
// account for the additional span term which is complicated.
|
// account for the additional span term which is complicated.
|
||||||
wsQData splitDataS, splitDataW;
|
wsQData splitDataS, splitDataW;
|
||||||
TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS);
|
TextSplit splitterS(&splitDataS, (TextSplit::Flags)
|
||||||
|
(TextSplit::TXTS_ONLYSPANS |
|
||||||
|
TextSplit::TXTS_KEEPWILD));
|
||||||
splitterS.text_to_words(*it);
|
splitterS.text_to_words(*it);
|
||||||
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
|
TextSplit splitterW(&splitDataW, (TextSplit::Flags)
|
||||||
|
(TextSplit::TXTS_NOSPANS |
|
||||||
|
TextSplit::TXTS_KEEPWILD));
|
||||||
splitterW.text_to_words(*it);
|
splitterW.text_to_words(*it);
|
||||||
wsQData *splitData = &splitDataS;
|
wsQData *splitData = &splitDataS;
|
||||||
if (splitDataS.terms.size() > 1 &&
|
if (splitDataS.terms.size() > 1 &&
|
||||||
@ -324,12 +348,19 @@ bool StringToXapianQ::translate(const string &iq,
|
|||||||
{
|
{
|
||||||
string term = splitData->terms.front();
|
string term = splitData->terms.front();
|
||||||
list<string> exp;
|
list<string> exp;
|
||||||
maybeStemExp(false, term, exp);
|
string sterm;
|
||||||
|
maybeStemExp(false, term, exp, sterm);
|
||||||
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
||||||
// Push either term or OR of stem-expanded set
|
// Push either term or OR of stem-expanded set
|
||||||
addPrefix(exp, prefix);
|
addPrefix(exp, prefix);
|
||||||
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
Xapian::Query xq(Xapian::Query::OP_OR,
|
||||||
exp.begin(), exp.end()));
|
exp.begin(), exp.end());
|
||||||
|
// Give a relevance boost to the original term
|
||||||
|
if (!sterm.empty()) {
|
||||||
|
xq = Xapian::Query(Xapian::Query::OP_OR,
|
||||||
|
xq, Xapian::Query(sterm, 10));
|
||||||
|
}
|
||||||
|
pqueries.push_back(xq);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@ -347,9 +378,9 @@ bool StringToXapianQ::translate(const string &iq,
|
|||||||
bool nostemexp =
|
bool nostemexp =
|
||||||
(op == Xapian::Query::OP_PHRASE || hadmultiple) ?
|
(op == Xapian::Query::OP_PHRASE || hadmultiple) ?
|
||||||
true : false;
|
true : false;
|
||||||
|
string sterm;
|
||||||
list<string>exp;
|
list<string>exp;
|
||||||
maybeStemExp(nostemexp, *it, exp);
|
maybeStemExp(nostemexp, *it, exp, sterm);
|
||||||
|
|
||||||
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
||||||
addPrefix(exp, prefix);
|
addPrefix(exp, prefix);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user