make searchdata a more flexible struct
This commit is contained in:
parent
1d7f103fe7
commit
cdbf026738
@ -8,8 +8,8 @@ LIBS = librcl.a
|
||||
|
||||
all: $(LIBS)
|
||||
|
||||
OBJS = conftree.o csguess.o debuglog.o execmd.o idfile.o md5.o wipedir.o fstreewalk.o mh_html.o mh_mail.o mh_exec.o mh_text.o htmlparse.o indexer.o internfile.o mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o rclconfig.o rcldb.o rclinit.o stemdb.o base64.o readfile.o smallut.o textsplit.o transcode.o unacpp.o history.o docseq.o sortseq.o copyfile.o rclaspell.o
|
||||
DEPS = conftree.dep.stamp csguess.dep.stamp debuglog.dep.stamp execmd.dep.stamp idfile.dep.stamp md5.dep.stamp wipedir.dep.stamp fstreewalk.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_exec.dep.stamp mh_text.dep.stamp htmlparse.dep.stamp indexer.dep.stamp internfile.dep.stamp mimehandler.dep.stamp mimeparse.dep.stamp mimetype.dep.stamp myhtmlparse.dep.stamp pathhash.dep.stamp pathut.dep.stamp rclconfig.dep.stamp rcldb.dep.stamp rclinit.dep.stamp stemdb.dep.stamp base64.dep.stamp readfile.dep.stamp smallut.dep.stamp textsplit.dep.stamp transcode.dep.stamp unacpp.dep.stamp history.dep.stamp docseq.dep.stamp sortseq.dep.stamp copyfile.dep.stamp rclaspell.dep.stamp
|
||||
OBJS = conftree.o csguess.o debuglog.o execmd.o idfile.o md5.o wipedir.o fstreewalk.o mh_html.o mh_mail.o searchdata.o mh_exec.o mh_text.o htmlparse.o indexer.o internfile.o mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o rclconfig.o rcldb.o rclinit.o stemdb.o base64.o readfile.o smallut.o textsplit.o transcode.o unacpp.o history.o docseq.o sortseq.o copyfile.o rclaspell.o
|
||||
DEPS = conftree.dep.stamp csguess.dep.stamp debuglog.dep.stamp execmd.dep.stamp idfile.dep.stamp md5.dep.stamp wipedir.dep.stamp fstreewalk.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp searchdata.dep.stamp mh_exec.dep.stamp mh_text.dep.stamp htmlparse.dep.stamp indexer.dep.stamp internfile.dep.stamp mimehandler.dep.stamp mimeparse.dep.stamp mimetype.dep.stamp myhtmlparse.dep.stamp pathhash.dep.stamp pathut.dep.stamp rclconfig.dep.stamp rcldb.dep.stamp rclinit.dep.stamp stemdb.dep.stamp base64.dep.stamp readfile.dep.stamp smallut.dep.stamp textsplit.dep.stamp transcode.dep.stamp unacpp.dep.stamp history.dep.stamp docseq.dep.stamp sortseq.dep.stamp copyfile.dep.stamp rclaspell.dep.stamp
|
||||
|
||||
librcl.a : $(DEPS) $(OBJS) unac.o
|
||||
ar ru librcl.a $(OBJS) unac.o
|
||||
@ -37,6 +37,8 @@ mh_html.o : ../common/mh_html.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../common/mh_html.cpp
|
||||
mh_mail.o : ../common/mh_mail.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../common/mh_mail.cpp
|
||||
searchdata.o : ../common/searchdata.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../common/searchdata.cpp
|
||||
mh_exec.o : ../common/mh_exec.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../common/mh_exec.cpp
|
||||
mh_text.o : ../common/mh_text.cpp
|
||||
@ -125,6 +127,9 @@ mh_html.dep.stamp : ../common/mh_html.cpp
|
||||
mh_mail.dep.stamp : ../common/mh_mail.cpp
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../common/mh_mail.cpp > mh_mail.dep
|
||||
touch mh_mail.dep.stamp
|
||||
searchdata.dep.stamp : ../common/searchdata.cpp
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../common/searchdata.cpp > searchdata.dep
|
||||
touch searchdata.dep.stamp
|
||||
mh_exec.dep.stamp : ../common/mh_exec.cpp
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../common/mh_exec.cpp > mh_exec.dep
|
||||
touch mh_exec.dep.stamp
|
||||
@ -213,6 +218,7 @@ include wipedir.dep
|
||||
include fstreewalk.dep
|
||||
include mh_html.dep
|
||||
include mh_mail.dep
|
||||
include searchdata.dep
|
||||
include mh_exec.dep
|
||||
include mh_text.dep
|
||||
include htmlparse.dep
|
||||
|
||||
@ -8,6 +8,7 @@ SRCS="${depth}/utils/conftree.cpp ${depth}/index/csguess.cpp \
|
||||
${depth}/utils/idfile.cpp ${depth}/utils/md5.cpp \
|
||||
${depth}/utils/wipedir.cpp ${depth}/utils/fstreewalk.cpp \
|
||||
${depth}/common/mh_html.cpp ${depth}/common/mh_mail.cpp \
|
||||
${depth}/common/searchdata.cpp \
|
||||
${depth}/common/mh_exec.cpp ${depth}/common/mh_text.cpp \
|
||||
${depth}/common/htmlparse.cpp ${depth}/index/indexer.cpp \
|
||||
${depth}/common/internfile.cpp ${depth}/common/mimehandler.cpp \
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.90 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.91 2006-11-13 08:49:44 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -174,6 +174,229 @@ bool Native::subDocs(const string &hash, vector<Xapian::docid>& docids)
|
||||
return false;
|
||||
}
|
||||
|
||||
bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
|
||||
int qopts,
|
||||
Xapian::docid docid, const list<string>& terms)
|
||||
{
|
||||
LOGDEB1(("Db::dbDataToRclDoc: opts %x data: %s\n", qopts, data.c_str()));
|
||||
ConfSimple parms(&data);
|
||||
if (!parms.ok())
|
||||
return false;
|
||||
parms.get(string("url"), doc.url);
|
||||
parms.get(string("mtype"), doc.mimetype);
|
||||
parms.get(string("fmtime"), doc.fmtime);
|
||||
parms.get(string("dmtime"), doc.dmtime);
|
||||
parms.get(string("origcharset"), doc.origcharset);
|
||||
parms.get(string("caption"), doc.title);
|
||||
parms.get(string("keywords"), doc.keywords);
|
||||
parms.get(string("abstract"), doc.abstract);
|
||||
// Possibly remove synthetic abstract indicator (if it's there, we
|
||||
// used to index the beginning of the text as abstract).
|
||||
bool syntabs = false;
|
||||
if (doc.abstract.find(rclSyntAbs) == 0) {
|
||||
doc.abstract = doc.abstract.substr(rclSyntAbs.length());
|
||||
syntabs = true;
|
||||
}
|
||||
// If the option is set and the abstract is synthetic or empty , build
|
||||
// abstract from position data.
|
||||
if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
|
||||
LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
|
||||
if (doc.abstract.empty() || syntabs ||
|
||||
(qopts & Db::QO_REPLACE_ABSTRACT))
|
||||
doc.abstract = makeAbstract(docid, terms);
|
||||
}
|
||||
parms.get(string("ipath"), doc.ipath);
|
||||
parms.get(string("fbytes"), doc.fbytes);
|
||||
parms.get(string("dbytes"), doc.dbytes);
|
||||
doc.xdocid = docid;
|
||||
return true;
|
||||
}
|
||||
|
||||
// We build a possibly full size but sparsely populated (only around
|
||||
// the search term occurrences) reconstruction of the document. It
|
||||
// would be possible to compress the array, by having only multiple
|
||||
// chunks around the terms, but this would seriously complicate the
|
||||
// data structure.
|
||||
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
||||
{
|
||||
LOGDEB(("Native::makeAbstract: maxlen %d wWidth %d\n",
|
||||
m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
|
||||
|
||||
Chrono chron;
|
||||
|
||||
// For each of the query terms, query xapian for its positions
|
||||
// list in the document. For each position entry, remember it in qtermposs
|
||||
// and insert it and its neighbours in the set of 'interesting' positions
|
||||
|
||||
// The terms 'array' that we partially populate with the document
|
||||
// terms, at their positions around the search terms positions:
|
||||
map<unsigned int, string> sparseDoc;
|
||||
|
||||
// All the query term positions. We remember this mainly because we are
|
||||
// going to random-shuffle it for selecting the chunks that we actually
|
||||
// print.
|
||||
vector<unsigned int> qtermposs;
|
||||
|
||||
// Limit the total number of slots we populate.
|
||||
const unsigned int maxtotaloccs = 300;
|
||||
// Max occurrences per term. We initially know nothing about the
|
||||
// occurrences repartition (it would be possible that only one
|
||||
// term in the list occurs, or that all do). So this is a rather
|
||||
// arbitrary choice.
|
||||
const unsigned int maxoccperterm = maxtotaloccs / 10;
|
||||
unsigned int totaloccs = 0;
|
||||
|
||||
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
|
||||
qit++) {
|
||||
Xapian::PositionIterator pos;
|
||||
// There may be query terms not in this doc. This raises an
|
||||
// exception when requesting the position list, we catch it.
|
||||
string emptys;
|
||||
try {
|
||||
unsigned int occurrences = 0;
|
||||
for (pos = db.positionlist_begin(docid, *qit);
|
||||
pos != db.positionlist_end(docid, *qit); pos++) {
|
||||
unsigned int ipos = *pos;
|
||||
LOGDEB2(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
||||
// Remember the term position
|
||||
qtermposs.push_back(ipos);
|
||||
// Add adjacent slots to the set to populate at next step
|
||||
unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
||||
unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
|
||||
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||
if (ii == ipos)
|
||||
sparseDoc[ii] = *qit;
|
||||
else
|
||||
sparseDoc[ii] = emptys;
|
||||
}
|
||||
// Limit the number of occurences we keep for each
|
||||
// term. The abstract has a finite length anyway !
|
||||
if (occurrences++ > maxoccperterm)
|
||||
break;
|
||||
}
|
||||
} catch (...) {
|
||||
// Term does not occur. No problem.
|
||||
}
|
||||
// Limit total size
|
||||
if (totaloccs++ > maxtotaloccs)
|
||||
break;
|
||||
}
|
||||
|
||||
LOGDEB(("Abstract:%d:chosen number of positions %d. Populating\n",
|
||||
chron.millis(), qtermposs.size()));
|
||||
|
||||
// Walk the full document position list (for each term walk
|
||||
// position list) and populate slots around the query terms. We
|
||||
// arbitrarily truncate the list to avoid taking forever. If we do
|
||||
// cutoff, the abstract may be inconsistant, which is bad...
|
||||
{
|
||||
Xapian::TermIterator term;
|
||||
int cutoff = 500 * 1000;
|
||||
|
||||
for (term = db.termlist_begin(docid);
|
||||
term != db.termlist_end(docid); term++) {
|
||||
if (cutoff-- < 0) {
|
||||
LOGDEB(("Abstract: max term count cutoff\n"));
|
||||
break;
|
||||
}
|
||||
|
||||
Xapian::PositionIterator pos;
|
||||
for (pos = db.positionlist_begin(docid, *term);
|
||||
pos != db.positionlist_end(docid, *term); pos++) {
|
||||
if (cutoff-- < 0) {
|
||||
LOGDEB(("Abstract: max term count cutoff\n"));
|
||||
break;
|
||||
}
|
||||
map<unsigned int, string>::iterator vit;
|
||||
if ((vit=sparseDoc.find(*pos)) != sparseDoc.end()) {
|
||||
// Don't replace a term: the terms list is in
|
||||
// alphabetic order, and we may have several terms
|
||||
// at the same position, we want to keep only the
|
||||
// first one (ie: dockes and dockes@wanadoo.fr)
|
||||
if (vit->second.empty()) {
|
||||
LOGDEB2(("Abstract: populating: [%s] at %d\n",
|
||||
(*term).c_str(), *pos));
|
||||
sparseDoc[*pos] = *term;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
// Debug only: output the full term[position] vector
|
||||
bool epty = false;
|
||||
int ipos = 0;
|
||||
for (map<unsigned int, string>::iterator it = sparseDoc.begin();
|
||||
it != sparseDoc.end();
|
||||
it++, ipos++) {
|
||||
if (it->empty()) {
|
||||
if (!epty)
|
||||
LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
|
||||
epty=true;
|
||||
} else {
|
||||
epty = false;
|
||||
LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
LOGDEB(("Abstract:%d: randomizing and extracting\n", chron.millis()));
|
||||
|
||||
// We randomize the selection of term positions, from which we
|
||||
// shall pull, starting at the beginning, until the abstract is
|
||||
// big enough. The abstract is finally built in correct position
|
||||
// order, thanks to the position map.
|
||||
random_shuffle(qtermposs.begin(), qtermposs.end());
|
||||
map<unsigned int, string> mabs;
|
||||
unsigned int abslen = 0;
|
||||
|
||||
// Extract data around the N first (in random order) query term
|
||||
// positions, and store the terms in the map. Don't concatenate
|
||||
// immediately into chunks because there might be overlaps
|
||||
for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
|
||||
pos != qtermposs.end(); pos++) {
|
||||
|
||||
if (int(abslen) > m_db->m_synthAbsLen)
|
||||
break;
|
||||
|
||||
unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
|
||||
unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
|
||||
|
||||
LOGDEB2(("Abstract: %d<-%d->%d\n", sta, *pos, sto));
|
||||
|
||||
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||
|
||||
if (int(abslen) > m_db->m_synthAbsLen)
|
||||
break;
|
||||
map<unsigned int, string>::const_iterator vit =
|
||||
sparseDoc.find(ii);
|
||||
if (vit != sparseDoc.end() && !vit->second.empty()) {
|
||||
LOGDEB2(("Abstract: position %d -> [%s]\n",
|
||||
ii, vit->second.c_str()));
|
||||
mabs[ii] = vit->second;
|
||||
abslen += vit->second.length();
|
||||
} else {
|
||||
LOGDEB2(("Abstract: empty position at %d\n", ii));
|
||||
}
|
||||
}
|
||||
|
||||
// Possibly add a ... at the end of chunk if it's not
|
||||
// overlapping
|
||||
if (mabs.find(sto+1) == mabs.end())
|
||||
mabs[sto+1] = "...";
|
||||
}
|
||||
|
||||
// Build the abstract by walking the map (in order of position)
|
||||
string abstract;
|
||||
for (map<unsigned int, string>::const_iterator it = mabs.begin();
|
||||
it != mabs.end(); it++) {
|
||||
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
||||
abstract += it->second + " ";
|
||||
}
|
||||
LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
|
||||
return abstract;
|
||||
}
|
||||
|
||||
/* Rcl::Db methods ///////////////////////////////// */
|
||||
|
||||
@ -909,279 +1132,67 @@ bool Db::purgeFile(const string &fn)
|
||||
return false;
|
||||
}
|
||||
|
||||
// Splitter callback for breaking query into terms
|
||||
class wsQData : public TextSplitCB {
|
||||
public:
|
||||
vector<string> terms;
|
||||
string catterms() {
|
||||
string s;
|
||||
for (unsigned int i=0;i<terms.size();i++) {
|
||||
s += "[" + terms[i] + "] ";
|
||||
}
|
||||
return s;
|
||||
}
|
||||
bool takeword(const std::string &term, int , int, int) {
|
||||
LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
|
||||
terms.push_back(term);
|
||||
return true;
|
||||
}
|
||||
void dumball() {
|
||||
for (vector<string>::iterator it=terms.begin(); it !=terms.end();it++){
|
||||
string dumb;
|
||||
dumb_string(*it, dumb);
|
||||
*it = dumb;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Turn string into list of xapian queries. There is little
|
||||
// interpretation done on the string (no +term -term or filename:term
|
||||
// stuff). We just separate words and phrases, and interpret
|
||||
// capitalized terms as wanting no stem expansion.
|
||||
// The final list contains one query for each term or phrase
|
||||
// - Elements corresponding to a stem-expanded part are an OP_OR
|
||||
// composition of the stem-expanded terms (or a single term query).
|
||||
// - Elements corresponding to a phrase are an OP_PHRASE composition of the
|
||||
// phrase terms (no stem expansion in this case)
|
||||
static void stringToXapianQueries(const string &iq,
|
||||
const string& stemlang,
|
||||
Db *db,
|
||||
list<Xapian::Query> &pqueries,
|
||||
unsigned int opts = Db::QO_NONE)
|
||||
bool Db::filenameWildExp(const string& fnexp, list<string>& names)
|
||||
{
|
||||
string qstring = iq;
|
||||
// File name search, with possible wildcards.
|
||||
// We expand wildcards by scanning the filename terms (prefixed
|
||||
// with XSFN) from the database.
|
||||
// We build an OR query with the expanded values if any.
|
||||
string pattern;
|
||||
dumb_string(fnexp, pattern);
|
||||
|
||||
// Split into (possibly single word) phrases ("this is a phrase"):
|
||||
list<string> phrases;
|
||||
stringToStrings(qstring, phrases);
|
||||
// If pattern is not quoted, and has no wildcards, we add * at
|
||||
// each end: match any substring
|
||||
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
|
||||
pattern = pattern.substr(1, pattern.size() -2);
|
||||
} else if (pattern.find_first_of("*?[") == string::npos) {
|
||||
pattern = "*" + pattern + "*";
|
||||
} // else let it be
|
||||
|
||||
// Then process each phrase: split into terms and transform into
|
||||
// appropriate Xapian Query
|
||||
LOGDEB((" pattern: [%s]\n", pattern.c_str()));
|
||||
|
||||
for (list<string>::iterator it=phrases.begin(); it !=phrases.end(); it++) {
|
||||
LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
|
||||
|
||||
// If there are both spans and single words in this element,
|
||||
// we need to use a word split, else a phrase query including
|
||||
// a span would fail if we didn't adjust the proximity to
|
||||
// account for the additional span term which is complicated.
|
||||
wsQData splitDataS, splitDataW;
|
||||
TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS);
|
||||
splitterS.text_to_words(*it);
|
||||
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
|
||||
splitterW.text_to_words(*it);
|
||||
wsQData& splitData = splitDataS;
|
||||
if (splitDataS.terms.size() > 1 && splitDataS.terms.size() !=
|
||||
splitDataW.terms.size())
|
||||
splitData = splitDataW;
|
||||
|
||||
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
|
||||
splitData.terms.size()));
|
||||
switch(splitData.terms.size()) {
|
||||
case 0: continue;// ??
|
||||
case 1: // Not a real phrase: one term
|
||||
{
|
||||
string term = splitData.terms.front();
|
||||
bool nostemexp = false;
|
||||
// Check if the first letter is a majuscule in which
|
||||
// case we do not want to do stem expansion. Note that
|
||||
// the test is convoluted and possibly problematic
|
||||
if (term.length() > 0) {
|
||||
string noacterm,noaclowterm;
|
||||
if (unacmaybefold(term, noacterm, "UTF-8", false) &&
|
||||
unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
||||
Utf8Iter it1(noacterm);
|
||||
Utf8Iter it2(noaclowterm);
|
||||
if (*it1 != *it2)
|
||||
nostemexp = true;
|
||||
}
|
||||
}
|
||||
LOGDEB1(("Term: %s stem expansion: %s\n",
|
||||
term.c_str(), nostemexp?"no":"yes"));
|
||||
|
||||
list<string> exp;
|
||||
string term1;
|
||||
dumb_string(term, term1);
|
||||
// Possibly perform stem compression/expansion
|
||||
if (!nostemexp && (opts & Db::QO_STEM)) {
|
||||
exp = db->stemExpand(stemlang, term1);
|
||||
} else {
|
||||
exp.push_back(term1);
|
||||
}
|
||||
|
||||
// Push either term or OR of stem-expanded set
|
||||
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||
exp.begin(), exp.end()));
|
||||
}
|
||||
// Match pattern against all file names in the db
|
||||
Xapian::TermIterator it = m_ndb->db.allterms_begin();
|
||||
it.skip_to("XSFN");
|
||||
for (;it != m_ndb->db.allterms_end(); it++) {
|
||||
if ((*it).find("XSFN") != 0)
|
||||
break;
|
||||
string fn = (*it).substr(4);
|
||||
LOGDEB2(("Matching [%s] and [%s]\n", pattern.c_str(), fn.c_str()));
|
||||
if (fnmatch(pattern.c_str(), fn.c_str(), 0) != FNM_NOMATCH) {
|
||||
names.push_back((*it).c_str());
|
||||
}
|
||||
// Limit the match count
|
||||
if (names.size() > 1000) {
|
||||
LOGERR(("Db::SetQuery: too many matched file names\n"));
|
||||
break;
|
||||
|
||||
default:
|
||||
// Phrase: no stem expansion
|
||||
splitData.dumball();
|
||||
LOGDEB(("Pushing phrase: [%s]\n", splitData.catterms().c_str()));
|
||||
pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
|
||||
splitData.terms.begin(),
|
||||
splitData.terms.end()));
|
||||
}
|
||||
}
|
||||
if (names.empty()) {
|
||||
// Build an impossible query: we know its impossible because we
|
||||
// control the prefixes!
|
||||
names.push_back("XIMPOSSIBLE");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Prepare query out of "advanced search" data
|
||||
bool Db::setQuery(AdvSearchData &sdata, int opts, const string& stemlang)
|
||||
bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
|
||||
const string& stemlang)
|
||||
{
|
||||
LOGDEB(("Db::setQuery: adv:\n"));
|
||||
LOGDEB((" allwords: %s\n", sdata.allwords.c_str()));
|
||||
LOGDEB((" phrase: %s\n", sdata.phrase.c_str()));
|
||||
LOGDEB((" orwords: %s\n", sdata.orwords.c_str()));
|
||||
LOGDEB((" orwords1: %s\n", sdata.orwords1.c_str()));
|
||||
LOGDEB((" nowords: %s\n", sdata.nowords.c_str()));
|
||||
LOGDEB((" filename: %s\n", sdata.filename.c_str()));
|
||||
|
||||
string ft;
|
||||
for (list<string>::iterator it = sdata.filetypes.begin();
|
||||
it != sdata.filetypes.end(); it++) {ft += *it + " ";}
|
||||
if (!ft.empty())
|
||||
LOGDEB((" searched file types: %s\n", ft.c_str()));
|
||||
if (!sdata.topdir.empty())
|
||||
LOGDEB((" restricted to: %s\n", sdata.topdir.c_str()));
|
||||
LOGDEB((" Options: 0x%x\n", opts));
|
||||
|
||||
m_filterTopDir = sdata.topdir;
|
||||
m_dbindices.clear();
|
||||
|
||||
if (!m_ndb)
|
||||
if (!m_ndb) {
|
||||
LOGERR(("Db::setQuery: no db!\n"));
|
||||
return false;
|
||||
list<Xapian::Query> pqueries;
|
||||
Xapian::Query xq;
|
||||
}
|
||||
|
||||
LOGDEB(("Db::setQuery:\n"));
|
||||
|
||||
m_filterTopDir = sdata->m_topdir;
|
||||
m_dbindices.clear();
|
||||
m_qOpts = opts;
|
||||
|
||||
if (!sdata.filename.empty()) {
|
||||
LOGDEB((" filename search\n"));
|
||||
// File name search, with possible wildcards.
|
||||
// We expand wildcards by scanning the filename terms (prefixed
|
||||
// with XSFN) from the database.
|
||||
// We build an OR query with the expanded values if any.
|
||||
string pattern;
|
||||
dumb_string(sdata.filename, pattern);
|
||||
|
||||
// If pattern is not quoted, and has no wildcards, we add * at
|
||||
// each end: match any substring
|
||||
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
|
||||
pattern = pattern.substr(1, pattern.size() -2);
|
||||
} else if (pattern.find_first_of("*?[") == string::npos) {
|
||||
pattern = "*" + pattern + "*";
|
||||
} // else let it be
|
||||
|
||||
LOGDEB((" pattern: [%s]\n", pattern.c_str()));
|
||||
|
||||
// Match pattern against all file names in the db
|
||||
Xapian::TermIterator it = m_ndb->db.allterms_begin();
|
||||
it.skip_to("XSFN");
|
||||
list<string> names;
|
||||
for (;it != m_ndb->db.allterms_end(); it++) {
|
||||
if ((*it).find("XSFN") != 0)
|
||||
break;
|
||||
string fn = (*it).substr(4);
|
||||
LOGDEB2(("Matching [%s] and [%s]\n", pattern.c_str(), fn.c_str()));
|
||||
if (fnmatch(pattern.c_str(), fn.c_str(), 0) != FNM_NOMATCH) {
|
||||
names.push_back((*it).c_str());
|
||||
}
|
||||
// Limit the match count
|
||||
if (names.size() > 1000) {
|
||||
LOGERR(("Db::SetQuery: too many matched file names\n"));
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (names.empty()) {
|
||||
// Build an impossible query: we know its impossible because we
|
||||
// control the prefixes!
|
||||
names.push_back("XIMPOSSIBLE");
|
||||
}
|
||||
// Build a query out of the matching file name terms.
|
||||
xq = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
|
||||
}
|
||||
|
||||
if (!sdata.allwords.empty()) {
|
||||
stringToXapianQueries(sdata.allwords, stemlang, this,pqueries,m_qOpts);
|
||||
if (!pqueries.empty()) {
|
||||
Xapian::Query nq =
|
||||
Xapian::Query(Xapian::Query::OP_AND, pqueries.begin(),
|
||||
pqueries.end());
|
||||
xq = xq.empty() ? nq :
|
||||
Xapian::Query(Xapian::Query::OP_AND, xq, nq);
|
||||
pqueries.clear();
|
||||
}
|
||||
}
|
||||
|
||||
if (!sdata.orwords.empty()) {
|
||||
stringToXapianQueries(sdata.orwords, stemlang, this,pqueries,m_qOpts);
|
||||
if (!pqueries.empty()) {
|
||||
Xapian::Query nq =
|
||||
Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(),
|
||||
pqueries.end());
|
||||
xq = xq.empty() ? nq :
|
||||
Xapian::Query(Xapian::Query::OP_AND, xq, nq);
|
||||
pqueries.clear();
|
||||
}
|
||||
}
|
||||
|
||||
if (!sdata.orwords1.empty()) {
|
||||
stringToXapianQueries(sdata.orwords1, stemlang, this,pqueries,m_qOpts);
|
||||
if (!pqueries.empty()) {
|
||||
Xapian::Query nq =
|
||||
Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(),
|
||||
pqueries.end());
|
||||
xq = xq.empty() ? nq :
|
||||
Xapian::Query(Xapian::Query::OP_AND, xq, nq);
|
||||
pqueries.clear();
|
||||
}
|
||||
}
|
||||
|
||||
if (!sdata.phrase.empty()) {
|
||||
Xapian::Query nq;
|
||||
string s = string("\"") + sdata.phrase + string("\"");
|
||||
stringToXapianQueries(s, stemlang, this, pqueries);
|
||||
if (!pqueries.empty()) {
|
||||
// There should be a single list element phrase query.
|
||||
xq = xq.empty() ? *pqueries.begin() :
|
||||
Xapian::Query(Xapian::Query::OP_AND, xq, *pqueries.begin());
|
||||
pqueries.clear();
|
||||
}
|
||||
}
|
||||
|
||||
if (!sdata.filetypes.empty()) {
|
||||
Xapian::Query tq;
|
||||
for (list<string>::iterator it = sdata.filetypes.begin();
|
||||
it != sdata.filetypes.end(); it++) {
|
||||
string term = "T" + *it;
|
||||
LOGDEB(("Adding file type term: [%s]\n", term.c_str()));
|
||||
tq = tq.empty() ? Xapian::Query(term) :
|
||||
Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
|
||||
}
|
||||
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
|
||||
}
|
||||
|
||||
// "And not" part. Must come last, as we have to check it's not
|
||||
// the only term in the query. We do no stem expansion on 'No'
|
||||
// words. Should we ?
|
||||
if (!sdata.nowords.empty()) {
|
||||
stringToXapianQueries(sdata.nowords, stemlang, this, pqueries);
|
||||
if (!pqueries.empty()) {
|
||||
Xapian::Query nq;
|
||||
nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(),
|
||||
pqueries.end());
|
||||
if (xq.empty()) {
|
||||
// Xapian cant do this currently. Have to have a positive
|
||||
// part!
|
||||
sdata.description = "Error: pure negative query\n";
|
||||
LOGERR(("Rcl::Db::setQuery: error: pure negative query\n"));
|
||||
return false;
|
||||
}
|
||||
xq = Xapian::Query(Xapian::Query::OP_AND_NOT, xq, nq);
|
||||
pqueries.clear();
|
||||
}
|
||||
}
|
||||
Xapian::Query xq;
|
||||
sdata->toNativeQuery(*this, &xq, (opts & Db::QO_STEM) ? stemlang : "");
|
||||
|
||||
m_ndb->query = xq;
|
||||
delete m_ndb->enquire;
|
||||
@ -1189,10 +1200,11 @@ bool Db::setQuery(AdvSearchData &sdata, int opts, const string& stemlang)
|
||||
m_ndb->enquire->set_query(m_ndb->query);
|
||||
m_ndb->mset = Xapian::MSet();
|
||||
// Get the query description and trim the "Xapian::Query"
|
||||
sdata.description = m_ndb->query.get_description();
|
||||
if (sdata.description.find("Xapian::Query") == 0)
|
||||
sdata.description = sdata.description.substr(strlen("Xapian::Query"));
|
||||
LOGDEB(("Db::SetQuery: Q: %s\n", sdata.description.c_str()));
|
||||
sdata->m_description = m_ndb->query.get_description();
|
||||
if (sdata->m_description.find("Xapian::Query") == 0)
|
||||
sdata->m_description =
|
||||
sdata->m_description.substr(strlen("Xapian::Query"));
|
||||
LOGDEB(("Db::SetQuery: Q: %s\n", sdata->m_description.c_str()));
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1422,43 +1434,6 @@ int Db::getResCnt()
|
||||
return m_ndb->mset.get_matches_lower_bound();
|
||||
}
|
||||
|
||||
bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
|
||||
int qopts,
|
||||
Xapian::docid docid, const list<string>& terms)
|
||||
{
|
||||
LOGDEB1(("Db::dbDataToRclDoc: opts %x data: %s\n", qopts, data.c_str()));
|
||||
ConfSimple parms(&data);
|
||||
if (!parms.ok())
|
||||
return false;
|
||||
parms.get(string("url"), doc.url);
|
||||
parms.get(string("mtype"), doc.mimetype);
|
||||
parms.get(string("fmtime"), doc.fmtime);
|
||||
parms.get(string("dmtime"), doc.dmtime);
|
||||
parms.get(string("origcharset"), doc.origcharset);
|
||||
parms.get(string("caption"), doc.title);
|
||||
parms.get(string("keywords"), doc.keywords);
|
||||
parms.get(string("abstract"), doc.abstract);
|
||||
// Possibly remove synthetic abstract indicator (if it's there, we
|
||||
// used to index the beginning of the text as abstract).
|
||||
bool syntabs = false;
|
||||
if (doc.abstract.find(rclSyntAbs) == 0) {
|
||||
doc.abstract = doc.abstract.substr(rclSyntAbs.length());
|
||||
syntabs = true;
|
||||
}
|
||||
// If the option is set and the abstract is synthetic or empty , build
|
||||
// abstract from position data.
|
||||
if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
|
||||
LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
|
||||
if (doc.abstract.empty() || syntabs ||
|
||||
(qopts & Db::QO_REPLACE_ABSTRACT))
|
||||
doc.abstract = makeAbstract(docid, terms);
|
||||
}
|
||||
parms.get(string("ipath"), doc.ipath);
|
||||
parms.get(string("fbytes"), doc.fbytes);
|
||||
parms.get(string("dbytes"), doc.dbytes);
|
||||
doc.xdocid = docid;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Get document at rank i in query (i is the index in the whole result
|
||||
// set, as in the enquire class. We check if the current mset has the
|
||||
@ -1641,191 +1616,6 @@ list<string> Db::expand(const Doc &doc)
|
||||
}
|
||||
|
||||
|
||||
// We build a possibly full size but sparsely populated (only around
|
||||
// the search term occurrences) reconstruction of the document. It
|
||||
// would be possible to compress the array, by having only multiple
|
||||
// chunks around the terms, but this would seriously complicate the
|
||||
// data structure.
|
||||
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
||||
{
|
||||
LOGDEB(("Native::makeAbstract: maxlen %d wWidth %d\n",
|
||||
m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
|
||||
|
||||
Chrono chron;
|
||||
|
||||
// For each of the query terms, query xapian for its positions
|
||||
// list in the document. For each position entry, remember it in qtermposs
|
||||
// and insert it and its neighbours in the set of 'interesting' positions
|
||||
|
||||
// The terms 'array' that we partially populate with the document
|
||||
// terms, at their positions around the search terms positions:
|
||||
map<unsigned int, string> sparseDoc;
|
||||
|
||||
// All the query term positions. We remember this mainly because we are
|
||||
// going to random-shuffle it for selecting the chunks that we actually
|
||||
// print.
|
||||
vector<unsigned int> qtermposs;
|
||||
|
||||
// Limit the total number of slots we populate.
|
||||
const unsigned int maxtotaloccs = 300;
|
||||
// Max occurrences per term. We initially know nothing about the
|
||||
// occurrences repartition (it would be possible that only one
|
||||
// term in the list occurs, or that all do). So this is a rather
|
||||
// arbitrary choice.
|
||||
const unsigned int maxoccperterm = maxtotaloccs / 10;
|
||||
unsigned int totaloccs = 0;
|
||||
|
||||
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
|
||||
qit++) {
|
||||
Xapian::PositionIterator pos;
|
||||
// There may be query terms not in this doc. This raises an
|
||||
// exception when requesting the position list, we catch it.
|
||||
string emptys;
|
||||
try {
|
||||
unsigned int occurrences = 0;
|
||||
for (pos = db.positionlist_begin(docid, *qit);
|
||||
pos != db.positionlist_end(docid, *qit); pos++) {
|
||||
unsigned int ipos = *pos;
|
||||
LOGDEB2(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
||||
// Remember the term position
|
||||
qtermposs.push_back(ipos);
|
||||
// Add adjacent slots to the set to populate at next step
|
||||
unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
||||
unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
|
||||
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||
if (ii == ipos)
|
||||
sparseDoc[ii] = *qit;
|
||||
else
|
||||
sparseDoc[ii] = emptys;
|
||||
}
|
||||
// Limit the number of occurences we keep for each
|
||||
// term. The abstract has a finite length anyway !
|
||||
if (occurrences++ > maxoccperterm)
|
||||
break;
|
||||
}
|
||||
} catch (...) {
|
||||
// Term does not occur. No problem.
|
||||
}
|
||||
// Limit total size
|
||||
if (totaloccs++ > maxtotaloccs)
|
||||
break;
|
||||
}
|
||||
|
||||
LOGDEB(("Abstract:%d:chosen number of positions %d. Populating\n",
|
||||
chron.millis(), qtermposs.size()));
|
||||
|
||||
// Walk the full document position list (for each term walk
|
||||
// position list) and populate slots around the query terms. We
|
||||
// arbitrarily truncate the list to avoid taking forever. If we do
|
||||
// cutoff, the abstract may be inconsistant, which is bad...
|
||||
{
|
||||
Xapian::TermIterator term;
|
||||
int cutoff = 500 * 1000;
|
||||
|
||||
for (term = db.termlist_begin(docid);
|
||||
term != db.termlist_end(docid); term++) {
|
||||
if (cutoff-- < 0) {
|
||||
LOGDEB(("Abstract: max term count cutoff\n"));
|
||||
break;
|
||||
}
|
||||
|
||||
Xapian::PositionIterator pos;
|
||||
for (pos = db.positionlist_begin(docid, *term);
|
||||
pos != db.positionlist_end(docid, *term); pos++) {
|
||||
if (cutoff-- < 0) {
|
||||
LOGDEB(("Abstract: max term count cutoff\n"));
|
||||
break;
|
||||
}
|
||||
map<unsigned int, string>::iterator vit;
|
||||
if ((vit=sparseDoc.find(*pos)) != sparseDoc.end()) {
|
||||
// Don't replace a term: the terms list is in
|
||||
// alphabetic order, and we may have several terms
|
||||
// at the same position, we want to keep only the
|
||||
// first one (ie: dockes and dockes@wanadoo.fr)
|
||||
if (vit->second.empty()) {
|
||||
LOGDEB2(("Abstract: populating: [%s] at %d\n",
|
||||
(*term).c_str(), *pos));
|
||||
sparseDoc[*pos] = *term;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
// Debug only: output the full term[position] vector
|
||||
bool epty = false;
|
||||
int ipos = 0;
|
||||
for (map<unsigned int, string>::iterator it = sparseDoc.begin();
|
||||
it != sparseDoc.end();
|
||||
it++, ipos++) {
|
||||
if (it->empty()) {
|
||||
if (!epty)
|
||||
LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
|
||||
epty=true;
|
||||
} else {
|
||||
epty = false;
|
||||
LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
LOGDEB(("Abstract:%d: randomizing and extracting\n", chron.millis()));
|
||||
|
||||
// We randomize the selection of term positions, from which we
|
||||
// shall pull, starting at the beginning, until the abstract is
|
||||
// big enough. The abstract is finally built in correct position
|
||||
// order, thanks to the position map.
|
||||
random_shuffle(qtermposs.begin(), qtermposs.end());
|
||||
map<unsigned int, string> mabs;
|
||||
unsigned int abslen = 0;
|
||||
|
||||
// Extract data around the N first (in random order) query term
|
||||
// positions, and store the terms in the map. Don't concatenate
|
||||
// immediately into chunks because there might be overlaps
|
||||
for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
|
||||
pos != qtermposs.end(); pos++) {
|
||||
|
||||
if (int(abslen) > m_db->m_synthAbsLen)
|
||||
break;
|
||||
|
||||
unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
|
||||
unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
|
||||
|
||||
LOGDEB2(("Abstract: %d<-%d->%d\n", sta, *pos, sto));
|
||||
|
||||
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||
|
||||
if (int(abslen) > m_db->m_synthAbsLen)
|
||||
break;
|
||||
map<unsigned int, string>::const_iterator vit =
|
||||
sparseDoc.find(ii);
|
||||
if (vit != sparseDoc.end() && !vit->second.empty()) {
|
||||
LOGDEB2(("Abstract: position %d -> [%s]\n",
|
||||
ii, vit->second.c_str()));
|
||||
mabs[ii] = vit->second;
|
||||
abslen += vit->second.length();
|
||||
} else {
|
||||
LOGDEB2(("Abstract: empty position at %d\n", ii));
|
||||
}
|
||||
}
|
||||
|
||||
// Possibly add a ... at the end of chunk if it's not
|
||||
// overlapping
|
||||
if (mabs.find(sto+1) == mabs.end())
|
||||
mabs[sto+1] = "...";
|
||||
}
|
||||
|
||||
// Build the abstract by walking the map (in order of position)
|
||||
string abstract;
|
||||
for (map<unsigned int, string>::const_iterator it = mabs.begin();
|
||||
it != mabs.end(); it++) {
|
||||
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
||||
abstract += it->second + " ";
|
||||
}
|
||||
LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
|
||||
return abstract;
|
||||
}
|
||||
#ifndef NO_NAMESPACES
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -16,12 +16,14 @@
|
||||
*/
|
||||
#ifndef _DB_H_INCLUDED_
|
||||
#define _DB_H_INCLUDED_
|
||||
/* @(#$Id: rcldb.h,v 1.40 2006-10-30 12:59:44 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: rcldb.h,v 1.41 2006-11-13 08:49:44 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
|
||||
#include "refcntr.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
using std::list;
|
||||
@ -103,7 +105,7 @@ class Doc {
|
||||
}
|
||||
};
|
||||
|
||||
class AdvSearchData;
|
||||
class SearchData;
|
||||
class Native;
|
||||
class TermIter;
|
||||
|
||||
@ -155,7 +157,7 @@ class Db {
|
||||
/* Query-related functions */
|
||||
|
||||
// Parse query string and initialize query
|
||||
bool setQuery(AdvSearchData &q, int opts = QO_NONE,
|
||||
bool setQuery(RefCntr<SearchData> q, int opts = QO_NONE,
|
||||
const string& stemlang = "english");
|
||||
bool getQueryTerms(list<string>& terms);
|
||||
bool getMatchTerms(const Doc& doc, list<string>& terms);
|
||||
@ -213,6 +215,9 @@ class Db {
|
||||
/** Perform stem expansion across all dbs configured for searching */
|
||||
list<string> stemExpand(const string& lang, const string& term);
|
||||
|
||||
/** Filename wildcard expansion */
|
||||
bool filenameWildExp(const string& exp, list<string>& names);
|
||||
|
||||
private:
|
||||
|
||||
string m_filterTopDir; // Current query filter on subtree top directory
|
||||
@ -248,6 +253,7 @@ private:
|
||||
vector<bool> updated;
|
||||
|
||||
bool reOpen(); // Close/open, same mode/opts
|
||||
|
||||
/* Copyconst and assignemt private and forbidden */
|
||||
Db(const Db &) {}
|
||||
Db & operator=(const Db &) {return *this;};
|
||||
|
||||
299
src/rcldb/searchdata.cpp
Normal file
299
src/rcldb/searchdata.cpp
Normal file
@ -0,0 +1,299 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.1 2006-11-13 08:49:44 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
// Handle translation from rcl's SearchData structures to Xapian Queries
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif
|
||||
|
||||
#include "xapian.h"
|
||||
|
||||
#include "rcldb.h"
|
||||
#include "searchdata.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
#include "textsplit.h"
|
||||
#include "unacpp.h"
|
||||
#include "utf8iter.h"
|
||||
|
||||
namespace Rcl {
|
||||
|
||||
typedef list<SearchDataClause *>::iterator qlist_it_t;
|
||||
|
||||
// Translate the clause list into a single Xapian query tree, then
// apply the file type filter if any.
// @param db      open database (used by clauses for stem expansion etc.)
// @param d       actually points to a Xapian::Query; rcldb knows about this
// @param stemlang stemming language ("" to disable stem expansion)
// @return true (currently always succeeds)
bool SearchData::toNativeQuery(Rcl::Db &db, void *d, const string& stemlang)
{
    Xapian::Query xq;

    // Walk the clause list translating each in turn and building the
    // Xapian query tree
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
	Xapian::Query nq;
	(*it)->toNativeQuery(db, &nq, stemlang);

	// If this structure is an AND list, must use AND_NOT for excl
	// clauses. Else this is an OR list, and there can't be excl
	// clauses (addClause() refuses them).
	Xapian::Query::op op;
	if (m_tp == SCLT_AND) {
	    op = (*it)->m_tp == SCLT_EXCL ? 
		Xapian::Query::OP_AND_NOT : Xapian::Query::OP_AND;
	} else {
	    op = Xapian::Query::OP_OR;
	}
	xq = xq.empty() ? nq : Xapian::Query(op, xq, nq);
    }

    // Add the file type filtering clause if any: build an OR of the
    // type terms and use it as a filter over the main query.
    // (Fixed: removed an unused local "list<Xapian::Query> pqueries".)
    if (!m_filetypes.empty()) {
	Xapian::Query tq;
	for (list<string>::iterator it = m_filetypes.begin(); 
	     it != m_filetypes.end(); it++) {
	    string term = "T" + *it;
	    LOGDEB(("Adding file type term: [%s]\n", term.c_str()));
	    tq = tq.empty() ? Xapian::Query(term) :
		Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
	}
	xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
    }

    *((Xapian::Query *)d) = xq;
    return true;
}
|
||||
|
||||
// Add clause to current list. OR lists cant have EXCL clauses.
|
||||
bool SearchData::addClause(SearchDataClause* cl)
|
||||
{
|
||||
if (m_tp == SCLT_OR && (cl->m_tp == SCLT_EXCL)) {
|
||||
LOGERR(("SearchData::addClause: cant add EXCL to OR list\n"));
|
||||
return false;
|
||||
}
|
||||
m_query.push_back(cl);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Make me all new
|
||||
void SearchData::erase() {
|
||||
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
|
||||
delete *it;
|
||||
m_query.clear();
|
||||
m_filetypes.clear();
|
||||
m_topdir.erase();
|
||||
m_description.erase();
|
||||
}
|
||||
|
||||
// Am I a file name only search ? This is to turn off term highlighting
|
||||
bool SearchData::fileNameOnly() {
|
||||
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
|
||||
if (!(*it)->isFileName())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Splitter callback for breaking a user query string into simple
|
||||
// terms and phrases
|
||||
class wsQData : public TextSplitCB {
|
||||
public:
|
||||
vector<string> terms;
|
||||
// Debug
|
||||
string catterms() {
|
||||
string s;
|
||||
for (unsigned int i = 0; i < terms.size(); i++) {
|
||||
s += "[" + terms[i] + "] ";
|
||||
}
|
||||
return s;
|
||||
}
|
||||
bool takeword(const std::string &term, int , int, int) {
|
||||
LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
|
||||
terms.push_back(term);
|
||||
return true;
|
||||
}
|
||||
// Decapital + deaccent all terms
|
||||
void dumball() {
|
||||
for (vector<string>::iterator it=terms.begin(); it !=terms.end();it++){
|
||||
string dumb;
|
||||
dumb_string(*it, dumb);
|
||||
*it = dumb;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Turn string into list of xapian queries. There is little
|
||||
// interpretation done on the string (no +term -term or filename:term
|
||||
// stuff). We just separate words and phrases, and interpret
|
||||
// capitalized terms as wanting no stem expansion.
|
||||
// The final list contains one query for each term or phrase
|
||||
// - Elements corresponding to a stem-expanded part are an OP_OR
|
||||
// composition of the stem-expanded terms (or a single term query).
|
||||
// - Elements corresponding to a phrase are an OP_PHRASE composition of the
|
||||
// phrase terms (no stem expansion in this case)
|
||||
static void stringToXapianQueries(const string &iq,
|
||||
const string& stemlang,
|
||||
Db& db,
|
||||
list<Xapian::Query> &pqueries)
|
||||
{
|
||||
string qstring = iq;
|
||||
bool opt_stemexp = !stemlang.empty();
|
||||
|
||||
// Split into (possibly single word) phrases ("this is a phrase"):
|
||||
list<string> phrases;
|
||||
stringToStrings(qstring, phrases);
|
||||
|
||||
// Then process each phrase: split into terms and transform into
|
||||
// appropriate Xapian Query
|
||||
|
||||
for (list<string>::iterator it=phrases.begin(); it !=phrases.end(); it++) {
|
||||
LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
|
||||
|
||||
// If there are both spans and single words in this element,
|
||||
// we need to use a word split, else a phrase query including
|
||||
// a span would fail if we didn't adjust the proximity to
|
||||
// account for the additional span term which is complicated.
|
||||
wsQData splitDataS, splitDataW;
|
||||
TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS);
|
||||
splitterS.text_to_words(*it);
|
||||
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
|
||||
splitterW.text_to_words(*it);
|
||||
wsQData& splitData = splitDataS;
|
||||
if (splitDataS.terms.size() > 1 && splitDataS.terms.size() !=
|
||||
splitDataW.terms.size())
|
||||
splitData = splitDataW;
|
||||
|
||||
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
|
||||
splitData.terms.size()));
|
||||
switch(splitData.terms.size()) {
|
||||
case 0: continue;// ??
|
||||
case 1: // Not a real phrase: one term
|
||||
{
|
||||
string term = splitData.terms.front();
|
||||
bool nostemexp = false;
|
||||
// Check if the first letter is a majuscule in which
|
||||
// case we do not want to do stem expansion. Note that
|
||||
// the test is convoluted and possibly problematic
|
||||
if (term.length() > 0) {
|
||||
string noacterm,noaclowterm;
|
||||
if (unacmaybefold(term, noacterm, "UTF-8", false) &&
|
||||
unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
||||
Utf8Iter it1(noacterm);
|
||||
Utf8Iter it2(noaclowterm);
|
||||
if (*it1 != *it2)
|
||||
nostemexp = true;
|
||||
}
|
||||
}
|
||||
LOGDEB1(("Term: %s stem expansion: %s\n",
|
||||
term.c_str(), nostemexp?"no":"yes"));
|
||||
|
||||
list<string> exp;
|
||||
string term1;
|
||||
dumb_string(term, term1);
|
||||
// Possibly perform stem compression/expansion
|
||||
if (!nostemexp && opt_stemexp) {
|
||||
exp = db.stemExpand(stemlang, term1);
|
||||
} else {
|
||||
exp.push_back(term1);
|
||||
}
|
||||
|
||||
// Push either term or OR of stem-expanded set
|
||||
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||
exp.begin(), exp.end()));
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
// Phrase: no stem expansion
|
||||
splitData.dumball();
|
||||
LOGDEB(("Pushing phrase: [%s]\n", splitData.catterms().c_str()));
|
||||
pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
|
||||
splitData.terms.begin(),
|
||||
splitData.terms.end()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Translate a simple OR, AND, or EXCL search clause.
|
||||
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
const string& stemlang)
|
||||
{
|
||||
Xapian::Query *qp = (Xapian::Query *)p;
|
||||
*qp = Xapian::Query();
|
||||
|
||||
Xapian::Query::op op;
|
||||
switch (m_tp) {
|
||||
case SCLT_AND: op = Xapian::Query::OP_AND; break;
|
||||
case SCLT_OR:
|
||||
case SCLT_EXCL: op = Xapian::Query::OP_OR; break;
|
||||
default:
|
||||
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
||||
return false;
|
||||
}
|
||||
list<Xapian::Query> pqueries;
|
||||
stringToXapianQueries(m_text, stemlang, db, pqueries);
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||
return true;
|
||||
}
|
||||
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Translate a FILENAME search clause.
|
||||
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
||||
const string& stemlang)
|
||||
{
|
||||
Xapian::Query *qp = (Xapian::Query *)p;
|
||||
*qp = Xapian::Query();
|
||||
|
||||
list<string> names;
|
||||
db.filenameWildExp(m_text, names);
|
||||
// Build a query out of the matching file name terms.
|
||||
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Translate NEAR or PHRASE clause. We're not handling the distance parameter
|
||||
// yet.
|
||||
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
||||
const string& stemlang)
|
||||
{
|
||||
Xapian::Query *qp = (Xapian::Query *)p;
|
||||
*qp = Xapian::Query();
|
||||
|
||||
Xapian::Query::op op = m_tp == SCLT_PHRASE ? Xapian::Query::OP_PHRASE :
|
||||
Xapian::Query::OP_NEAR;
|
||||
|
||||
list<Xapian::Query> pqueries;
|
||||
Xapian::Query nq;
|
||||
string s = string("\"") + m_text + string("\"");
|
||||
|
||||
// Use stringToXapianQueries anyway to lowercase and simplify the
|
||||
// phrase terms etc. The result should be a single element list
|
||||
stringToXapianQueries(s, stemlang, db, pqueries);
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
||||
return true;
|
||||
}
|
||||
*qp = *pqueries.begin();
|
||||
return true;
|
||||
}
|
||||
|
||||
} // Namespace Rcl
|
||||
@ -1,40 +1,112 @@
|
||||
#ifndef _SEARCHDATA_H_INCLUDED_
|
||||
#define _SEARCHDATA_H_INCLUDED_
|
||||
/* @(#$Id: searchdata.h,v 1.2 2006-04-22 06:27:37 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: searchdata.h,v 1.3 2006-11-13 08:49:45 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
|
||||
#include "rcldb.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::list;
|
||||
using std::string;
|
||||
#endif
|
||||
|
||||
namespace Rcl {
|
||||
/**
|
||||
* Holder for query data
|
||||
*/
|
||||
class AdvSearchData {
|
||||
public:
|
||||
string allwords;
|
||||
string phrase;
|
||||
string orwords;
|
||||
string orwords1; // Have two instances of orwords for and'ing them
|
||||
string nowords;
|
||||
string filename;
|
||||
list<string> filetypes; // restrict to types. Empty if inactive
|
||||
string topdir; // restrict to subtree. Empty if inactive
|
||||
string description; // Printable expanded version of the complete query
|
||||
// returned after setQuery.
|
||||
void erase() {
|
||||
allwords.erase();
|
||||
phrase.erase();
|
||||
orwords.erase();
|
||||
orwords1.erase();
|
||||
nowords.erase();
|
||||
filetypes.clear();
|
||||
topdir.erase();
|
||||
filename.erase();
|
||||
description.erase();
|
||||
}
|
||||
bool fileNameOnly() {
|
||||
return allwords.empty() && phrase.empty() && orwords.empty() &&
|
||||
orwords1.empty() && nowords.empty();
|
||||
}
|
||||
|
||||
/** Search clause types */
enum SClType {
    SCLT_AND,      // Conjunctive list
    SCLT_OR,       // Disjunctive list
    SCLT_EXCL,     // Excluded terms (only valid inside an AND list)
    SCLT_FILENAME, // File name pattern clause
    SCLT_PHRASE,   // Terms in exact order
    SCLT_NEAR,     // Terms in proximity
    SCLT_SUB       // Complex subquery clause
};
|
||||
|
||||
}
|
||||
class SearchDataClause;
|
||||
|
||||
/**
|
||||
* Holder for a list of search clauses. Some of the clauses can be comples
|
||||
* subqueries.
|
||||
*/
|
||||
class SearchData {
|
||||
public:
|
||||
SClType m_tp; // Only SCLT_AND or SCLT_OR here
|
||||
list<SearchDataClause *> m_query;
|
||||
list<string> m_filetypes; // Restrict to filetypes if set.
|
||||
string m_topdir; // Restrict to subtree.
|
||||
// Printable expanded version of the complete query, obtained from Xapian
|
||||
// valid after setQuery() call
|
||||
string m_description;
|
||||
|
||||
SearchData(SClType tp) : m_tp(tp) {}
|
||||
~SearchData() {erase();}
|
||||
|
||||
/** Make pristine */
|
||||
void erase();
|
||||
|
||||
/** Is there anything but a file name search in here ? */
|
||||
bool fileNameOnly();
|
||||
|
||||
/** Translate to Xapian query. rcldb knows about the void* */
|
||||
bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||
|
||||
/** We become the owner of cl and will delete it */
|
||||
bool addClause(SearchDataClause *cl);
|
||||
|
||||
private:
|
||||
/* Copyconst and assignment private and forbidden */
|
||||
SearchData(const SearchData &) {}
|
||||
SearchData& operator=(const SearchData&) {return *this;};
|
||||
};
|
||||
|
||||
class SearchDataClause {
|
||||
public:
|
||||
SClType m_tp;
|
||||
|
||||
SearchDataClause(SClType tp) : m_tp(tp) {}
|
||||
virtual ~SearchDataClause() {}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string&) = 0;
|
||||
virtual bool isFileName() {return m_tp == SCLT_FILENAME ? true : false;}
|
||||
};
|
||||
|
||||
class SearchDataClauseSimple : public SearchDataClause {
|
||||
public:
|
||||
SearchDataClauseSimple(SClType tp, string txt)
|
||||
: SearchDataClause(tp), m_text(txt) {}
|
||||
virtual ~SearchDataClauseSimple() {}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||
protected:
|
||||
string m_text;
|
||||
};
|
||||
|
||||
class SearchDataClauseFilename : public SearchDataClauseSimple {
|
||||
public:
|
||||
SearchDataClauseFilename(string txt)
|
||||
: SearchDataClauseSimple(SCLT_FILENAME, m_text) {}
|
||||
virtual ~SearchDataClauseFilename() {}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||
};
|
||||
|
||||
class SearchDataClauseDist : public SearchDataClauseSimple {
|
||||
public:
|
||||
SearchDataClauseDist(SClType tp, string txt, int dist)
|
||||
: SearchDataClauseSimple(tp, txt), m_distance(dist) {}
|
||||
virtual ~SearchDataClauseDist() {}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||
|
||||
protected:
|
||||
int m_distance;
|
||||
};
|
||||
|
||||
class SearchDataClauseSub : public SearchDataClause {
|
||||
public:
|
||||
SearchDataClauseSub(SClType tp, SClType stp)
|
||||
: SearchDataClause(tp), m_sub(stp) {}
|
||||
virtual ~SearchDataClauseSub() {}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||
|
||||
protected:
|
||||
SearchData m_sub;
|
||||
};
|
||||
|
||||
} // Namespace Rcl
|
||||
#endif /* _SEARCHDATA_H_INCLUDED_ */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user