make searchdata a more flexible struct
This commit is contained in:
parent
1d7f103fe7
commit
cdbf026738
@ -8,8 +8,8 @@ LIBS = librcl.a
|
|||||||
|
|
||||||
all: $(LIBS)
|
all: $(LIBS)
|
||||||
|
|
||||||
OBJS = conftree.o csguess.o debuglog.o execmd.o idfile.o md5.o wipedir.o fstreewalk.o mh_html.o mh_mail.o mh_exec.o mh_text.o htmlparse.o indexer.o internfile.o mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o rclconfig.o rcldb.o rclinit.o stemdb.o base64.o readfile.o smallut.o textsplit.o transcode.o unacpp.o history.o docseq.o sortseq.o copyfile.o rclaspell.o
|
OBJS = conftree.o csguess.o debuglog.o execmd.o idfile.o md5.o wipedir.o fstreewalk.o mh_html.o mh_mail.o searchdata.o mh_exec.o mh_text.o htmlparse.o indexer.o internfile.o mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o rclconfig.o rcldb.o rclinit.o stemdb.o base64.o readfile.o smallut.o textsplit.o transcode.o unacpp.o history.o docseq.o sortseq.o copyfile.o rclaspell.o
|
||||||
DEPS = conftree.dep.stamp csguess.dep.stamp debuglog.dep.stamp execmd.dep.stamp idfile.dep.stamp md5.dep.stamp wipedir.dep.stamp fstreewalk.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_exec.dep.stamp mh_text.dep.stamp htmlparse.dep.stamp indexer.dep.stamp internfile.dep.stamp mimehandler.dep.stamp mimeparse.dep.stamp mimetype.dep.stamp myhtmlparse.dep.stamp pathhash.dep.stamp pathut.dep.stamp rclconfig.dep.stamp rcldb.dep.stamp rclinit.dep.stamp stemdb.dep.stamp base64.dep.stamp readfile.dep.stamp smallut.dep.stamp textsplit.dep.stamp transcode.dep.stamp unacpp.dep.stamp history.dep.stamp docseq.dep.stamp sortseq.dep.stamp copyfile.dep.stamp rclaspell.dep.stamp
|
DEPS = conftree.dep.stamp csguess.dep.stamp debuglog.dep.stamp execmd.dep.stamp idfile.dep.stamp md5.dep.stamp wipedir.dep.stamp fstreewalk.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp searchdata.dep.stamp mh_exec.dep.stamp mh_text.dep.stamp htmlparse.dep.stamp indexer.dep.stamp internfile.dep.stamp mimehandler.dep.stamp mimeparse.dep.stamp mimetype.dep.stamp myhtmlparse.dep.stamp pathhash.dep.stamp pathut.dep.stamp rclconfig.dep.stamp rcldb.dep.stamp rclinit.dep.stamp stemdb.dep.stamp base64.dep.stamp readfile.dep.stamp smallut.dep.stamp textsplit.dep.stamp transcode.dep.stamp unacpp.dep.stamp history.dep.stamp docseq.dep.stamp sortseq.dep.stamp copyfile.dep.stamp rclaspell.dep.stamp
|
||||||
|
|
||||||
librcl.a : $(DEPS) $(OBJS) unac.o
|
librcl.a : $(DEPS) $(OBJS) unac.o
|
||||||
ar ru librcl.a $(OBJS) unac.o
|
ar ru librcl.a $(OBJS) unac.o
|
||||||
@ -37,6 +37,8 @@ mh_html.o : ../common/mh_html.cpp
|
|||||||
$(CXX) $(ALL_CXXFLAGS) -c ../common/mh_html.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../common/mh_html.cpp
|
||||||
mh_mail.o : ../common/mh_mail.cpp
|
mh_mail.o : ../common/mh_mail.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../common/mh_mail.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../common/mh_mail.cpp
|
||||||
|
searchdata.o : ../common/searchdata.cpp
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -c ../common/searchdata.cpp
|
||||||
mh_exec.o : ../common/mh_exec.cpp
|
mh_exec.o : ../common/mh_exec.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../common/mh_exec.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../common/mh_exec.cpp
|
||||||
mh_text.o : ../common/mh_text.cpp
|
mh_text.o : ../common/mh_text.cpp
|
||||||
@ -125,6 +127,9 @@ mh_html.dep.stamp : ../common/mh_html.cpp
|
|||||||
mh_mail.dep.stamp : ../common/mh_mail.cpp
|
mh_mail.dep.stamp : ../common/mh_mail.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../common/mh_mail.cpp > mh_mail.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../common/mh_mail.cpp > mh_mail.dep
|
||||||
touch mh_mail.dep.stamp
|
touch mh_mail.dep.stamp
|
||||||
|
searchdata.dep.stamp : ../common/searchdata.cpp
|
||||||
|
$(CXX) -M $(ALL_CXXFLAGS) ../common/searchdata.cpp > searchdata.dep
|
||||||
|
touch searchdata.dep.stamp
|
||||||
mh_exec.dep.stamp : ../common/mh_exec.cpp
|
mh_exec.dep.stamp : ../common/mh_exec.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../common/mh_exec.cpp > mh_exec.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../common/mh_exec.cpp > mh_exec.dep
|
||||||
touch mh_exec.dep.stamp
|
touch mh_exec.dep.stamp
|
||||||
@ -213,6 +218,7 @@ include wipedir.dep
|
|||||||
include fstreewalk.dep
|
include fstreewalk.dep
|
||||||
include mh_html.dep
|
include mh_html.dep
|
||||||
include mh_mail.dep
|
include mh_mail.dep
|
||||||
|
include searchdata.dep
|
||||||
include mh_exec.dep
|
include mh_exec.dep
|
||||||
include mh_text.dep
|
include mh_text.dep
|
||||||
include htmlparse.dep
|
include htmlparse.dep
|
||||||
|
|||||||
@ -8,6 +8,7 @@ SRCS="${depth}/utils/conftree.cpp ${depth}/index/csguess.cpp \
|
|||||||
${depth}/utils/idfile.cpp ${depth}/utils/md5.cpp \
|
${depth}/utils/idfile.cpp ${depth}/utils/md5.cpp \
|
||||||
${depth}/utils/wipedir.cpp ${depth}/utils/fstreewalk.cpp \
|
${depth}/utils/wipedir.cpp ${depth}/utils/fstreewalk.cpp \
|
||||||
${depth}/common/mh_html.cpp ${depth}/common/mh_mail.cpp \
|
${depth}/common/mh_html.cpp ${depth}/common/mh_mail.cpp \
|
||||||
|
${depth}/common/searchdata.cpp \
|
||||||
${depth}/common/mh_exec.cpp ${depth}/common/mh_text.cpp \
|
${depth}/common/mh_exec.cpp ${depth}/common/mh_text.cpp \
|
||||||
${depth}/common/htmlparse.cpp ${depth}/index/indexer.cpp \
|
${depth}/common/htmlparse.cpp ${depth}/index/indexer.cpp \
|
||||||
${depth}/common/internfile.cpp ${depth}/common/mimehandler.cpp \
|
${depth}/common/internfile.cpp ${depth}/common/mimehandler.cpp \
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.90 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.91 2006-11-13 08:49:44 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -174,6 +174,229 @@ bool Native::subDocs(const string &hash, vector<Xapian::docid>& docids)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
|
||||||
|
int qopts,
|
||||||
|
Xapian::docid docid, const list<string>& terms)
|
||||||
|
{
|
||||||
|
LOGDEB1(("Db::dbDataToRclDoc: opts %x data: %s\n", qopts, data.c_str()));
|
||||||
|
ConfSimple parms(&data);
|
||||||
|
if (!parms.ok())
|
||||||
|
return false;
|
||||||
|
parms.get(string("url"), doc.url);
|
||||||
|
parms.get(string("mtype"), doc.mimetype);
|
||||||
|
parms.get(string("fmtime"), doc.fmtime);
|
||||||
|
parms.get(string("dmtime"), doc.dmtime);
|
||||||
|
parms.get(string("origcharset"), doc.origcharset);
|
||||||
|
parms.get(string("caption"), doc.title);
|
||||||
|
parms.get(string("keywords"), doc.keywords);
|
||||||
|
parms.get(string("abstract"), doc.abstract);
|
||||||
|
// Possibly remove synthetic abstract indicator (if it's there, we
|
||||||
|
// used to index the beginning of the text as abstract).
|
||||||
|
bool syntabs = false;
|
||||||
|
if (doc.abstract.find(rclSyntAbs) == 0) {
|
||||||
|
doc.abstract = doc.abstract.substr(rclSyntAbs.length());
|
||||||
|
syntabs = true;
|
||||||
|
}
|
||||||
|
// If the option is set and the abstract is synthetic or empty , build
|
||||||
|
// abstract from position data.
|
||||||
|
if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
|
||||||
|
LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
|
||||||
|
if (doc.abstract.empty() || syntabs ||
|
||||||
|
(qopts & Db::QO_REPLACE_ABSTRACT))
|
||||||
|
doc.abstract = makeAbstract(docid, terms);
|
||||||
|
}
|
||||||
|
parms.get(string("ipath"), doc.ipath);
|
||||||
|
parms.get(string("fbytes"), doc.fbytes);
|
||||||
|
parms.get(string("dbytes"), doc.dbytes);
|
||||||
|
doc.xdocid = docid;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We build a possibly full size but sparsely populated (only around
|
||||||
|
// the search term occurrences) reconstruction of the document. It
|
||||||
|
// would be possible to compress the array, by having only multiple
|
||||||
|
// chunks around the terms, but this would seriously complicate the
|
||||||
|
// data structure.
|
||||||
|
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
||||||
|
{
|
||||||
|
LOGDEB(("Native::makeAbstract: maxlen %d wWidth %d\n",
|
||||||
|
m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
|
||||||
|
|
||||||
|
Chrono chron;
|
||||||
|
|
||||||
|
// For each of the query terms, query xapian for its positions
|
||||||
|
// list in the document. For each position entry, remember it in qtermposs
|
||||||
|
// and insert it and its neighbours in the set of 'interesting' positions
|
||||||
|
|
||||||
|
// The terms 'array' that we partially populate with the document
|
||||||
|
// terms, at their positions around the search terms positions:
|
||||||
|
map<unsigned int, string> sparseDoc;
|
||||||
|
|
||||||
|
// All the query term positions. We remember this mainly because we are
|
||||||
|
// going to random-shuffle it for selecting the chunks that we actually
|
||||||
|
// print.
|
||||||
|
vector<unsigned int> qtermposs;
|
||||||
|
|
||||||
|
// Limit the total number of slots we populate.
|
||||||
|
const unsigned int maxtotaloccs = 300;
|
||||||
|
// Max occurrences per term. We initially know nothing about the
|
||||||
|
// occurrences repartition (it would be possible that only one
|
||||||
|
// term in the list occurs, or that all do). So this is a rather
|
||||||
|
// arbitrary choice.
|
||||||
|
const unsigned int maxoccperterm = maxtotaloccs / 10;
|
||||||
|
unsigned int totaloccs = 0;
|
||||||
|
|
||||||
|
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
|
||||||
|
qit++) {
|
||||||
|
Xapian::PositionIterator pos;
|
||||||
|
// There may be query terms not in this doc. This raises an
|
||||||
|
// exception when requesting the position list, we catch it.
|
||||||
|
string emptys;
|
||||||
|
try {
|
||||||
|
unsigned int occurrences = 0;
|
||||||
|
for (pos = db.positionlist_begin(docid, *qit);
|
||||||
|
pos != db.positionlist_end(docid, *qit); pos++) {
|
||||||
|
unsigned int ipos = *pos;
|
||||||
|
LOGDEB2(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
||||||
|
// Remember the term position
|
||||||
|
qtermposs.push_back(ipos);
|
||||||
|
// Add adjacent slots to the set to populate at next step
|
||||||
|
unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
||||||
|
unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
|
||||||
|
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||||
|
if (ii == ipos)
|
||||||
|
sparseDoc[ii] = *qit;
|
||||||
|
else
|
||||||
|
sparseDoc[ii] = emptys;
|
||||||
|
}
|
||||||
|
// Limit the number of occurences we keep for each
|
||||||
|
// term. The abstract has a finite length anyway !
|
||||||
|
if (occurrences++ > maxoccperterm)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} catch (...) {
|
||||||
|
// Term does not occur. No problem.
|
||||||
|
}
|
||||||
|
// Limit total size
|
||||||
|
if (totaloccs++ > maxtotaloccs)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOGDEB(("Abstract:%d:chosen number of positions %d. Populating\n",
|
||||||
|
chron.millis(), qtermposs.size()));
|
||||||
|
|
||||||
|
// Walk the full document position list (for each term walk
|
||||||
|
// position list) and populate slots around the query terms. We
|
||||||
|
// arbitrarily truncate the list to avoid taking forever. If we do
|
||||||
|
// cutoff, the abstract may be inconsistant, which is bad...
|
||||||
|
{
|
||||||
|
Xapian::TermIterator term;
|
||||||
|
int cutoff = 500 * 1000;
|
||||||
|
|
||||||
|
for (term = db.termlist_begin(docid);
|
||||||
|
term != db.termlist_end(docid); term++) {
|
||||||
|
if (cutoff-- < 0) {
|
||||||
|
LOGDEB(("Abstract: max term count cutoff\n"));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
Xapian::PositionIterator pos;
|
||||||
|
for (pos = db.positionlist_begin(docid, *term);
|
||||||
|
pos != db.positionlist_end(docid, *term); pos++) {
|
||||||
|
if (cutoff-- < 0) {
|
||||||
|
LOGDEB(("Abstract: max term count cutoff\n"));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
map<unsigned int, string>::iterator vit;
|
||||||
|
if ((vit=sparseDoc.find(*pos)) != sparseDoc.end()) {
|
||||||
|
// Don't replace a term: the terms list is in
|
||||||
|
// alphabetic order, and we may have several terms
|
||||||
|
// at the same position, we want to keep only the
|
||||||
|
// first one (ie: dockes and dockes@wanadoo.fr)
|
||||||
|
if (vit->second.empty()) {
|
||||||
|
LOGDEB2(("Abstract: populating: [%s] at %d\n",
|
||||||
|
(*term).c_str(), *pos));
|
||||||
|
sparseDoc[*pos] = *term;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// Debug only: output the full term[position] vector
|
||||||
|
bool epty = false;
|
||||||
|
int ipos = 0;
|
||||||
|
for (map<unsigned int, string>::iterator it = sparseDoc.begin();
|
||||||
|
it != sparseDoc.end();
|
||||||
|
it++, ipos++) {
|
||||||
|
if (it->empty()) {
|
||||||
|
if (!epty)
|
||||||
|
LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
|
||||||
|
epty=true;
|
||||||
|
} else {
|
||||||
|
epty = false;
|
||||||
|
LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LOGDEB(("Abstract:%d: randomizing and extracting\n", chron.millis()));
|
||||||
|
|
||||||
|
// We randomize the selection of term positions, from which we
|
||||||
|
// shall pull, starting at the beginning, until the abstract is
|
||||||
|
// big enough. The abstract is finally built in correct position
|
||||||
|
// order, thanks to the position map.
|
||||||
|
random_shuffle(qtermposs.begin(), qtermposs.end());
|
||||||
|
map<unsigned int, string> mabs;
|
||||||
|
unsigned int abslen = 0;
|
||||||
|
|
||||||
|
// Extract data around the N first (in random order) query term
|
||||||
|
// positions, and store the terms in the map. Don't concatenate
|
||||||
|
// immediately into chunks because there might be overlaps
|
||||||
|
for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
|
||||||
|
pos != qtermposs.end(); pos++) {
|
||||||
|
|
||||||
|
if (int(abslen) > m_db->m_synthAbsLen)
|
||||||
|
break;
|
||||||
|
|
||||||
|
unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
|
||||||
|
unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
|
||||||
|
|
||||||
|
LOGDEB2(("Abstract: %d<-%d->%d\n", sta, *pos, sto));
|
||||||
|
|
||||||
|
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||||
|
|
||||||
|
if (int(abslen) > m_db->m_synthAbsLen)
|
||||||
|
break;
|
||||||
|
map<unsigned int, string>::const_iterator vit =
|
||||||
|
sparseDoc.find(ii);
|
||||||
|
if (vit != sparseDoc.end() && !vit->second.empty()) {
|
||||||
|
LOGDEB2(("Abstract: position %d -> [%s]\n",
|
||||||
|
ii, vit->second.c_str()));
|
||||||
|
mabs[ii] = vit->second;
|
||||||
|
abslen += vit->second.length();
|
||||||
|
} else {
|
||||||
|
LOGDEB2(("Abstract: empty position at %d\n", ii));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Possibly add a ... at the end of chunk if it's not
|
||||||
|
// overlapping
|
||||||
|
if (mabs.find(sto+1) == mabs.end())
|
||||||
|
mabs[sto+1] = "...";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build the abstract by walking the map (in order of position)
|
||||||
|
string abstract;
|
||||||
|
for (map<unsigned int, string>::const_iterator it = mabs.begin();
|
||||||
|
it != mabs.end(); it++) {
|
||||||
|
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
||||||
|
abstract += it->second + " ";
|
||||||
|
}
|
||||||
|
LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
|
||||||
|
return abstract;
|
||||||
|
}
|
||||||
|
|
||||||
/* Rcl::Db methods ///////////////////////////////// */
|
/* Rcl::Db methods ///////////////////////////////// */
|
||||||
|
|
||||||
@ -909,279 +1132,67 @@ bool Db::purgeFile(const string &fn)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Splitter callback for breaking query into terms
|
bool Db::filenameWildExp(const string& fnexp, list<string>& names)
|
||||||
class wsQData : public TextSplitCB {
|
|
||||||
public:
|
|
||||||
vector<string> terms;
|
|
||||||
string catterms() {
|
|
||||||
string s;
|
|
||||||
for (unsigned int i=0;i<terms.size();i++) {
|
|
||||||
s += "[" + terms[i] + "] ";
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
bool takeword(const std::string &term, int , int, int) {
|
|
||||||
LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
|
|
||||||
terms.push_back(term);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
void dumball() {
|
|
||||||
for (vector<string>::iterator it=terms.begin(); it !=terms.end();it++){
|
|
||||||
string dumb;
|
|
||||||
dumb_string(*it, dumb);
|
|
||||||
*it = dumb;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Turn string into list of xapian queries. There is little
|
|
||||||
// interpretation done on the string (no +term -term or filename:term
|
|
||||||
// stuff). We just separate words and phrases, and interpret
|
|
||||||
// capitalized terms as wanting no stem expansion.
|
|
||||||
// The final list contains one query for each term or phrase
|
|
||||||
// - Elements corresponding to a stem-expanded part are an OP_OR
|
|
||||||
// composition of the stem-expanded terms (or a single term query).
|
|
||||||
// - Elements corresponding to a phrase are an OP_PHRASE composition of the
|
|
||||||
// phrase terms (no stem expansion in this case)
|
|
||||||
static void stringToXapianQueries(const string &iq,
|
|
||||||
const string& stemlang,
|
|
||||||
Db *db,
|
|
||||||
list<Xapian::Query> &pqueries,
|
|
||||||
unsigned int opts = Db::QO_NONE)
|
|
||||||
{
|
{
|
||||||
string qstring = iq;
|
// File name search, with possible wildcards.
|
||||||
|
// We expand wildcards by scanning the filename terms (prefixed
|
||||||
|
// with XSFN) from the database.
|
||||||
|
// We build an OR query with the expanded values if any.
|
||||||
|
string pattern;
|
||||||
|
dumb_string(fnexp, pattern);
|
||||||
|
|
||||||
// Split into (possibly single word) phrases ("this is a phrase"):
|
// If pattern is not quoted, and has no wildcards, we add * at
|
||||||
list<string> phrases;
|
// each end: match any substring
|
||||||
stringToStrings(qstring, phrases);
|
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
|
||||||
|
pattern = pattern.substr(1, pattern.size() -2);
|
||||||
|
} else if (pattern.find_first_of("*?[") == string::npos) {
|
||||||
|
pattern = "*" + pattern + "*";
|
||||||
|
} // else let it be
|
||||||
|
|
||||||
// Then process each phrase: split into terms and transform into
|
LOGDEB((" pattern: [%s]\n", pattern.c_str()));
|
||||||
// appropriate Xapian Query
|
|
||||||
|
|
||||||
for (list<string>::iterator it=phrases.begin(); it !=phrases.end(); it++) {
|
// Match pattern against all file names in the db
|
||||||
LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
|
Xapian::TermIterator it = m_ndb->db.allterms_begin();
|
||||||
|
it.skip_to("XSFN");
|
||||||
// If there are both spans and single words in this element,
|
for (;it != m_ndb->db.allterms_end(); it++) {
|
||||||
// we need to use a word split, else a phrase query including
|
if ((*it).find("XSFN") != 0)
|
||||||
// a span would fail if we didn't adjust the proximity to
|
break;
|
||||||
// account for the additional span term which is complicated.
|
string fn = (*it).substr(4);
|
||||||
wsQData splitDataS, splitDataW;
|
LOGDEB2(("Matching [%s] and [%s]\n", pattern.c_str(), fn.c_str()));
|
||||||
TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS);
|
if (fnmatch(pattern.c_str(), fn.c_str(), 0) != FNM_NOMATCH) {
|
||||||
splitterS.text_to_words(*it);
|
names.push_back((*it).c_str());
|
||||||
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
|
}
|
||||||
splitterW.text_to_words(*it);
|
// Limit the match count
|
||||||
wsQData& splitData = splitDataS;
|
if (names.size() > 1000) {
|
||||||
if (splitDataS.terms.size() > 1 && splitDataS.terms.size() !=
|
LOGERR(("Db::SetQuery: too many matched file names\n"));
|
||||||
splitDataW.terms.size())
|
|
||||||
splitData = splitDataW;
|
|
||||||
|
|
||||||
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
|
|
||||||
splitData.terms.size()));
|
|
||||||
switch(splitData.terms.size()) {
|
|
||||||
case 0: continue;// ??
|
|
||||||
case 1: // Not a real phrase: one term
|
|
||||||
{
|
|
||||||
string term = splitData.terms.front();
|
|
||||||
bool nostemexp = false;
|
|
||||||
// Check if the first letter is a majuscule in which
|
|
||||||
// case we do not want to do stem expansion. Note that
|
|
||||||
// the test is convoluted and possibly problematic
|
|
||||||
if (term.length() > 0) {
|
|
||||||
string noacterm,noaclowterm;
|
|
||||||
if (unacmaybefold(term, noacterm, "UTF-8", false) &&
|
|
||||||
unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
|
||||||
Utf8Iter it1(noacterm);
|
|
||||||
Utf8Iter it2(noaclowterm);
|
|
||||||
if (*it1 != *it2)
|
|
||||||
nostemexp = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
LOGDEB1(("Term: %s stem expansion: %s\n",
|
|
||||||
term.c_str(), nostemexp?"no":"yes"));
|
|
||||||
|
|
||||||
list<string> exp;
|
|
||||||
string term1;
|
|
||||||
dumb_string(term, term1);
|
|
||||||
// Possibly perform stem compression/expansion
|
|
||||||
if (!nostemexp && (opts & Db::QO_STEM)) {
|
|
||||||
exp = db->stemExpand(stemlang, term1);
|
|
||||||
} else {
|
|
||||||
exp.push_back(term1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Push either term or OR of stem-expanded set
|
|
||||||
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
|
||||||
exp.begin(), exp.end()));
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
|
||||||
// Phrase: no stem expansion
|
|
||||||
splitData.dumball();
|
|
||||||
LOGDEB(("Pushing phrase: [%s]\n", splitData.catterms().c_str()));
|
|
||||||
pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
|
|
||||||
splitData.terms.begin(),
|
|
||||||
splitData.terms.end()));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (names.empty()) {
|
||||||
|
// Build an impossible query: we know its impossible because we
|
||||||
|
// control the prefixes!
|
||||||
|
names.push_back("XIMPOSSIBLE");
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prepare query out of "advanced search" data
|
// Prepare query out of "advanced search" data
|
||||||
bool Db::setQuery(AdvSearchData &sdata, int opts, const string& stemlang)
|
bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
|
||||||
|
const string& stemlang)
|
||||||
{
|
{
|
||||||
LOGDEB(("Db::setQuery: adv:\n"));
|
if (!m_ndb) {
|
||||||
LOGDEB((" allwords: %s\n", sdata.allwords.c_str()));
|
LOGERR(("Db::setQuery: no db!\n"));
|
||||||
LOGDEB((" phrase: %s\n", sdata.phrase.c_str()));
|
|
||||||
LOGDEB((" orwords: %s\n", sdata.orwords.c_str()));
|
|
||||||
LOGDEB((" orwords1: %s\n", sdata.orwords1.c_str()));
|
|
||||||
LOGDEB((" nowords: %s\n", sdata.nowords.c_str()));
|
|
||||||
LOGDEB((" filename: %s\n", sdata.filename.c_str()));
|
|
||||||
|
|
||||||
string ft;
|
|
||||||
for (list<string>::iterator it = sdata.filetypes.begin();
|
|
||||||
it != sdata.filetypes.end(); it++) {ft += *it + " ";}
|
|
||||||
if (!ft.empty())
|
|
||||||
LOGDEB((" searched file types: %s\n", ft.c_str()));
|
|
||||||
if (!sdata.topdir.empty())
|
|
||||||
LOGDEB((" restricted to: %s\n", sdata.topdir.c_str()));
|
|
||||||
LOGDEB((" Options: 0x%x\n", opts));
|
|
||||||
|
|
||||||
m_filterTopDir = sdata.topdir;
|
|
||||||
m_dbindices.clear();
|
|
||||||
|
|
||||||
if (!m_ndb)
|
|
||||||
return false;
|
return false;
|
||||||
list<Xapian::Query> pqueries;
|
}
|
||||||
Xapian::Query xq;
|
|
||||||
|
|
||||||
|
LOGDEB(("Db::setQuery:\n"));
|
||||||
|
|
||||||
|
m_filterTopDir = sdata->m_topdir;
|
||||||
|
m_dbindices.clear();
|
||||||
m_qOpts = opts;
|
m_qOpts = opts;
|
||||||
|
|
||||||
if (!sdata.filename.empty()) {
|
Xapian::Query xq;
|
||||||
LOGDEB((" filename search\n"));
|
sdata->toNativeQuery(*this, &xq, (opts & Db::QO_STEM) ? stemlang : "");
|
||||||
// File name search, with possible wildcards.
|
|
||||||
// We expand wildcards by scanning the filename terms (prefixed
|
|
||||||
// with XSFN) from the database.
|
|
||||||
// We build an OR query with the expanded values if any.
|
|
||||||
string pattern;
|
|
||||||
dumb_string(sdata.filename, pattern);
|
|
||||||
|
|
||||||
// If pattern is not quoted, and has no wildcards, we add * at
|
|
||||||
// each end: match any substring
|
|
||||||
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
|
|
||||||
pattern = pattern.substr(1, pattern.size() -2);
|
|
||||||
} else if (pattern.find_first_of("*?[") == string::npos) {
|
|
||||||
pattern = "*" + pattern + "*";
|
|
||||||
} // else let it be
|
|
||||||
|
|
||||||
LOGDEB((" pattern: [%s]\n", pattern.c_str()));
|
|
||||||
|
|
||||||
// Match pattern against all file names in the db
|
|
||||||
Xapian::TermIterator it = m_ndb->db.allterms_begin();
|
|
||||||
it.skip_to("XSFN");
|
|
||||||
list<string> names;
|
|
||||||
for (;it != m_ndb->db.allterms_end(); it++) {
|
|
||||||
if ((*it).find("XSFN") != 0)
|
|
||||||
break;
|
|
||||||
string fn = (*it).substr(4);
|
|
||||||
LOGDEB2(("Matching [%s] and [%s]\n", pattern.c_str(), fn.c_str()));
|
|
||||||
if (fnmatch(pattern.c_str(), fn.c_str(), 0) != FNM_NOMATCH) {
|
|
||||||
names.push_back((*it).c_str());
|
|
||||||
}
|
|
||||||
// Limit the match count
|
|
||||||
if (names.size() > 1000) {
|
|
||||||
LOGERR(("Db::SetQuery: too many matched file names\n"));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (names.empty()) {
|
|
||||||
// Build an impossible query: we know its impossible because we
|
|
||||||
// control the prefixes!
|
|
||||||
names.push_back("XIMPOSSIBLE");
|
|
||||||
}
|
|
||||||
// Build a query out of the matching file name terms.
|
|
||||||
xq = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sdata.allwords.empty()) {
|
|
||||||
stringToXapianQueries(sdata.allwords, stemlang, this,pqueries,m_qOpts);
|
|
||||||
if (!pqueries.empty()) {
|
|
||||||
Xapian::Query nq =
|
|
||||||
Xapian::Query(Xapian::Query::OP_AND, pqueries.begin(),
|
|
||||||
pqueries.end());
|
|
||||||
xq = xq.empty() ? nq :
|
|
||||||
Xapian::Query(Xapian::Query::OP_AND, xq, nq);
|
|
||||||
pqueries.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sdata.orwords.empty()) {
|
|
||||||
stringToXapianQueries(sdata.orwords, stemlang, this,pqueries,m_qOpts);
|
|
||||||
if (!pqueries.empty()) {
|
|
||||||
Xapian::Query nq =
|
|
||||||
Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(),
|
|
||||||
pqueries.end());
|
|
||||||
xq = xq.empty() ? nq :
|
|
||||||
Xapian::Query(Xapian::Query::OP_AND, xq, nq);
|
|
||||||
pqueries.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sdata.orwords1.empty()) {
|
|
||||||
stringToXapianQueries(sdata.orwords1, stemlang, this,pqueries,m_qOpts);
|
|
||||||
if (!pqueries.empty()) {
|
|
||||||
Xapian::Query nq =
|
|
||||||
Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(),
|
|
||||||
pqueries.end());
|
|
||||||
xq = xq.empty() ? nq :
|
|
||||||
Xapian::Query(Xapian::Query::OP_AND, xq, nq);
|
|
||||||
pqueries.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sdata.phrase.empty()) {
|
|
||||||
Xapian::Query nq;
|
|
||||||
string s = string("\"") + sdata.phrase + string("\"");
|
|
||||||
stringToXapianQueries(s, stemlang, this, pqueries);
|
|
||||||
if (!pqueries.empty()) {
|
|
||||||
// There should be a single list element phrase query.
|
|
||||||
xq = xq.empty() ? *pqueries.begin() :
|
|
||||||
Xapian::Query(Xapian::Query::OP_AND, xq, *pqueries.begin());
|
|
||||||
pqueries.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sdata.filetypes.empty()) {
|
|
||||||
Xapian::Query tq;
|
|
||||||
for (list<string>::iterator it = sdata.filetypes.begin();
|
|
||||||
it != sdata.filetypes.end(); it++) {
|
|
||||||
string term = "T" + *it;
|
|
||||||
LOGDEB(("Adding file type term: [%s]\n", term.c_str()));
|
|
||||||
tq = tq.empty() ? Xapian::Query(term) :
|
|
||||||
Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
|
|
||||||
}
|
|
||||||
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
|
|
||||||
}
|
|
||||||
|
|
||||||
// "And not" part. Must come last, as we have to check it's not
|
|
||||||
// the only term in the query. We do no stem expansion on 'No'
|
|
||||||
// words. Should we ?
|
|
||||||
if (!sdata.nowords.empty()) {
|
|
||||||
stringToXapianQueries(sdata.nowords, stemlang, this, pqueries);
|
|
||||||
if (!pqueries.empty()) {
|
|
||||||
Xapian::Query nq;
|
|
||||||
nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(),
|
|
||||||
pqueries.end());
|
|
||||||
if (xq.empty()) {
|
|
||||||
// Xapian cant do this currently. Have to have a positive
|
|
||||||
// part!
|
|
||||||
sdata.description = "Error: pure negative query\n";
|
|
||||||
LOGERR(("Rcl::Db::setQuery: error: pure negative query\n"));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
xq = Xapian::Query(Xapian::Query::OP_AND_NOT, xq, nq);
|
|
||||||
pqueries.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
m_ndb->query = xq;
|
m_ndb->query = xq;
|
||||||
delete m_ndb->enquire;
|
delete m_ndb->enquire;
|
||||||
@ -1189,10 +1200,11 @@ bool Db::setQuery(AdvSearchData &sdata, int opts, const string& stemlang)
|
|||||||
m_ndb->enquire->set_query(m_ndb->query);
|
m_ndb->enquire->set_query(m_ndb->query);
|
||||||
m_ndb->mset = Xapian::MSet();
|
m_ndb->mset = Xapian::MSet();
|
||||||
// Get the query description and trim the "Xapian::Query"
|
// Get the query description and trim the "Xapian::Query"
|
||||||
sdata.description = m_ndb->query.get_description();
|
sdata->m_description = m_ndb->query.get_description();
|
||||||
if (sdata.description.find("Xapian::Query") == 0)
|
if (sdata->m_description.find("Xapian::Query") == 0)
|
||||||
sdata.description = sdata.description.substr(strlen("Xapian::Query"));
|
sdata->m_description =
|
||||||
LOGDEB(("Db::SetQuery: Q: %s\n", sdata.description.c_str()));
|
sdata->m_description.substr(strlen("Xapian::Query"));
|
||||||
|
LOGDEB(("Db::SetQuery: Q: %s\n", sdata->m_description.c_str()));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1422,43 +1434,6 @@ int Db::getResCnt()
|
|||||||
return m_ndb->mset.get_matches_lower_bound();
|
return m_ndb->mset.get_matches_lower_bound();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
|
|
||||||
int qopts,
|
|
||||||
Xapian::docid docid, const list<string>& terms)
|
|
||||||
{
|
|
||||||
LOGDEB1(("Db::dbDataToRclDoc: opts %x data: %s\n", qopts, data.c_str()));
|
|
||||||
ConfSimple parms(&data);
|
|
||||||
if (!parms.ok())
|
|
||||||
return false;
|
|
||||||
parms.get(string("url"), doc.url);
|
|
||||||
parms.get(string("mtype"), doc.mimetype);
|
|
||||||
parms.get(string("fmtime"), doc.fmtime);
|
|
||||||
parms.get(string("dmtime"), doc.dmtime);
|
|
||||||
parms.get(string("origcharset"), doc.origcharset);
|
|
||||||
parms.get(string("caption"), doc.title);
|
|
||||||
parms.get(string("keywords"), doc.keywords);
|
|
||||||
parms.get(string("abstract"), doc.abstract);
|
|
||||||
// Possibly remove synthetic abstract indicator (if it's there, we
|
|
||||||
// used to index the beginning of the text as abstract).
|
|
||||||
bool syntabs = false;
|
|
||||||
if (doc.abstract.find(rclSyntAbs) == 0) {
|
|
||||||
doc.abstract = doc.abstract.substr(rclSyntAbs.length());
|
|
||||||
syntabs = true;
|
|
||||||
}
|
|
||||||
// If the option is set and the abstract is synthetic or empty , build
|
|
||||||
// abstract from position data.
|
|
||||||
if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
|
|
||||||
LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
|
|
||||||
if (doc.abstract.empty() || syntabs ||
|
|
||||||
(qopts & Db::QO_REPLACE_ABSTRACT))
|
|
||||||
doc.abstract = makeAbstract(docid, terms);
|
|
||||||
}
|
|
||||||
parms.get(string("ipath"), doc.ipath);
|
|
||||||
parms.get(string("fbytes"), doc.fbytes);
|
|
||||||
parms.get(string("dbytes"), doc.dbytes);
|
|
||||||
doc.xdocid = docid;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get document at rank i in query (i is the index in the whole result
|
// Get document at rank i in query (i is the index in the whole result
|
||||||
// set, as in the enquire class. We check if the current mset has the
|
// set, as in the enquire class. We check if the current mset has the
|
||||||
@ -1641,191 +1616,6 @@ list<string> Db::expand(const Doc &doc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// We build a possibly full size but sparsely populated (only around
|
|
||||||
// the search term occurrences) reconstruction of the document. It
|
|
||||||
// would be possible to compress the array, by having only multiple
|
|
||||||
// chunks around the terms, but this would seriously complicate the
|
|
||||||
// data structure.
|
|
||||||
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
|
||||||
{
|
|
||||||
LOGDEB(("Native::makeAbstract: maxlen %d wWidth %d\n",
|
|
||||||
m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
|
|
||||||
|
|
||||||
Chrono chron;
|
|
||||||
|
|
||||||
// For each of the query terms, query xapian for its positions
|
|
||||||
// list in the document. For each position entry, remember it in qtermposs
|
|
||||||
// and insert it and its neighbours in the set of 'interesting' positions
|
|
||||||
|
|
||||||
// The terms 'array' that we partially populate with the document
|
|
||||||
// terms, at their positions around the search terms positions:
|
|
||||||
map<unsigned int, string> sparseDoc;
|
|
||||||
|
|
||||||
// All the query term positions. We remember this mainly because we are
|
|
||||||
// going to random-shuffle it for selecting the chunks that we actually
|
|
||||||
// print.
|
|
||||||
vector<unsigned int> qtermposs;
|
|
||||||
|
|
||||||
// Limit the total number of slots we populate.
|
|
||||||
const unsigned int maxtotaloccs = 300;
|
|
||||||
// Max occurrences per term. We initially know nothing about the
|
|
||||||
// occurrences repartition (it would be possible that only one
|
|
||||||
// term in the list occurs, or that all do). So this is a rather
|
|
||||||
// arbitrary choice.
|
|
||||||
const unsigned int maxoccperterm = maxtotaloccs / 10;
|
|
||||||
unsigned int totaloccs = 0;
|
|
||||||
|
|
||||||
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
|
|
||||||
qit++) {
|
|
||||||
Xapian::PositionIterator pos;
|
|
||||||
// There may be query terms not in this doc. This raises an
|
|
||||||
// exception when requesting the position list, we catch it.
|
|
||||||
string emptys;
|
|
||||||
try {
|
|
||||||
unsigned int occurrences = 0;
|
|
||||||
for (pos = db.positionlist_begin(docid, *qit);
|
|
||||||
pos != db.positionlist_end(docid, *qit); pos++) {
|
|
||||||
unsigned int ipos = *pos;
|
|
||||||
LOGDEB2(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
|
||||||
// Remember the term position
|
|
||||||
qtermposs.push_back(ipos);
|
|
||||||
// Add adjacent slots to the set to populate at next step
|
|
||||||
unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
|
||||||
unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
|
|
||||||
for (unsigned int ii = sta; ii <= sto; ii++) {
|
|
||||||
if (ii == ipos)
|
|
||||||
sparseDoc[ii] = *qit;
|
|
||||||
else
|
|
||||||
sparseDoc[ii] = emptys;
|
|
||||||
}
|
|
||||||
// Limit the number of occurences we keep for each
|
|
||||||
// term. The abstract has a finite length anyway !
|
|
||||||
if (occurrences++ > maxoccperterm)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} catch (...) {
|
|
||||||
// Term does not occur. No problem.
|
|
||||||
}
|
|
||||||
// Limit total size
|
|
||||||
if (totaloccs++ > maxtotaloccs)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
LOGDEB(("Abstract:%d:chosen number of positions %d. Populating\n",
|
|
||||||
chron.millis(), qtermposs.size()));
|
|
||||||
|
|
||||||
// Walk the full document position list (for each term walk
|
|
||||||
// position list) and populate slots around the query terms. We
|
|
||||||
// arbitrarily truncate the list to avoid taking forever. If we do
|
|
||||||
// cutoff, the abstract may be inconsistant, which is bad...
|
|
||||||
{
|
|
||||||
Xapian::TermIterator term;
|
|
||||||
int cutoff = 500 * 1000;
|
|
||||||
|
|
||||||
for (term = db.termlist_begin(docid);
|
|
||||||
term != db.termlist_end(docid); term++) {
|
|
||||||
if (cutoff-- < 0) {
|
|
||||||
LOGDEB(("Abstract: max term count cutoff\n"));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
Xapian::PositionIterator pos;
|
|
||||||
for (pos = db.positionlist_begin(docid, *term);
|
|
||||||
pos != db.positionlist_end(docid, *term); pos++) {
|
|
||||||
if (cutoff-- < 0) {
|
|
||||||
LOGDEB(("Abstract: max term count cutoff\n"));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
map<unsigned int, string>::iterator vit;
|
|
||||||
if ((vit=sparseDoc.find(*pos)) != sparseDoc.end()) {
|
|
||||||
// Don't replace a term: the terms list is in
|
|
||||||
// alphabetic order, and we may have several terms
|
|
||||||
// at the same position, we want to keep only the
|
|
||||||
// first one (ie: dockes and dockes@wanadoo.fr)
|
|
||||||
if (vit->second.empty()) {
|
|
||||||
LOGDEB2(("Abstract: populating: [%s] at %d\n",
|
|
||||||
(*term).c_str(), *pos));
|
|
||||||
sparseDoc[*pos] = *term;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
// Debug only: output the full term[position] vector
|
|
||||||
bool epty = false;
|
|
||||||
int ipos = 0;
|
|
||||||
for (map<unsigned int, string>::iterator it = sparseDoc.begin();
|
|
||||||
it != sparseDoc.end();
|
|
||||||
it++, ipos++) {
|
|
||||||
if (it->empty()) {
|
|
||||||
if (!epty)
|
|
||||||
LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
|
|
||||||
epty=true;
|
|
||||||
} else {
|
|
||||||
epty = false;
|
|
||||||
LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
LOGDEB(("Abstract:%d: randomizing and extracting\n", chron.millis()));
|
|
||||||
|
|
||||||
// We randomize the selection of term positions, from which we
|
|
||||||
// shall pull, starting at the beginning, until the abstract is
|
|
||||||
// big enough. The abstract is finally built in correct position
|
|
||||||
// order, thanks to the position map.
|
|
||||||
random_shuffle(qtermposs.begin(), qtermposs.end());
|
|
||||||
map<unsigned int, string> mabs;
|
|
||||||
unsigned int abslen = 0;
|
|
||||||
|
|
||||||
// Extract data around the N first (in random order) query term
|
|
||||||
// positions, and store the terms in the map. Don't concatenate
|
|
||||||
// immediately into chunks because there might be overlaps
|
|
||||||
for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
|
|
||||||
pos != qtermposs.end(); pos++) {
|
|
||||||
|
|
||||||
if (int(abslen) > m_db->m_synthAbsLen)
|
|
||||||
break;
|
|
||||||
|
|
||||||
unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
|
|
||||||
unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
|
|
||||||
|
|
||||||
LOGDEB2(("Abstract: %d<-%d->%d\n", sta, *pos, sto));
|
|
||||||
|
|
||||||
for (unsigned int ii = sta; ii <= sto; ii++) {
|
|
||||||
|
|
||||||
if (int(abslen) > m_db->m_synthAbsLen)
|
|
||||||
break;
|
|
||||||
map<unsigned int, string>::const_iterator vit =
|
|
||||||
sparseDoc.find(ii);
|
|
||||||
if (vit != sparseDoc.end() && !vit->second.empty()) {
|
|
||||||
LOGDEB2(("Abstract: position %d -> [%s]\n",
|
|
||||||
ii, vit->second.c_str()));
|
|
||||||
mabs[ii] = vit->second;
|
|
||||||
abslen += vit->second.length();
|
|
||||||
} else {
|
|
||||||
LOGDEB2(("Abstract: empty position at %d\n", ii));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Possibly add a ... at the end of chunk if it's not
|
|
||||||
// overlapping
|
|
||||||
if (mabs.find(sto+1) == mabs.end())
|
|
||||||
mabs[sto+1] = "...";
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build the abstract by walking the map (in order of position)
|
|
||||||
string abstract;
|
|
||||||
for (map<unsigned int, string>::const_iterator it = mabs.begin();
|
|
||||||
it != mabs.end(); it++) {
|
|
||||||
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
|
||||||
abstract += it->second + " ";
|
|
||||||
}
|
|
||||||
LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
|
|
||||||
return abstract;
|
|
||||||
}
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -16,12 +16,14 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _DB_H_INCLUDED_
|
#ifndef _DB_H_INCLUDED_
|
||||||
#define _DB_H_INCLUDED_
|
#define _DB_H_INCLUDED_
|
||||||
/* @(#$Id: rcldb.h,v 1.40 2006-10-30 12:59:44 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: rcldb.h,v 1.41 2006-11-13 08:49:44 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "refcntr.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::list;
|
using std::list;
|
||||||
@ -103,7 +105,7 @@ class Doc {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class AdvSearchData;
|
class SearchData;
|
||||||
class Native;
|
class Native;
|
||||||
class TermIter;
|
class TermIter;
|
||||||
|
|
||||||
@ -155,7 +157,7 @@ class Db {
|
|||||||
/* Query-related functions */
|
/* Query-related functions */
|
||||||
|
|
||||||
// Parse query string and initialize query
|
// Parse query string and initialize query
|
||||||
bool setQuery(AdvSearchData &q, int opts = QO_NONE,
|
bool setQuery(RefCntr<SearchData> q, int opts = QO_NONE,
|
||||||
const string& stemlang = "english");
|
const string& stemlang = "english");
|
||||||
bool getQueryTerms(list<string>& terms);
|
bool getQueryTerms(list<string>& terms);
|
||||||
bool getMatchTerms(const Doc& doc, list<string>& terms);
|
bool getMatchTerms(const Doc& doc, list<string>& terms);
|
||||||
@ -213,6 +215,9 @@ class Db {
|
|||||||
/** Perform stem expansion across all dbs configured for searching */
|
/** Perform stem expansion across all dbs configured for searching */
|
||||||
list<string> stemExpand(const string& lang, const string& term);
|
list<string> stemExpand(const string& lang, const string& term);
|
||||||
|
|
||||||
|
/** Filename wildcard expansion */
|
||||||
|
bool filenameWildExp(const string& exp, list<string>& names);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
string m_filterTopDir; // Current query filter on subtree top directory
|
string m_filterTopDir; // Current query filter on subtree top directory
|
||||||
@ -248,6 +253,7 @@ private:
|
|||||||
vector<bool> updated;
|
vector<bool> updated;
|
||||||
|
|
||||||
bool reOpen(); // Close/open, same mode/opts
|
bool reOpen(); // Close/open, same mode/opts
|
||||||
|
|
||||||
/* Copy constructor and assignment are private and forbidden */
|
/* Copy constructor and assignment are private and forbidden */
|
||||||
Db(const Db &) {}
|
Db(const Db &) {}
|
||||||
Db & operator=(const Db &) {return *this;};
|
Db & operator=(const Db &) {return *this;};
|
||||||
|
|||||||
299
src/rcldb/searchdata.cpp
Normal file
299
src/rcldb/searchdata.cpp
Normal file
@ -0,0 +1,299 @@
|
|||||||
|
#ifndef lint
|
||||||
|
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.1 2006-11-13 08:49:44 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||||
|
#endif
|
||||||
|
/*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the
|
||||||
|
* Free Software Foundation, Inc.,
|
||||||
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Handle translation from rcl's SearchData structures to Xapian Queries
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <list>
|
||||||
|
#ifndef NO_NAMESPACES
|
||||||
|
using namespace std;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "xapian.h"
|
||||||
|
|
||||||
|
#include "rcldb.h"
|
||||||
|
#include "searchdata.h"
|
||||||
|
#include "debuglog.h"
|
||||||
|
#include "smallut.h"
|
||||||
|
#include "textsplit.h"
|
||||||
|
#include "unacpp.h"
|
||||||
|
#include "utf8iter.h"
|
||||||
|
|
||||||
|
namespace Rcl {
|
||||||
|
|
||||||
|
typedef list<SearchDataClause *>::iterator qlist_it_t;
|
||||||
|
|
||||||
|
// Translate the whole search description into a Xapian query.
//
// db       : database, passed down to the clauses (stem / wildcard expansion)
// d        : output, actually a Xapian::Query*
// stemlang : stemming language, passed down to the clauses
//
// Returns false, leaving *d untouched, if the output pointer is null, if a
// clause fails to translate, or if the query only holds exclusion clauses
// with nothing to exclude from. Returns true and stores the query otherwise.
bool SearchData::toNativeQuery(Rcl::Db &db, void *d, const string& stemlang)
{
    if (d == 0) {
        LOGERR(("SearchData::toNativeQuery: null output pointer\n"));
        return false;
    }

    // Regular clauses are combined with AND or OR depending on the list
    // type. Exclusion clauses (AND lists only) are collected apart and
    // applied at the end with AND_NOT: an exclusion appearing first in the
    // list must not become the positive part of the query.
    Xapian::Query positive;
    Xapian::Query negative;

    // Walk the clause list translating each in turn and building the
    // Xapian query tree
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
        Xapian::Query nq;
        if (!(*it)->toNativeQuery(db, &nq, stemlang)) {
            LOGERR(("SearchData::toNativeQuery: clause translation failed\n"));
            return false;
        }
        // A clause which resolved to nothing does not take part in the tree
        if (nq.empty())
            continue;

        if (m_tp == SCLT_AND && (*it)->m_tp == SCLT_EXCL) {
            negative = negative.empty() ? nq :
                Xapian::Query(Xapian::Query::OP_OR, negative, nq);
        } else {
            // In an OR list everything is OR'ed (addClause() refuses EXCL
            // clauses for OR lists).
            const Xapian::Query::op op = m_tp == SCLT_AND ?
                Xapian::Query::OP_AND : Xapian::Query::OP_OR;
            positive = positive.empty() ? nq :
                Xapian::Query(op, positive, nq);
        }
    }

    // Add the file type filtering clause if any
    if (!m_filetypes.empty()) {
        Xapian::Query tq;
        for (list<string>::iterator it = m_filetypes.begin();
             it != m_filetypes.end(); it++) {
            string term = "T" + *it;
            LOGDEB(("Adding file type term: [%s]\n", term.c_str()));
            tq = tq.empty() ? Xapian::Query(term) :
                Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
        }
        positive = positive.empty() ? tq :
            Xapian::Query(Xapian::Query::OP_FILTER, positive, tq);
    }

    // Apply the exclusions. They need something to be subtracted from.
    if (!negative.empty()) {
        if (positive.empty()) {
            LOGERR(("SearchData::toNativeQuery: exclusion without any "
                    "positive clause\n"));
            return false;
        }
        positive = Xapian::Query(Xapian::Query::OP_AND_NOT, positive, negative);
    }

    *static_cast<Xapian::Query *>(d) = positive;
    return true;
}
|
||||||
|
|
||||||
|
// Add clause to current list. OR lists cant have EXCL clauses.
|
||||||
|
bool SearchData::addClause(SearchDataClause* cl)
|
||||||
|
{
|
||||||
|
if (m_tp == SCLT_OR && (cl->m_tp == SCLT_EXCL)) {
|
||||||
|
LOGERR(("SearchData::addClause: cant add EXCL to OR list\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
m_query.push_back(cl);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make me all new
|
||||||
|
void SearchData::erase() {
|
||||||
|
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
|
||||||
|
delete *it;
|
||||||
|
m_query.clear();
|
||||||
|
m_filetypes.clear();
|
||||||
|
m_topdir.erase();
|
||||||
|
m_description.erase();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Am I a file name only search ? This is to turn off term highlighting
|
||||||
|
bool SearchData::fileNameOnly() {
|
||||||
|
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
|
||||||
|
if (!(*it)->isFileName())
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Splitter callback for breaking a user query string into simple
|
||||||
|
// terms and phrases
|
||||||
|
class wsQData : public TextSplitCB {
|
||||||
|
public:
|
||||||
|
vector<string> terms;
|
||||||
|
// Debug
|
||||||
|
string catterms() {
|
||||||
|
string s;
|
||||||
|
for (unsigned int i = 0; i < terms.size(); i++) {
|
||||||
|
s += "[" + terms[i] + "] ";
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
bool takeword(const std::string &term, int , int, int) {
|
||||||
|
LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
|
||||||
|
terms.push_back(term);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// Decapital + deaccent all terms
|
||||||
|
void dumball() {
|
||||||
|
for (vector<string>::iterator it=terms.begin(); it !=terms.end();it++){
|
||||||
|
string dumb;
|
||||||
|
dumb_string(*it, dumb);
|
||||||
|
*it = dumb;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// Turn string into list of xapian queries. There is little
|
||||||
|
// interpretation done on the string (no +term -term or filename:term
|
||||||
|
// stuff). We just separate words and phrases, and interpret
|
||||||
|
// capitalized terms as wanting no stem expansion.
|
||||||
|
// The final list contains one query for each term or phrase
|
||||||
|
// - Elements corresponding to a stem-expanded part are an OP_OR
|
||||||
|
// composition of the stem-expanded terms (or a single term query).
|
||||||
|
// - Elements corresponding to a phrase are an OP_PHRASE composition of the
|
||||||
|
// phrase terms (no stem expansion in this case)
|
||||||
|
static void stringToXapianQueries(const string &iq,
|
||||||
|
const string& stemlang,
|
||||||
|
Db& db,
|
||||||
|
list<Xapian::Query> &pqueries)
|
||||||
|
{
|
||||||
|
string qstring = iq;
|
||||||
|
bool opt_stemexp = !stemlang.empty();
|
||||||
|
|
||||||
|
// Split into (possibly single word) phrases ("this is a phrase"):
|
||||||
|
list<string> phrases;
|
||||||
|
stringToStrings(qstring, phrases);
|
||||||
|
|
||||||
|
// Then process each phrase: split into terms and transform into
|
||||||
|
// appropriate Xapian Query
|
||||||
|
|
||||||
|
for (list<string>::iterator it=phrases.begin(); it !=phrases.end(); it++) {
|
||||||
|
LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
|
||||||
|
|
||||||
|
// If there are both spans and single words in this element,
|
||||||
|
// we need to use a word split, else a phrase query including
|
||||||
|
// a span would fail if we didn't adjust the proximity to
|
||||||
|
// account for the additional span term which is complicated.
|
||||||
|
wsQData splitDataS, splitDataW;
|
||||||
|
TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS);
|
||||||
|
splitterS.text_to_words(*it);
|
||||||
|
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
|
||||||
|
splitterW.text_to_words(*it);
|
||||||
|
wsQData& splitData = splitDataS;
|
||||||
|
if (splitDataS.terms.size() > 1 && splitDataS.terms.size() !=
|
||||||
|
splitDataW.terms.size())
|
||||||
|
splitData = splitDataW;
|
||||||
|
|
||||||
|
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
|
||||||
|
splitData.terms.size()));
|
||||||
|
switch(splitData.terms.size()) {
|
||||||
|
case 0: continue;// ??
|
||||||
|
case 1: // Not a real phrase: one term
|
||||||
|
{
|
||||||
|
string term = splitData.terms.front();
|
||||||
|
bool nostemexp = false;
|
||||||
|
// Check if the first letter is a majuscule in which
|
||||||
|
// case we do not want to do stem expansion. Note that
|
||||||
|
// the test is convoluted and possibly problematic
|
||||||
|
if (term.length() > 0) {
|
||||||
|
string noacterm,noaclowterm;
|
||||||
|
if (unacmaybefold(term, noacterm, "UTF-8", false) &&
|
||||||
|
unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
||||||
|
Utf8Iter it1(noacterm);
|
||||||
|
Utf8Iter it2(noaclowterm);
|
||||||
|
if (*it1 != *it2)
|
||||||
|
nostemexp = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOGDEB1(("Term: %s stem expansion: %s\n",
|
||||||
|
term.c_str(), nostemexp?"no":"yes"));
|
||||||
|
|
||||||
|
list<string> exp;
|
||||||
|
string term1;
|
||||||
|
dumb_string(term, term1);
|
||||||
|
// Possibly perform stem compression/expansion
|
||||||
|
if (!nostemexp && opt_stemexp) {
|
||||||
|
exp = db.stemExpand(stemlang, term1);
|
||||||
|
} else {
|
||||||
|
exp.push_back(term1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Push either term or OR of stem-expanded set
|
||||||
|
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||||
|
exp.begin(), exp.end()));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
// Phrase: no stem expansion
|
||||||
|
splitData.dumball();
|
||||||
|
LOGDEB(("Pushing phrase: [%s]\n", splitData.catterms().c_str()));
|
||||||
|
pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
|
||||||
|
splitData.terms.begin(),
|
||||||
|
splitData.terms.end()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Translate a simple OR, AND, or EXCL search clause.
|
||||||
|
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||||
|
const string& stemlang)
|
||||||
|
{
|
||||||
|
Xapian::Query *qp = (Xapian::Query *)p;
|
||||||
|
*qp = Xapian::Query();
|
||||||
|
|
||||||
|
Xapian::Query::op op;
|
||||||
|
switch (m_tp) {
|
||||||
|
case SCLT_AND: op = Xapian::Query::OP_AND; break;
|
||||||
|
case SCLT_OR:
|
||||||
|
case SCLT_EXCL: op = Xapian::Query::OP_OR; break;
|
||||||
|
default:
|
||||||
|
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
list<Xapian::Query> pqueries;
|
||||||
|
stringToXapianQueries(m_text, stemlang, db, pqueries);
|
||||||
|
if (pqueries.empty()) {
|
||||||
|
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Translate a FILENAME search clause.
|
||||||
|
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
||||||
|
const string& stemlang)
|
||||||
|
{
|
||||||
|
Xapian::Query *qp = (Xapian::Query *)p;
|
||||||
|
*qp = Xapian::Query();
|
||||||
|
|
||||||
|
list<string> names;
|
||||||
|
db.filenameWildExp(m_text, names);
|
||||||
|
// Build a query out of the matching file name terms.
|
||||||
|
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Translate NEAR or PHRASE clause. We're not handling the distance parameter
|
||||||
|
// yet.
|
||||||
|
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
||||||
|
const string& stemlang)
|
||||||
|
{
|
||||||
|
Xapian::Query *qp = (Xapian::Query *)p;
|
||||||
|
*qp = Xapian::Query();
|
||||||
|
|
||||||
|
Xapian::Query::op op = m_tp == SCLT_PHRASE ? Xapian::Query::OP_PHRASE :
|
||||||
|
Xapian::Query::OP_NEAR;
|
||||||
|
|
||||||
|
list<Xapian::Query> pqueries;
|
||||||
|
Xapian::Query nq;
|
||||||
|
string s = string("\"") + m_text + string("\"");
|
||||||
|
|
||||||
|
// Use stringToXapianQueries anyway to lowercase and simplify the
|
||||||
|
// phrase terms etc. The result should be a single element list
|
||||||
|
stringToXapianQueries(s, stemlang, db, pqueries);
|
||||||
|
if (pqueries.empty()) {
|
||||||
|
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
*qp = *pqueries.begin();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // Namespace Rcl
|
||||||
@ -1,40 +1,112 @@
|
|||||||
#ifndef _SEARCHDATA_H_INCLUDED_
|
#ifndef _SEARCHDATA_H_INCLUDED_
|
||||||
#define _SEARCHDATA_H_INCLUDED_
|
#define _SEARCHDATA_H_INCLUDED_
|
||||||
/* @(#$Id: searchdata.h,v 1.2 2006-04-22 06:27:37 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: searchdata.h,v 1.3 2006-11-13 08:49:45 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <list>
|
||||||
|
|
||||||
|
#include "rcldb.h"
|
||||||
|
|
||||||
|
#ifndef NO_NAMESPACES
|
||||||
|
using std::list;
|
||||||
|
using std::string;
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
/** Search clause types */
enum SClType {
    SCLT_AND,
    SCLT_OR,
    SCLT_EXCL,
    SCLT_FILENAME,
    SCLT_PHRASE,
    SCLT_NEAR,
    SCLT_SUB
};

/**
 * Holder for query data (previous, flat, version of the search description)
 */
class AdvSearchData {
 public:
    std::string allwords;
    std::string phrase;
    std::string orwords;
    std::string orwords1; // Have two instances of orwords for and'ing them
    std::string nowords;
    std::string filename;
    std::list<std::string> filetypes; // restrict to types. Empty if inactive
    std::string topdir; // restrict to subtree. Empty if inactive
    std::string description; // Printable expanded version of the complete
                             // query returned after setQuery.

    /** Empty all fields */
    void erase() {
        description.erase();
        filename.erase();
        topdir.erase();
        filetypes.clear();
        nowords.erase();
        orwords1.erase();
        orwords.erase();
        phrase.erase();
        allwords.erase();
    }

    /** True if none of the word or phrase fields is set */
    bool fileNameOnly() {
        if (!allwords.empty() || !phrase.empty() || !orwords.empty())
            return false;
        return orwords1.empty() && nowords.empty();
    }
};
|
||||||
|
|
||||||
}
|
class SearchDataClause;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Holder for a list of search clauses. Some of the clauses can be comples
|
||||||
|
* subqueries.
|
||||||
|
*/
|
||||||
|
class SearchData {
|
||||||
|
public:
|
||||||
|
SClType m_tp; // Only SCLT_AND or SCLT_OR here
|
||||||
|
list<SearchDataClause *> m_query;
|
||||||
|
list<string> m_filetypes; // Restrict to filetypes if set.
|
||||||
|
string m_topdir; // Restrict to subtree.
|
||||||
|
// Printable expanded version of the complete query, obtained from Xapian
|
||||||
|
// valid after setQuery() call
|
||||||
|
string m_description;
|
||||||
|
|
||||||
|
SearchData(SClType tp) : m_tp(tp) {}
|
||||||
|
~SearchData() {erase();}
|
||||||
|
|
||||||
|
/** Make pristine */
|
||||||
|
void erase();
|
||||||
|
|
||||||
|
/** Is there anything but a file name search in here ? */
|
||||||
|
bool fileNameOnly();
|
||||||
|
|
||||||
|
/** Translate to Xapian query. rcldb knows about the void* */
|
||||||
|
bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||||
|
|
||||||
|
/** We become the owner of cl and will delete it */
|
||||||
|
bool addClause(SearchDataClause *cl);
|
||||||
|
|
||||||
|
private:
|
||||||
|
/* Copyconst and assignment private and forbidden */
|
||||||
|
SearchData(const SearchData &) {}
|
||||||
|
SearchData& operator=(const SearchData&) {return *this;};
|
||||||
|
};
|
||||||
|
|
||||||
|
class SearchDataClause {
|
||||||
|
public:
|
||||||
|
SClType m_tp;
|
||||||
|
|
||||||
|
SearchDataClause(SClType tp) : m_tp(tp) {}
|
||||||
|
virtual ~SearchDataClause() {}
|
||||||
|
virtual bool toNativeQuery(Rcl::Db &db, void *, const string&) = 0;
|
||||||
|
virtual bool isFileName() {return m_tp == SCLT_FILENAME ? true : false;}
|
||||||
|
};
|
||||||
|
|
||||||
|
class SearchDataClauseSimple : public SearchDataClause {
|
||||||
|
public:
|
||||||
|
SearchDataClauseSimple(SClType tp, string txt)
|
||||||
|
: SearchDataClause(tp), m_text(txt) {}
|
||||||
|
virtual ~SearchDataClauseSimple() {}
|
||||||
|
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||||
|
protected:
|
||||||
|
string m_text;
|
||||||
|
};
|
||||||
|
|
||||||
|
class SearchDataClauseFilename : public SearchDataClauseSimple {
|
||||||
|
public:
|
||||||
|
SearchDataClauseFilename(string txt)
|
||||||
|
: SearchDataClauseSimple(SCLT_FILENAME, m_text) {}
|
||||||
|
virtual ~SearchDataClauseFilename() {}
|
||||||
|
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||||
|
};
|
||||||
|
|
||||||
|
class SearchDataClauseDist : public SearchDataClauseSimple {
|
||||||
|
public:
|
||||||
|
SearchDataClauseDist(SClType tp, string txt, int dist)
|
||||||
|
: SearchDataClauseSimple(tp, txt), m_distance(dist) {}
|
||||||
|
virtual ~SearchDataClauseDist() {}
|
||||||
|
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
int m_distance;
|
||||||
|
};
|
||||||
|
|
||||||
|
class SearchDataClauseSub : public SearchDataClause {
|
||||||
|
public:
|
||||||
|
SearchDataClauseSub(SClType tp, SClType stp)
|
||||||
|
: SearchDataClause(tp), m_sub(stp) {}
|
||||||
|
virtual ~SearchDataClauseSub() {}
|
||||||
|
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
SearchData m_sub;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // Namespace Rcl
|
||||||
#endif /* _SEARCHDATA_H_INCLUDED_ */
|
#endif /* _SEARCHDATA_H_INCLUDED_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user