#ifndef lint static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.73 2006-04-27 06:12:10 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include #include #ifndef NO_NAMESPACES using namespace std; #endif /* NO_NAMESPACES */ #define RCLDB_INTERNAL #include "rcldb.h" #include "stemdb.h" #include "textsplit.h" #include "transcode.h" #include "unacpp.h" #include "conftree.h" #include "debuglog.h" #include "pathut.h" #include "smallut.h" #include "pathhash.h" #include "utf8iter.h" #include "searchdata.h" #include "xapian.h" #ifndef MAX #define MAX(A,B) (A>B?A:B) #endif #ifndef MIN #define MIN(A,B) (A m_extraDbs; // Indexing Xapian::WritableDatabase wdb; vector updated; // Querying Xapian::Database db; Xapian::Query query; // query descriptor: terms and subqueries // joined by operators (or/and etc...) Xapian::Enquire *enquire; // Open query descriptor. Xapian::MSet mset; // Partial result set string makeAbstract(Xapian::docid id, const list& terms); bool dbDataToRclDoc(std::string &data, Doc &doc, int qopts, Xapian::docid docid, const list& terms); Native() : m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0) { } ~Native() { delete enquire; } bool filterMatch(Db *rdb, Xapian::Document &xdoc) { // Parse xapian document's data and populate doc fields string data = xdoc.get_data(); ConfSimple parms(&data); // The only filtering for now is on file path (subtree) string url; parms.get(string("url"), url); url = url.substr(7); if (url.find(rdb->m_filterTopDir) == 0) return true; return false; } }; Db::Db() : m_qOpts(QO_NONE) { m_ndb = new Native; } Db::~Db() { LOGDEB1(("Db::~Db\n")); if (m_ndb == 0) return; LOGDEB(("Db::~Db: isopen %d m_iswritable %d\n", m_ndb->m_isopen, m_ndb->m_iswritable)); if (m_ndb->m_isopen == false) return; const char *ermsg = "Unknown error"; try { LOGDEB(("Db::~Db: closing native database\n")); if (m_ndb->m_iswritable == true) { m_ndb->wdb.flush(); } delete m_ndb; m_ndb = 0; return; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (const string &s) { ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } LOGERR(("Db::~Db: got exception: %s\n", ermsg)); } bool Db::open(const string& dir, OpenMode mode, int qops) { if (m_ndb == 0) return false; LOGDEB(("Db::open: m_isopen %d m_iswritable %d\n", m_ndb->m_isopen, m_ndb->m_iswritable)); if (m_ndb->m_isopen) { // We used to return an error here but I see no reason to if (!close()) return false; } const char *ermsg = "Unknown"; try { switch (mode) { case DbUpd: case DbTrunc: { int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN : Xapian::DB_CREATE_OR_OVERWRITE; m_ndb->wdb = Xapian::WritableDatabase(dir, action); LOGDEB(("Db::open: lastdocid: %d\n", m_ndb->wdb.get_lastdocid())); m_ndb->updated.resize(m_ndb->wdb.get_lastdocid() + 1); for (unsigned int i = 0; i < m_ndb->updated.size(); i++) m_ndb->updated[i] = false; m_ndb->m_iswritable = true; } break; case DbRO: default: m_ndb->m_iswritable = false; m_ndb->db = Xapian::Database(dir); for (list::iterator it = m_ndb->m_extraDbs.begin(); it != m_ndb->m_extraDbs.end(); it++) { string aerr; LOGDEB(("Db::Open: adding query db [%s]\n", it->c_str())); aerr.clear(); try { // Make this non-fatal m_ndb->db.add_database(Xapian::Database(*it)); } catch (const Xapian::Error &e) { aerr = e.get_msg().c_str(); } catch (const string &s) { aerr = s.c_str(); } catch (const char *s) { aerr = s; } catch (...) { aerr = "Caught unknown exception"; } if (!aerr.empty()) LOGERR(("Db::Open: error while trying to add database " "from [%s]: %s\n", it->c_str(), aerr.c_str())); } break; } m_qOpts = qops; m_ndb->m_mode = mode; m_ndb->m_isopen = true; m_ndb->m_basedir = dir; return true; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (const string &s) { ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } LOGERR(("Db::open: exception while opening [%s]: %s\n", dir.c_str(), ermsg)); return false; } string Db::getDbDir() { if (m_ndb == 0) return ""; return m_ndb->m_basedir; } // Note: xapian has no close call, we delete and recreate the db bool Db::close() { if (m_ndb == 0) return false; LOGDEB(("Db::close(): m_isopen %d m_iswritable %d\n", m_ndb->m_isopen, m_ndb->m_iswritable)); if (m_ndb->m_isopen == false) return true; const char *ermsg = "Unknown"; try { if (m_ndb->m_iswritable == true) { m_ndb->wdb.flush(); LOGDEB(("Rcl:Db: Called xapian flush\n")); } delete m_ndb; m_ndb = new Native; if (m_ndb) return true; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (const string &s) { ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } LOGERR(("Db:close: exception while deleting db: %s\n", ermsg)); return false; } bool Db::reOpen() { if (m_ndb && m_ndb->m_isopen) { if (!close()) return false; if (!open(m_ndb->m_basedir, m_ndb->m_mode, m_qOpts)) { return false; } } return true; } int Db::docCnt() { if (m_ndb && m_ndb->m_isopen) { return m_ndb->m_iswritable ? m_ndb->wdb.get_doccount() : m_ndb->db.get_doccount(); } return -1; } bool Db::addQueryDb(const string &dir) { LOGDEB(("Db::addQueryDb: ndb %p iswritable %d db [%s]\n", m_ndb, (m_ndb)?m_ndb->m_iswritable:0, dir.c_str())); if (!m_ndb) return false; if (m_ndb->m_iswritable) return false; if (find(m_ndb->m_extraDbs.begin(), m_ndb->m_extraDbs.end(), dir) == m_ndb->m_extraDbs.end()) { m_ndb->m_extraDbs.push_back(dir); } return reOpen(); } bool Db::rmQueryDb(const string &dir) { if (!m_ndb) return false; if (m_ndb->m_iswritable) return false; if (dir.empty()) { m_ndb->m_extraDbs.clear(); } else { list::iterator it = find(m_ndb->m_extraDbs.begin(), m_ndb->m_extraDbs.end(), dir); if (it != m_ndb->m_extraDbs.end()) { m_ndb->m_extraDbs.erase(it); } } return reOpen(); } bool Db::testDbDir(const string &dir) { string aerr; LOGDEB(("Db::testDbDir: [%s]\n", dir.c_str())); try { Xapian::Database db(dir); } catch (const Xapian::Error &e) { aerr = e.get_msg().c_str(); } catch (const string &s) { aerr = s.c_str(); } catch (const char *s) { aerr = s; } catch (...) { aerr = "Caught unknown exception"; } if (!aerr.empty()) { LOGERR(("Db::Open: error while trying to open database " "from [%s]: %s\n", dir.c_str(), aerr.c_str())); return false; } return true; } bool Db::isopen() { if (m_ndb == 0) return false; return m_ndb->m_isopen; } // A small class to hold state while splitting text class mySplitterCB : public TextSplitCB { public: Xapian::Document &doc; Xapian::termpos basepos; // Base for document section Xapian::termpos curpos; // Last position sent to callback mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0) {} bool takeword(const std::string &term, int pos, int, int); }; // Callback for the document to word splitting class during indexation bool mySplitterCB::takeword(const std::string &term, int pos, int, int) { #if 0 LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str())); string printable; if (transcode(term, printable, "UTF-8", "ISO-8859-1")) { LOGDEB((" [%s]\n", printable.c_str())); } #endif const char *ermsg; try { // Note: 1 is the within document frequency increment. It would // be possible to assign different weigths to doc parts (ie title) // by using a higher value curpos = pos; doc.add_posting(term, basepos + curpos, 1); return true; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (...) { ermsg= "Unknown error"; } LOGERR(("Db: xapian add_posting error %s\n", ermsg)); return false; } // Unaccent and lowercase data, replace \n\r with spaces // Removing crlfs is so that we can use the text in the document data fields. // Use unac (with folding extension) for removing accents and casefolding // // Note that we always return true (but set out to "" on error). We don't // want to stop indexation because of a bad string bool dumb_string(const string &in, string &out) { out.erase(); if (in.empty()) return true; string s1 = neutchars(in, "\n\r"); if (!unacmaybefold(s1, out, "UTF-8", true)) { LOGERR(("dumb_string: unac failed for %s\n", in.c_str())); out.erase(); // See comment at start of func return true; } return true; } // Add document in internal form to the database: index the terms in // the title abstract and body and add special terms for file name, // date, mime type ... , create the document data record (more // metadata), and update database bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp) { LOGDEB1(("Db::add: fn %s\n", fn.c_str())); if (m_ndb == 0) return false; Doc doc = idoc; // Truncate abstract, title and keywords to reasonable lengths. If // abstract is currently empty, we make up one with the beginning // of the document. bool syntabs = false; if (doc.abstract.empty()) { syntabs = true; doc.abstract = rclSyntAbs + truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE); } else { doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE); } doc.abstract = neutchars(doc.abstract, "\n\r"); doc.title = truncate_to_word(doc.title, 100); doc.keywords = truncate_to_word(doc.keywords, 300); Xapian::Document newdocument; mySplitterCB splitData(newdocument); TextSplit splitter(&splitData); // /////// Split and index terms in document body and auxiliary fields string noacc; // Split and index file name as document term(s) LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str())); if (dumb_string(doc.utf8fn, noacc)) { splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; } // Split and index title LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str())); if (!dumb_string(doc.title, noacc)) { LOGERR(("Db::add: dumb_string failed\n")); return false; } splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; // Split and index body LOGDEB2(("Db::add: split body\n")); if (!dumb_string(doc.text, noacc)) { LOGERR(("Db::add: dumb_string failed\n")); return false; } splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; // Split and index keywords LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str())); if (!dumb_string(doc.keywords, noacc)) { LOGERR(("Db::add: dumb_string failed\n")); return false; } splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; // Split and index abstract LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str())); if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) : doc.abstract, noacc)) { LOGERR(("Db::add: dumb_string failed\n")); return false; } splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; ////// Special terms for metadata // Mime type newdocument.add_term("T" + doc.mimetype); // Simple file name. This is used for file name searches only. We index // it with a term prefix. utf8fn used to be the full path, but it's now // the simple file name. if (dumb_string(doc.utf8fn, noacc) && !noacc.empty()) { noacc = string("XSFN") + noacc; newdocument.add_term(noacc); } // Pathname/ipath terms. This is used for file existence/uptodate // checks, and unique id for the replace_document() call // Truncate the filepath part to a reasonable length and // replace the truncated part with a hopefully unique hash string hash; pathHash(fn, hash, PATHHASHLEN); LOGDEB2(("Db::add: pathhash [%s]\n", hash.c_str())); // Unique term: makes unique identifier for documents // either path or path+ipath inside multidocument files. // We only add a path term if ipath is empty. Else there will be a qterm // (path+ipath), and a pseudo-doc will be created to stand for the file // itself (for up to date checks). This is handled by // DbIndexer::processone() string uniterm; if (doc.ipath.empty()) { uniterm = "P" + hash; } else { uniterm = "Q" + hash + "|" + doc.ipath; } newdocument.add_term(uniterm); // Dates etc... time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : doc.dmtime.c_str()); struct tm *tm = localtime(&mtime); char buf[9]; sprintf(buf, "%04d%02d%02d",tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday); newdocument.add_term("D" + string(buf)); // Date (YYYYMMDD) buf[7] = '\0'; if (buf[6] == '3') buf[6] = '2'; newdocument.add_term("W" + string(buf)); // "Weak" - 10ish day interval buf[6] = '\0'; newdocument.add_term("M" + string(buf)); // Month (YYYYMM) buf[4] = '\0'; newdocument.add_term("Y" + string(buf)); // Year (YYYY) // Document data record. omindex has the following nl separated fields: // - url // - sample // - caption (title limited to 100 chars) // - mime type string record = "url=file://" + fn; record += "\nmtype=" + doc.mimetype; record += "\nfmtime=" + doc.fmtime; if (!doc.dmtime.empty()) { record += "\ndmtime=" + doc.dmtime; } record += "\norigcharset=" + doc.origcharset; char sizebuf[20]; sizebuf[0] = 0; if (stp) sprintf(sizebuf, "%ld", (long)stp->st_size); if (sizebuf[0]) record += string("\nfbytes=") + sizebuf; sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); record += string("\ndbytes=") + sizebuf; if (!doc.ipath.empty()) { record += "\nipath=" + doc.ipath; } record += "\ncaption=" + doc.title; record += "\nkeywords=" + doc.keywords; record += "\nabstract=" + doc.abstract; record += "\n"; LOGDEB1(("Newdocument data: %s\n", record.c_str())); newdocument.set_data(record); const char *fnc = fn.c_str(); // Add db entry or update existing entry: try { Xapian::docid did = m_ndb->wdb.replace_document(uniterm, newdocument); if (did < m_ndb->updated.size()) { m_ndb->updated[did] = true; LOGDEB(("Db::add: docid %d updated [%s , %s]\n", did, fnc, doc.ipath.c_str())); } else { LOGDEB(("Db::add: docid %d added [%s , %s]\n", did, fnc, doc.ipath.c_str())); } } catch (...) { // FIXME: is this ever actually needed? try { m_ndb->wdb.add_document(newdocument); LOGDEB(("Db::add: %s added (failed re-seek for duplicate)\n", fnc)); } catch (...) { LOGERR(("Db::add: failed again after replace_document\n")); return false; } } return true; } // Test if given filename has changed since last indexed: bool Db::needUpdate(const string &filename, const struct stat *stp) { if (m_ndb == 0) return false; string hash; pathHash(filename, hash, PATHHASHLEN); string pterm = "P" + hash; const char *ermsg; string qterm = "Q"+ hash + "|"; // Look for all documents with this path. We need to look at all // to set their existence flag. We check the update time on the // fmtime field which will be identical for all docs inside a // multi-document file (we currently always reindex all if the // file changed) Xapian::PostingIterator doc; try { if (!m_ndb->wdb.term_exists(pterm)) { // If no document exist with this path, we do need update LOGDEB2(("Db::needUpdate: no such path: [%s]\n", pterm.c_str())); return true; } // Check the date using the Pterm doc or pseudo-doc Xapian::PostingIterator docid = m_ndb->wdb.postlist_begin(pterm); Xapian::Document doc = m_ndb->wdb.get_document(*docid); string data = doc.get_data(); const char *cp = strstr(data.c_str(), "fmtime="); if (cp) { cp += 7; } else { cp = strstr(data.c_str(), "mtime="); if (cp) cp+= 6; } long mtime = cp ? atol(cp) : 0; if (mtime < stp->st_mtime) { LOGDEB2(("Db::needUpdate: yes: mtime: Db %ld file %ld\n", (long)mtime, (long)stp->st_mtime)); // Db is not up to date. Let's index the file return true; } LOGDEB2(("Db::needUpdate: uptodate: [%s]\n", pterm.c_str())); // Up to date. // Set the uptodate flag for doc / pseudo doc m_ndb->updated[*docid] = true; // Set the existence flag for all the subdocs (if any) Xapian::TermIterator it = m_ndb->wdb.allterms_begin(); it.skip_to(qterm); LOGDEB2(("First qterm: [%s]\n", (*it).c_str())); for (;it != m_ndb->wdb.allterms_end(); it++) { // If current term does not begin with qterm or has another |, not // the same file if ((*it).find(qterm) != 0 || (*it).find_last_of("|") != qterm.length() -1) break; docid = m_ndb->wdb.postlist_begin(*it); if (*docid < m_ndb->updated.size()) { LOGDEB2(("Db::needUpdate: set exist flag for docid %d [%s]\n", *docid, (*it).c_str())); m_ndb->updated[*docid] = true; } } return false; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (...) { ermsg= "Unknown error"; } LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg)); return true; } // Return list of existing stem db languages list Db::getStemLangs() { LOGDEB(("Db::getStemLang\n")); list dirs; if (m_ndb == 0 || m_ndb->m_isopen == false) return dirs; dirs = StemDb::getLangs(m_ndb->m_basedir); return dirs; } /** * Delete stem db for given language */ bool Db::deleteStemDb(const string& lang) { LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str())); if (m_ndb == 0 || m_ndb->m_isopen == false) return false; return StemDb::deleteDb(m_ndb->m_basedir, lang); } /** * Create database of stem to parents associations for a given language. * We walk the list of all terms, stem them, and create another Xapian db * with documents indexed by a single term (the stem), and with the list of * parent terms in the document data. */ bool Db::createStemDb(const string& lang) { LOGDEB(("Db::createStemDb(%s)\n", lang.c_str())); if (m_ndb == 0 || m_ndb->m_isopen == false) return false; return StemDb:: createDb(m_ndb->m_iswritable ? m_ndb->wdb : m_ndb->db, m_ndb->m_basedir, lang); } /** * This is called at the end of an indexing session, to delete the * documents for files that are no longer there. */ bool Db::purge() { LOGDEB(("Db::purge\n")); if (m_ndb == 0) return false; LOGDEB(("Db::purge: m_isopen %d m_iswritable %d\n", m_ndb->m_isopen, m_ndb->m_iswritable)); if (m_ndb->m_isopen == false || m_ndb->m_iswritable == false) return false; // There seems to be problems with the document delete code, when // we do this, the database is not actually updated. Especially, // if we delete a bunch of docs, so that there is a hole in the // docids at the beginning, we can't add anything (appears to work // and does nothing). Maybe related to the exceptions below when // trying to delete an unexistant document ? // Flushing before trying the deletes seeems to work around the problem try { m_ndb->wdb.flush(); } catch (...) { LOGDEB(("Db::purge: 1st flush failed\n")); } for (Xapian::docid docid = 1; docid < m_ndb->updated.size(); ++docid) { if (!m_ndb->updated[docid]) { try { m_ndb->wdb.delete_document(docid); LOGDEB(("Db::purge: deleted document #%d\n", docid)); } catch (const Xapian::DocNotFoundError &) { LOGDEB(("Db::purge: document #%d not found\n", docid)); } } } try { m_ndb->wdb.flush(); } catch (...) { LOGDEB(("Db::purge: 2nd flush failed\n")); } return true; } // Splitter callback for breaking query into terms class wsQData : public TextSplitCB { public: vector terms; string catterms() { string s; for (unsigned int i=0;i::iterator it=terms.begin(); it !=terms.end();it++){ string dumb; dumb_string(*it, dumb); *it = dumb; } } }; // Turn string into list of xapian queries. There is little // interpretation done on the string (no +term -term or filename:term // stuff). We just separate words and phrases, and interpret // capitalized terms as wanting no stem expansion. // The final list contains one query for each term or phrase // - Elements corresponding to a stem-expanded part are an OP_OR // composition of the stem-expanded terms (or a single term query). // - Elements corresponding to a phrase are an OP_PHRASE composition of the // phrase terms (no stem expansion in this case) static void stringToXapianQueries(const string &iq, const string& stemlang, Native *m_ndb, list &pqueries, unsigned int opts = Db::QO_NONE) { string qstring = iq; // Split into (possibly single word) phrases ("this is a phrase"): list phrases; stringToStrings(qstring, phrases); // Then process each phrase: split into terms and transform into // appropriate Xapian Query for (list::iterator it=phrases.begin(); it !=phrases.end(); it++) { LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str())); wsQData splitData; TextSplit splitter(&splitData, true); splitter.text_to_words(*it); LOGDEB1(("strToXapianQ: splitter term count: %d\n", splitData.terms.size())); switch(splitData.terms.size()) { case 0: continue;// ?? case 1: // Not a real phrase: one term { string term = splitData.terms.front(); bool nostemexp = false; // Check if the first letter is a majuscule in which // case we do not want to do stem expansion. Note that // the test is convoluted and possibly problematic if (term.length() > 0) { string noacterm,noaclowterm; if (unacmaybefold(term, noacterm, "UTF-8", false) && unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) { Utf8Iter it1(noacterm); Utf8Iter it2(noaclowterm); if (*it1 != *it2) nostemexp = true; } } LOGDEB1(("Term: %s stem expansion: %s\n", term.c_str(), nostemexp?"no":"yes")); list exp; string term1; dumb_string(term, term1); // Possibly perform stem compression/expansion if (!nostemexp && (opts & Db::QO_STEM)) { exp = StemDb::stemExpand(m_ndb->m_basedir, stemlang,term1); } else { exp.push_back(term1); } // Push either term or OR of stem-expanded set pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, exp.begin(), exp.end())); } break; default: // Phrase: no stem expansion splitData.dumball(); LOGDEB(("Pushing phrase: [%s]\n", splitData.catterms().c_str())); pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE, splitData.terms.begin(), splitData.terms.end())); } } } // Prepare query out of "advanced search" data bool Db::setQuery(AdvSearchData &sdata, int opts, const string& stemlang) { LOGDEB(("Db::setQuery: adv:\n")); LOGDEB((" allwords: %s\n", sdata.allwords.c_str())); LOGDEB((" phrase: %s\n", sdata.phrase.c_str())); LOGDEB((" orwords: %s\n", sdata.orwords.c_str())); LOGDEB((" orwords1: %s\n", sdata.orwords1.c_str())); LOGDEB((" nowords: %s\n", sdata.nowords.c_str())); LOGDEB((" filename: %s\n", sdata.filename.c_str())); string ft; for (list::iterator it = sdata.filetypes.begin(); it != sdata.filetypes.end(); it++) {ft += *it + " ";} if (!ft.empty()) LOGDEB((" searched file types: %s\n", ft.c_str())); if (!sdata.topdir.empty()) LOGDEB((" restricted to: %s\n", sdata.topdir.c_str())); LOGDEB((" Options: 0x%x\n", opts)); m_filterTopDir = sdata.topdir; m_dbindices.clear(); if (!m_ndb) return false; list pqueries; Xapian::Query xq; m_qOpts = opts; if (!sdata.filename.empty()) { LOGDEB((" filename search\n")); // File name search, with possible wildcards. // We expand wildcards by scanning the filename terms (prefixed // with XSFN) from the database. // We build an OR query with the expanded values if any. string pattern; dumb_string(sdata.filename, pattern); // If pattern is not quoted, and has no wildcards, we add * at // each end: match any substring if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') { pattern = pattern.substr(1, pattern.size() -2); } else if (pattern.find_first_of("*?[") == string::npos) { pattern = "*" + pattern + "*"; } // else let it be LOGDEB((" pattern: [%s]\n", pattern.c_str())); // Match pattern against all file names in the db Xapian::TermIterator it = m_ndb->db.allterms_begin(); it.skip_to("XSFN"); list names; for (;it != m_ndb->db.allterms_end(); it++) { if ((*it).find("XSFN") != 0) break; string fn = (*it).substr(4); LOGDEB2(("Matching [%s] and [%s]\n", pattern.c_str(), fn.c_str())); if (fnmatch(pattern.c_str(), fn.c_str(), 0) != FNM_NOMATCH) { names.push_back((*it).c_str()); } // Limit the match count if (names.size() > 1000) { LOGERR(("Db::SetQuery: too many matched file names\n")); break; } } if (names.empty()) { // Build an impossible query: we know its impossible because we // control the prefixes! names.push_back("XIMPOSSIBLE"); } // Build a query out of the matching file name terms. xq = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end()); } if (!sdata.allwords.empty()) { stringToXapianQueries(sdata.allwords, stemlang, m_ndb, pqueries, m_qOpts); if (!pqueries.empty()) { Xapian::Query nq = Xapian::Query(Xapian::Query::OP_AND, pqueries.begin(), pqueries.end()); xq = xq.empty() ? nq : Xapian::Query(Xapian::Query::OP_AND, xq, nq); pqueries.clear(); } } if (!sdata.orwords.empty()) { stringToXapianQueries(sdata.orwords, stemlang, m_ndb, pqueries, m_qOpts); if (!pqueries.empty()) { Xapian::Query nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), pqueries.end()); xq = xq.empty() ? nq : Xapian::Query(Xapian::Query::OP_AND, xq, nq); pqueries.clear(); } } if (!sdata.orwords1.empty()) { stringToXapianQueries(sdata.orwords1, stemlang, m_ndb, pqueries, m_qOpts); if (!pqueries.empty()) { Xapian::Query nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), pqueries.end()); xq = xq.empty() ? nq : Xapian::Query(Xapian::Query::OP_AND, xq, nq); pqueries.clear(); } } if (!sdata.phrase.empty()) { Xapian::Query nq; string s = string("\"") + sdata.phrase + string("\""); stringToXapianQueries(s, stemlang, m_ndb, pqueries); if (!pqueries.empty()) { // There should be a single list element phrase query. xq = xq.empty() ? *pqueries.begin() : Xapian::Query(Xapian::Query::OP_AND, xq, *pqueries.begin()); pqueries.clear(); } } if (!sdata.filetypes.empty()) { Xapian::Query tq; for (list::iterator it = sdata.filetypes.begin(); it != sdata.filetypes.end(); it++) { string term = "T" + *it; LOGDEB(("Adding file type term: [%s]\n", term.c_str())); tq = tq.empty() ? Xapian::Query(term) : Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term)); } xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq); } // "And not" part. Must come last, as we have to check it's not // the only term in the query. We do no stem expansion on 'No' // words. Should we ? if (!sdata.nowords.empty()) { stringToXapianQueries(sdata.nowords, stemlang, m_ndb, pqueries); if (!pqueries.empty()) { Xapian::Query nq; nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), pqueries.end()); if (xq.empty()) { // Xapian cant do this currently. Have to have a positive // part! sdata.description = "Error: pure negative query\n"; LOGERR(("Rcl::Db::setQuery: error: pure negative query\n")); return false; } xq = Xapian::Query(Xapian::Query::OP_AND_NOT, xq, nq); pqueries.clear(); } } m_ndb->query = xq; delete m_ndb->enquire; m_ndb->enquire = new Xapian::Enquire(m_ndb->db); m_ndb->enquire->set_query(m_ndb->query); m_ndb->mset = Xapian::MSet(); // Get the query description and trim the "Xapian::Query" sdata.description = m_ndb->query.get_description(); if (sdata.description.find("Xapian::Query") == 0) sdata.description = sdata.description.substr(strlen("Xapian::Query")); LOGDEB(("Db::SetQuery: Q: %s\n", sdata.description.c_str())); return true; } list Db::completions(const string &root, const string &lang, int max) { Xapian::Database db; list res; if (!m_ndb || !m_ndb->m_isopen) return res; string droot; dumb_string(root, droot); db = m_ndb->m_iswritable ? m_ndb->wdb: m_ndb->db; Xapian::TermIterator it = db.allterms_begin(); it.skip_to(droot.c_str()); for (int n = 0;it != db.allterms_end(); it++) { if ((*it).find(droot) != 0) break; if (lang.empty()) { res.push_back(*it); ++n; } else { list stemexps = StemDb::stemExpand(m_ndb->m_basedir, lang, *it); unsigned int cnt = (int)stemexps.size() > max - n ? max - n : stemexps.size(); list::iterator sit = stemexps.begin(); while (cnt--) { res.push_back(*sit++); n++; } } if (n >= max) break; } res.sort(); res.unique(); return res; } bool Db::getQueryTerms(list& terms) { if (!m_ndb) return false; terms.clear(); Xapian::TermIterator it; try { for (it = m_ndb->query.get_terms_begin(); it != m_ndb->query.get_terms_end(); it++) { terms.push_back(*it); } } catch (...) { return false; } return true; } bool Db::getMatchTerms(const Doc& doc, list& terms) { if (!m_ndb || !m_ndb->enquire) { LOGERR(("Db::getMatchTerms: no query opened\n")); return -1; } terms.clear(); Xapian::TermIterator it; Xapian::docid id = Xapian::docid(doc.xdocid); try { for (it=m_ndb->enquire->get_matching_terms_begin(id); it != m_ndb->enquire->get_matching_terms_end(id); it++) { terms.push_back(*it); } } catch (...) { return false; } return true; } // Mset size static const int qquantum = 30; int Db::getResCnt() { if (!m_ndb || !m_ndb->enquire) { LOGERR(("Db::getResCnt: no query opened\n")); return -1; } if (m_ndb->mset.size() <= 0) { try { m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum); } catch (const Xapian::DatabaseModifiedError &error) { m_ndb->db.reopen(); m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum); } catch (const Xapian::Error & error) { LOGERR(("enquire->get_mset: exception: %s\n", error.get_msg().c_str())); return -1; } } return m_ndb->mset.get_matches_lower_bound(); } bool Native::dbDataToRclDoc(std::string &data, Doc &doc, int qopts, Xapian::docid docid, const list& terms) { LOGDEB1(("Db::dbDataToRclDoc: opts %x data: %s\n", qopts, data.c_str())); ConfSimple parms(&data); if (!parms.ok()) return false; parms.get(string("url"), doc.url); parms.get(string("mtype"), doc.mimetype); parms.get(string("fmtime"), doc.fmtime); parms.get(string("dmtime"), doc.dmtime); parms.get(string("origcharset"), doc.origcharset); parms.get(string("caption"), doc.title); parms.get(string("keywords"), doc.keywords); parms.get(string("abstract"), doc.abstract); bool syntabs = false; if (doc.abstract.find(rclSyntAbs) == 0) { doc.abstract = doc.abstract.substr(rclSyntAbs.length()); syntabs = true; } if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) { LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n")); if (doc.abstract.empty() || syntabs || (qopts & Db::QO_REPLACE_ABSTRACT)) doc.abstract = makeAbstract(docid, terms); } parms.get(string("ipath"), doc.ipath); parms.get(string("fbytes"), doc.fbytes); parms.get(string("dbytes"), doc.dbytes); doc.xdocid = docid; return true; } // Get document at rank i in query (i is the index in the whole result // set, as in the enquire class. We check if the current mset has the // doc, else ask for an other one. We use msets of 10 documents. Don't // know if the whole thing makes sense at all but it seems to work. // // If there is a postquery filter (ie: file names), we have to // maintain a correspondance from the sequential external index // sequence to the internal Xapian hole-y one (the holes being the documents // that dont match the filter). bool Db::getDoc(int exti, Doc &doc, int *percent) { LOGDEB1(("Db::getDoc: exti %d\n", exti)); if (!m_ndb || !m_ndb->enquire) { LOGERR(("Db::getDoc: no query opened\n")); return false; } // For now the only post-query filter is on dir subtree bool postqfilter = !m_filterTopDir.empty(); LOGDEB1(("Topdir %s postqflt %d\n", m_asdata.topdir.c_str(), postqfilter)); int xapi; if (postqfilter) { // There is a postquery filter, does this fall in already known area ? if (exti >= (int)m_dbindices.size()) { // Have to fetch xapian docs and filter until we get // enough or fail m_dbindices.reserve(exti+1); // First xapian doc we fetch is the one after last stored int first = m_dbindices.size() > 0 ? m_dbindices.back() + 1 : 0; // Loop until we get enough docs while (exti >= (int)m_dbindices.size()) { LOGDEB(("Db::getDoc: fetching %d starting at %d\n", qquantum, first)); try { m_ndb->mset = m_ndb->enquire->get_mset(first, qquantum); } catch (const Xapian::DatabaseModifiedError &error) { m_ndb->db.reopen(); m_ndb->mset = m_ndb->enquire->get_mset(first, qquantum); } catch (const Xapian::Error & error) { LOGERR(("enquire->get_mset: exception: %s\n", error.get_msg().c_str())); abort(); } if (m_ndb->mset.empty()) { LOGDEB(("Db::getDoc: got empty mset\n")); return false; } first = m_ndb->mset.get_firstitem(); for (unsigned int i = 0; i < m_ndb->mset.size() ; i++) { LOGDEB(("Db::getDoc: [%d]\n", i)); Xapian::Document xdoc = m_ndb->mset[i].get_document(); if (m_ndb->filterMatch(this, xdoc)) { m_dbindices.push_back(first + i); } } first = first + m_ndb->mset.size(); } } xapi = m_dbindices[exti]; } else { xapi = exti; } // From there on, we work with a xapian enquire item number. Fetch it int first = m_ndb->mset.get_firstitem(); int last = first + m_ndb->mset.size() -1; if (!(xapi >= first && xapi <= last)) { LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum)); try { m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum); } catch (const Xapian::DatabaseModifiedError &error) { m_ndb->db.reopen(); m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum); } catch (const Xapian::Error & error) { LOGERR(("enquire->get_mset: exception: %s\n", error.get_msg().c_str())); abort(); } if (m_ndb->mset.empty()) return false; first = m_ndb->mset.get_firstitem(); last = first + m_ndb->mset.size() -1; } LOGDEB1(("Db::getDoc: Qry [%s] win [%d-%d] Estimated results: %d", m_ndb->query.get_description().c_str(), first, last, m_ndb->mset.get_matches_lower_bound())); Xapian::Document xdoc = m_ndb->mset[xapi-first].get_document(); Xapian::docid docid = *(m_ndb->mset[xapi-first]); if (percent) *percent = m_ndb->mset.convert_to_percent(m_ndb->mset[xapi-first]); // Parse xapian document's data and populate doc fields string data = xdoc.get_data(); list terms; getQueryTerms(terms); return m_ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms); } // Retrieve document defined by file name and internal path. bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc) { LOGDEB(("Db:getDoc: [%s] (%d) [%s]\n", fn.c_str(), fn.length(), ipath.c_str())); if (m_ndb == 0) return false; // Initialize what we can in any case. If this is history, caller // will make partial display in case of error doc.ipath = ipath; doc.url = string("file://") + fn; if (*pc) *pc = 100; string hash; pathHash(fn, hash, PATHHASHLEN); string pqterm = ipath.empty() ? "P" + hash : "Q" + hash + "|" + ipath; const char *ermsg = ""; try { if (!m_ndb->db.term_exists(pqterm)) { // Document found in history no longer in the database. // We return true (because their might be other ok docs further) // but indicate the error with pc = -1 if (*pc) *pc = -1; LOGINFO(("Db:getDoc: no such doc in index: [%s] (len %d)\n", pqterm.c_str(), pqterm.length())); return true; } Xapian::PostingIterator docid = m_ndb->db.postlist_begin(pqterm); Xapian::Document xdoc = m_ndb->db.get_document(*docid); string data = xdoc.get_data(); list terms; return m_ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms); } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (const string &s) { ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } if (*ermsg) { LOGERR(("Db::getDoc: %s\n", ermsg)); } return false; } list Db::expand(const Doc &doc) { list res; if (!m_ndb || !m_ndb->enquire) { LOGERR(("Db::expand: no query opened\n")); return res; } Xapian::RSet rset; rset.add_document(Xapian::docid(doc.xdocid)); // We don't exclude the original query terms. Xapian::ESet eset = m_ndb->enquire->get_eset(20, rset, false); LOGDEB(("ESet terms:\n")); // We filter out the special terms for (Xapian::ESetIterator it = eset.begin(); it != eset.end(); it++) { LOGDEB((" [%s]\n", (*it).c_str())); if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z')) continue; res.push_back(*it); if (res.size() >= 10) break; } return res; } // Width of a sample extract around a query term // // We build a possibly full size but sparsely populated (only around // the search term) reconstruction of the document. It would be // possible to compress the array, by having only multiple chunks // around the terms, but this would seriously complicate the data // structure. string Native::makeAbstract(Xapian::docid docid, const list& terms) { Chrono chron; // A buffer that we populate with the document terms, at their position vector buf; // Go through the list of query terms. For each entry in each // position list, populate the slot in the document buffer, and // remember the position and its neigbours vector qtermposs; // The term positions set chunkposs; // All the positions we shall populate for (list::const_iterator qit = terms.begin(); qit != terms.end(); qit++) { Xapian::PositionIterator pos; // There may be query terms not in this doc. This raises an // exception when requesting the position list, we just catch it. try { unsigned int occurrences = 0; for (pos = db.positionlist_begin(docid, *qit); pos != db.positionlist_end(docid, *qit); pos++) { unsigned int ipos = *pos; LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos)); // Possibly extend the array. Do it in big chunks if (ipos + MA_EXTRACT_WIDTH >= buf.size()) { buf.resize(ipos + MA_EXTRACT_WIDTH + 1000); } buf[ipos] = *qit; // Remember the term position qtermposs.push_back(ipos); // Add adjacent slots to the set to populate at next step for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH); ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) { chunkposs.insert(ii); } // Limit the number of occurences we keep for each // term. The abstract has a finite length anyway ! if (occurrences++ > 10) break; } } catch (...) { } } LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n", chron.millis(), qtermposs.size())); // Walk the full document position list and populate slots around // the query terms. We arbitrarily truncate the list to avoid // taking forever. If we do cutoff, the abstract may be // inconsistant, which is bad... { Xapian::TermIterator term; int cutoff = 500 * 1000; for (term = db.termlist_begin(docid); term != db.termlist_end(docid); term++) { Xapian::PositionIterator pos; for (pos = db.positionlist_begin(docid, *term); pos != db.positionlist_end(docid, *term); pos++) { if (cutoff-- < 0) break; unsigned int ipos = *pos; if (chunkposs.find(ipos) != chunkposs.end()) { buf[ipos] = *term; } } if (cutoff-- < 0) break; } } LOGDEB1(("Abstract:%d: randomizing and extracting\n", chron.millis())); // We randomize the selection of term positions, from which we // shall pull, starting at the beginning, until the abstract is // big enough. The abstract is finally built in correct position // order, thanks to the position map. random_shuffle(qtermposs.begin(), qtermposs.end()); map mabs; unsigned int abslen = 0; LOGDEB1(("Abstract:%d: extracting\n", chron.millis())); // Extract data around the first (in random order) term positions, // and store the chunks in the map for (vector::const_iterator it = qtermposs.begin(); it != qtermposs.end(); it++) { unsigned int ipos = *it; unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH); unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); string chunk; for (unsigned int ii = start; ii <= end; ii++) { if (!buf[ii].empty()) { chunk += buf[ii] + " "; abslen += buf[ii].length(); } if (abslen > MA_ABSTRACT_SIZE) break; } if (end != buf.size()-1) chunk += "... "; mabs[ipos] = chunk; if (abslen > MA_ABSTRACT_SIZE) break; } // Build the abstract by walking the map (in order of position) string abstract; for (map::const_iterator it = mabs.begin(); it != mabs.end(); it++) { abstract += (*it).second; } LOGDEB(("Abtract: done in %d mS\n", chron.millis())); return abstract; } #ifndef NO_NAMESPACES } #endif