#ifndef lint static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.53 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include #ifndef NO_NAMESPACES using namespace std; #endif /* NO_NAMESPACES */ #include "rcldb.h" #include "textsplit.h" #include "transcode.h" #include "unacpp.h" #include "conftree.h" #include "debuglog.h" #include "pathut.h" #include "smallut.h" #include "pathhash.h" #include "utf8iter.h" #include "wipedir.h" #include "xapian.h" #include // Data for a xapian database. There could actually be 2 different // ones for indexing or query as there is not much in common. class Native { public: bool isopen; bool iswritable; string basedir; // Indexing Xapian::WritableDatabase wdb; vector updated; // Querying Xapian::Database db; Xapian::Query query; // query descriptor: terms and subqueries // joined by operators (or/and etc...) Xapian::Enquire *enquire; Xapian::MSet mset; Native() : isopen(false), iswritable(false), enquire(0) { } ~Native() { delete enquire; } }; Rcl::Db::Db() { pdata = new Native; } Rcl::Db::~Db() { LOGDEB1(("Rcl::Db::~Db\n")); if (pdata == 0) return; Native *ndb = (Native *)pdata; LOGDEB(("Db::~Db: isopen %d iswritable %d\n", ndb->isopen, ndb->iswritable)); if (ndb->isopen == false) return; const char *ermsg = "Unknown error"; try { LOGDEB(("Rcl::Db::~Db: closing native database\n")); if (ndb->iswritable == true) { ndb->wdb.flush(); } delete ndb; return; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (const string &s) { ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg)); } bool Rcl::Db::open(const string& dir, OpenMode mode) { if (pdata == 0) return false; Native *ndb = (Native *)pdata; LOGDEB(("Db::open: isopen %d iswritable %d\n", ndb->isopen, ndb->iswritable)); if (ndb->isopen) { LOGERR(("Rcl::Db::open: already open\n")); return false; } const char *ermsg = "Unknown"; try { switch (mode) { case DbUpd: case DbTrunc: { int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN : Xapian::DB_CREATE_OR_OVERWRITE; ndb->wdb = Xapian::WritableDatabase(dir, action); LOGDEB(("Rcl::Db::open: lastdocid: %d\n", ndb->wdb.get_lastdocid())); ndb->updated.resize(ndb->wdb.get_lastdocid() + 1); for (unsigned int i = 0; i < ndb->updated.size(); i++) ndb->updated[i] = false; ndb->iswritable = true; } break; case DbRO: default: ndb->iswritable = false; ndb->db = Xapian::Database(dir); break; } ndb->isopen = true; ndb->basedir = dir; return true; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (const string &s) { ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } LOGERR(("Rcl::Db::open: exception while opening [%s]: %s\n", dir.c_str(), ermsg)); return false; } // Note: xapian has no close call, we delete and recreate the db bool Rcl::Db::close() { if (pdata == 0) return false; Native *ndb = (Native *)pdata; LOGDEB(("Db::close(): isopen %d iswritable %d\n", ndb->isopen, ndb->iswritable)); if (ndb->isopen == false) return true; const char *ermsg = "Unknown"; try { if (ndb->iswritable == true) { ndb->wdb.flush(); LOGDEB(("Rcl:Db: Called xapian flush\n")); } delete ndb; pdata = new Native; if (pdata) return true; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (const string &s) { ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg)); return false; } bool Rcl::Db::isopen() { if (pdata == 0) return false; Native *ndb = (Native *)pdata; return ndb->isopen; } // A small class to hold state while splitting text class mySplitterCB : public TextSplitCB { public: Xapian::Document &doc; Xapian::termpos basepos; // Base for document section Xapian::termpos curpos; // Last position sent to callback mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0) {} bool takeword(const std::string &term, int pos, int, int); }; // Callback for the document to word splitting class during indexation bool mySplitterCB::takeword(const std::string &term, int pos, int, int) { #if 0 LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str())); string printable; if (transcode(term, printable, "UTF-8", "ISO-8859-1")) { LOGDEB((" [%s]\n", printable.c_str())); } #endif const char *ermsg; try { // Note: 1 is the within document frequency increment. It would // be possible to assign different weigths to doc parts (ie title) // by using a higher value curpos = pos; doc.add_posting(term, basepos + curpos, 1); return true; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (...) { ermsg= "Unknown error"; } LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg)); return false; } // Unaccent and lowercase data, replace \n\r with spaces // Removing crlfs is so that we can use the text in the document data fields. // Use unac (with folding extension) for removing accents and casefolding // // Note that we always return true (but set out to "" on error). We don't // want to stop indexation because of a bad string bool Rcl::dumb_string(const string &in, string &out) { out.erase(); if (in.empty()) return true; string s1; s1.reserve(in.length()); for (unsigned int i = 0; i < in.length(); i++) { if (in[i] == '\n' || in[i] == '\r') s1 += ' '; else s1 += in[i]; } if (!unacmaybefold(s1, out, "UTF-8", true)) { LOGERR(("dumb_string: unac failed for %s\n", in.c_str())); out.erase(); return true; } return true; } /* From omindex direct */ /* Truncate a string to a given maxlength, avoiding cutting off midword * if reasonably possible. */ string truncate_to_word(string & input, string::size_type maxlen) { string output; if (input.length() <= maxlen) { output = input; } else { output = input.substr(0, maxlen); const char *SEPAR = " \t\n\r-:.;,/[]{}"; string::size_type space = output.find_last_of(SEPAR); // Original version only truncated at space if space was found after // maxlen/2. But we HAVE to truncate at space, else we'd need to do // utf8 stuff to avoid truncating at multibyte char. In any case, // not finding space means that the text probably has no value. // Except probably for Asian languages, so we may want to fix this // one day if (space == string::npos) { output.erase(); } else { output.erase(space); } output += " ..."; } // No need to replace newlines with spaces, we do this in dumb_string() return output; } // Truncate longer path and uniquize with hash . The goal for this is // to avoid xapian max term length limitations, not to gain space (we // gain very little even with very short maxlens like 30) #define PATHHASHLEN 150 // Add document in internal form to the database: index the terms in // the title abstract and body and add special terms for file name, // date, mime type ... , create the document data record (more // metadata), and update database bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) { LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str())); if (pdata == 0) return false; Native *ndb = (Native *)pdata; // Truncate abstract, title and keywords to reasonable lengths Rcl::Doc doc = idoc; if (doc.abstract.empty()) doc.abstract = truncate_to_word(doc.text, 100); else doc.abstract = truncate_to_word(doc.abstract, 100); doc.title = truncate_to_word(doc.title, 100); doc.keywords = truncate_to_word(doc.keywords, 300); Xapian::Document newdocument; mySplitterCB splitData(newdocument); TextSplit splitter(&splitData); // /////// Split and index terms in document body and auxiliary fields string noacc; // Split and index file name. This supposes that it's either ascii // or utf-8. If this fails, we just go on. We need a config // parameter for file name charset. // Do we really want to fold case here ? if (dumb_string(fn, noacc)) { splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; } // Split and index title if (!dumb_string(doc.title, noacc)) { LOGERR(("Rcl::Db::add: dumb_string failed\n")); return false; } splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; // Split and index body if (!dumb_string(doc.text, noacc)) { LOGERR(("Rcl::Db::add: dumb_string failed\n")); return false; } splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; // Split and index keywords if (!dumb_string(doc.keywords, noacc)) { LOGERR(("Rcl::Db::add: dumb_string failed\n")); return false; } splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; // Split and index abstract if (!dumb_string(doc.abstract, noacc)) { LOGERR(("Rcl::Db::add: dumb_string failed\n")); return false; } splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; ////// Special terms for metadata // Mime type newdocument.add_term("T" + doc.mimetype); // Path name string hash; pathHash(fn, hash, PATHHASHLEN); LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str())); string pathterm = "P" + hash; newdocument.add_term(pathterm); // Internal path: with path, makes unique identifier for documents // inside multidocument files. string uniterm; if (!doc.ipath.empty()) { uniterm = "Q" + hash + "|" + doc.ipath; newdocument.add_term(uniterm); } // Dates etc... time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : doc.dmtime.c_str()); struct tm *tm = localtime(&mtime); char buf[9]; sprintf(buf, "%04d%02d%02d",tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday); newdocument.add_term("D" + string(buf)); // Date (YYYYMMDD) buf[7] = '\0'; if (buf[6] == '3') buf[6] = '2'; newdocument.add_term("W" + string(buf)); // "Weak" - 10ish day interval buf[6] = '\0'; newdocument.add_term("M" + string(buf)); // Month (YYYYMM) buf[4] = '\0'; newdocument.add_term("Y" + string(buf)); // Year (YYYY) // Document data record. omindex has the following nl separated fields: // - url // - sample // - caption (title limited to 100 chars) // - mime type string record = "url=file://" + fn; record += "\nmtype=" + doc.mimetype; record += "\nfmtime=" + doc.fmtime; if (!doc.dmtime.empty()) { record += "\ndmtime=" + doc.dmtime; } record += "\norigcharset=" + doc.origcharset; record += "\ncaption=" + doc.title; record += "\nkeywords=" + doc.keywords; record += "\nabstract=" + doc.abstract; if (!doc.ipath.empty()) { record += "\nipath=" + doc.ipath; } record += "\n"; LOGDEB1(("Newdocument data: %s\n", record.c_str())); newdocument.set_data(record); const char *fnc = fn.c_str(); // Add db entry or update existing entry: try { Xapian::docid did = ndb->wdb.replace_document(uniterm.empty() ? pathterm : uniterm, newdocument); if (did < ndb->updated.size()) { ndb->updated[did] = true; LOGDEB(("Rcl::Db::add: docid %d updated [%s , %s]\n", did, fnc, doc.ipath.c_str())); } else { LOGDEB(("Rcl::Db::add: docid %d added [%s , %s]\n", did, fnc, doc.ipath.c_str())); } } catch (...) { // FIXME: is this ever actually needed? try { ndb->wdb.add_document(newdocument); LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n", fnc)); } catch (...) { LOGERR(("Rcl::Db::add: failed again after replace_document\n")); return false; } } return true; } // Test if given filename has changed since last indexed: bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) { if (pdata == 0) return false; Native *ndb = (Native *)pdata; // If no document exist with this path, we do need update string hash; pathHash(filename, hash, PATHHASHLEN); string pathterm = "P" + hash; const char *ermsg; // Look for all documents with this path. We need to look at all // to set their existence flag. We check the update time on the // fmtime field which will be identical for all docs inside a // multi-document file (we currently always reindex all if the // file changed) Xapian::PostingIterator doc; try { if (!ndb->wdb.term_exists(pathterm)) { LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str())); return true; } Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm); for (Xapian::PostingIterator docid = docid0; docid != ndb->wdb.postlist_end(pathterm); docid++) { Xapian::Document doc = ndb->wdb.get_document(*docid); // Check the date once. no need to look at the others if // the db needs updating. Note that the fmtime used to be // called mtime, and we're keeping compat if (docid == docid0) { string data = doc.get_data(); const char *cp = strstr(data.c_str(), "fmtime="); if (cp) { cp += 7; } else { cp = strstr(data.c_str(), "mtime="); if (cp) cp+= 6; } long mtime = cp ? atol(cp) : 0; if (mtime < stp->st_mtime) { LOGDEB2(("Db::needUpdate: yes: mtime: Db %ld file %ld\n", (long)mtime, (long)stp->st_mtime)); // Db is not up to date. Let's index the file return true; } } // Db is up to date. Make a note that this document exists. if (*docid < ndb->updated.size()) ndb->updated[*docid] = true; } return false; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (...) { ermsg= "Unknown error"; } LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg)); return true; } const static string stemdirstem = "stem_"; /// Compute name of stem db for given base database and language static string stemdbname(const string& basename, string lang) { string nm = path_cat(basename, stemdirstem + lang); return nm; } // Deciding if we try to stem the term. If it has numerals or capitals // we don't inline static bool p_notlowerorutf(unsigned int c) { if (c < 'a' || (c > 'z' && c < 128)) return true; return false; } /** * Delete stem db for given language */ bool Rcl::Db::deleteStemDb(const string& lang) { LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str())); if (pdata == 0) return false; Native *ndb = (Native *)pdata; if (ndb->isopen == false) return false; string dir = stemdbname(ndb->basedir, lang); if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0) return true; return false; } /** * Create database of stem to parents associations for a given language. * We walk the list of all terms, stem them, and create another Xapian db * with documents indexed by a single term (the stem), and with the list of * parent terms in the document data. */ bool Rcl::Db::createStemDb(const string& lang) { LOGDEB(("Rcl::Db::createStemDb(%s)\n", lang.c_str())); if (pdata == 0) return false; Native *ndb = (Native *)pdata; if (ndb->isopen == false) return false; // First build the in-memory stem database: // We walk the list of all terms, and stem each. // If the stem is identical to the term, no need to create an entry // Else, we add an entry to the multimap. // At the end, we only save stem-terms associations with several terms, the // others are not useful multimap assocs; // Statistics int nostem=0; // Dont even try: not-alphanum (incomplete for now) int stemconst=0; // Stem == term int stemdiff=0; // Count of all different stems int stemmultiple = 0; // Count of stems with multiple derivatives try { Xapian::Stem stemmer(lang); Xapian::TermIterator it; for (it = ndb->wdb.allterms_begin(); it != ndb->wdb.allterms_end(); it++) { // If it has any non-lowercase 7bit char, cant be stemmable string::iterator sit = (*it).begin(), eit = sit + (*it).length(); if ((sit = find_if(sit, eit, p_notlowerorutf)) != eit) { ++nostem; // LOGDEB(("stemskipped: [%s], because of 0x%x\n", // (*it).c_str(), *sit)); continue; } string stem = stemmer.stem_word(*it); //cerr << "word " << *it << " stem " << stem << endl; if (stem == *it) { ++stemconst; continue; } assocs.insert(pair(stem, *it)); } } catch (const Xapian::Error &e) { LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str())); return false; } catch (...) { LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n", lang.c_str())); return false; } class DirWiper { public: string dir; bool do_it; DirWiper(string d) : dir(d), do_it(true) {} ~DirWiper() { if (do_it) { wipedir(dir); rmdir(dir.c_str()); } } }; // Create xapian database for stem relations string stemdbdir = stemdbname(ndb->basedir, lang); // We want to get rid of the db dir in case of error. This gets disarmed // just before success return. DirWiper wiper(stemdbdir); const char *ermsg = "NOERROR"; Xapian::WritableDatabase sdb; try { sdb = Xapian::WritableDatabase(stemdbdir, Xapian::DB_CREATE_OR_OVERWRITE); } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (const string &s) { ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } if (ermsg != "NOERROR") { LOGERR(("Rcl::Db::createstemdb: exception while opening [%s]: %s\n", stemdbdir.c_str(), ermsg)); return false; } // Enter pseud-docs in db. Walk the multimap, only enter // associations where there are several parent terms string stem; list derivs; for (multimap::const_iterator it = assocs.begin(); it != assocs.end(); it++) { if (stem == it->first) { // Staying with same stem derivs.push_back(it->second); // cerr << " " << it->second << endl; } else { // Changing stems ++stemdiff; if (derivs.size() > 1) { // Previous stem has multiple derivatives. Enter in db ++stemmultiple; Xapian::Document newdocument; newdocument.add_term(stem); // The doc data is just parents=blank-separated-list string record = "parents="; for (list::const_iterator it = derivs.begin(); it != derivs.end(); it++) { record += *it + " "; } record += "\n"; LOGDEB1(("stemdocument data: %s\n", record.c_str())); newdocument.set_data(record); try { sdb.replace_document(stem, newdocument); } catch (...) { LOGERR(("Rcl::Db::createstemdb: replace failed\n")); return false; } } derivs.clear(); stem = it->first; derivs.push_back(it->second); // cerr << "\n" << stem << " " << it->second; } } LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", assocs.size(), stemdiff, stemmultiple, nostem, stemconst)); wiper.do_it = false; return true; } list Rcl::Db::getStemLangs() { list dirs; LOGDEB(("Rcl::Db::getStemLang\n")); if (pdata == 0) return dirs; Native *ndb = (Native *)pdata; string pattern = stemdirstem + "*"; dirs = path_dirglob(ndb->basedir, pattern); for (list::iterator it = dirs.begin(); it != dirs.end(); it++) { *it = path_basename(*it); *it = it->substr(stemdirstem.length(), string::npos); } return dirs; } /** * This is called at the end of an indexing session, to delete the * documents for files that are no longer there. We also build the * stem database while we are at it. */ bool Rcl::Db::purge() { LOGDEB(("Rcl::Db::purge\n")); if (pdata == 0) return false; Native *ndb = (Native *)pdata; LOGDEB(("Rcl::Db::purge: isopen %d iswritable %d\n", ndb->isopen, ndb->iswritable)); if (ndb->isopen == false || ndb->iswritable == false) return false; // There seems to be problems with the document delete code, when // we do this, the database is not actually updated. Especially, // if we delete a bunch of docs, so that there is a hole in the // docids at the beginning, we can't add anything (appears to work // and does nothing). Maybe related to the exceptions below when // trying to delete an unexistant document ? // Flushing before trying the deletes seeems to work around the problem try { ndb->wdb.flush(); } catch (...) { LOGDEB(("Rcl::Db::purge: 1st flush failed\n")); } for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) { if (!ndb->updated[docid]) { try { ndb->wdb.delete_document(docid); LOGDEB(("Rcl::Db::purge: deleted document #%d\n", docid)); } catch (const Xapian::DocNotFoundError &) { LOGDEB(("Rcl::Db::purge: document #%d not found\n", docid)); } } } try { ndb->wdb.flush(); } catch (...) { LOGDEB(("Rcl::Db::purge: 2nd flush failed\n")); } return true; } /** * Expand term to list of all terms which stem to the same term. */ static list stemexpand(Native *ndb, string term, const string& lang) { list explist; try { Xapian::Stem stemmer(lang); string stem = stemmer.stem_word(term); LOGDEB(("stemexpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str())); // Try to fetch the doc from the stem db string stemdbdir = stemdbname(ndb->basedir, lang); Xapian::Database sdb(stemdbdir); LOGDEB1(("stemexpand: %s lastdocid: %d\n", stemdbdir.c_str(), sdb.get_lastdocid())); if (!sdb.term_exists(stem)) { LOGDEB1(("Rcl::Db::stemexpand: no term for %s\n", stem.c_str())); explist.push_back(term); return explist; } Xapian::PostingIterator did = sdb.postlist_begin(stem); if (did == sdb.postlist_end(stem)) { LOGDEB1(("stemexpand: no term(1) for %s\n",stem.c_str())); explist.push_back(term); return explist; } Xapian::Document doc = sdb.get_document(*did); string data = doc.get_data(); // No need for a conftree, but we need to massage the data a little string::size_type pos = data.find_first_of("="); ++pos; string::size_type pos1 = data.find_last_of("\n"); if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ?? explist.push_back(term); return explist; } stringToStrings(data.substr(pos, pos1-pos), explist); if (find(explist.begin(), explist.end(), term) == explist.end()) { explist.push_back(term); } LOGDEB(("stemexpand: %s -> %s\n", stem.c_str(), stringlistdisp(explist).c_str())); } catch (...) { LOGERR(("stemexpand: error accessing stem db\n")); explist.push_back(term); return explist; } return explist; } class wsQData : public TextSplitCB { public: vector terms; string catterms() { string s; for (unsigned int i=0;i::iterator it=terms.begin(); it !=terms.end();it++){ string dumb; Rcl::dumb_string(*it, dumb); *it = dumb; } } }; // Turn string into list of xapian queries. There is little // interpretation done on the string (no +term -term or filename:term // stuff). We just separate words and phrases, and interpret // capitalized terms as wanting no stem expansion. // The final list contains one query for each term or phrase // - Elements corresponding to a stem-expanded part are an OP_OR // composition of the stem-expanded terms (or a single term query). // - Elements corresponding to a phrase are an OP_PHRASE composition of the // phrase terms (no stem expansion in this case) static void stringToXapianQueries(const string &iq, const string& stemlang, Native *ndb, list &pqueries, Rcl::Db::QueryOpts opts = Rcl::Db::QO_NONE) { string qstring = iq; // Split into (possibly single word) phrases ("this is a phrase"): list phrases; stringToStrings(qstring, phrases); // Then process each phrase: split into terms and transform into // appropriate Xapian Query for (list::iterator it=phrases.begin(); it !=phrases.end(); it++) { LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str())); wsQData splitData; TextSplit splitter(&splitData, true); splitter.text_to_words(*it); LOGDEB1(("strToXapianQ: splitter term count: %d\n", splitData.terms.size())); switch(splitData.terms.size()) { case 0: continue;// ?? case 1: // Not a real phrase: one term { string term = splitData.terms.front(); bool nostemexp = false; // Check if the first letter is a majuscule in which // case we do not want to do stem expansion. Note that // the test is convoluted and possibly problematic if (term.length() > 0) { string noacterm,noaclowterm; if (unacmaybefold(term, noacterm, "UTF-8", false) && unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) { Utf8Iter it1(noacterm); Utf8Iter it2(noaclowterm); if (*it1 != *it2) nostemexp = true; } } LOGDEB1(("Term: %s stem expansion: %s\n", term.c_str(), nostemexp?"no":"yes")); list exp; string term1; Rcl::dumb_string(term, term1); // Possibly perform stem compression/expansion if (!nostemexp && (opts & Rcl::Db::QO_STEM)) { exp = stemexpand(ndb, term1, stemlang); } else { exp.push_back(term1); } // Push either term or OR of stem-expanded set pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, exp.begin(), exp.end())); } break; default: // Phrase: no stem expansion splitData.dumball(); LOGDEB(("Pushing phrase: [%s]\n", splitData.catterms().c_str())); pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE, splitData.terms.begin(), splitData.terms.end())); } } } // Prepare query out of simple query string bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts, const string& stemlang) { LOGDEB(("Rcl::Db::setQuery: q: [%s], opts 0x%x, stemlang %s\n", iqstring.c_str(), (unsigned int)opts, stemlang.c_str())); Native *ndb = (Native *)pdata; if (!ndb) return false; asdata.erase(); dbindices.clear(); list pqueries; stringToXapianQueries(iqstring, stemlang, ndb, pqueries, opts); ndb->query = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), pqueries.end()); delete ndb->enquire; ndb->enquire = new Xapian::Enquire(ndb->db); ndb->enquire->set_query(ndb->query); ndb->mset = Xapian::MSet(); return true; } // Prepare query out of "advanced search" data bool Rcl::Db::setQuery(AdvSearchData &sdata, QueryOpts opts, const string& stemlang) { LOGDEB(("Rcl::Db::setQuery: adv:\n")); LOGDEB((" allwords: %s\n", sdata.allwords.c_str())); LOGDEB((" phrase: %s\n", sdata.phrase.c_str())); LOGDEB((" orwords: %s\n", sdata.orwords.c_str())); LOGDEB((" nowords: %s\n", sdata.nowords.c_str())); string ft; for (list::iterator it = sdata.filetypes.begin(); it != sdata.filetypes.end(); it++) {ft += *it + " ";} if (!ft.empty()) LOGDEB((" searched file types: %s\n", ft.c_str())); if (!sdata.topdir.empty()) LOGDEB((" restricted to: %s\n", sdata.topdir.c_str())); asdata = sdata; dbindices.clear(); Native *ndb = (Native *)pdata; if (!ndb) return false; list pqueries; Xapian::Query xq; if (!sdata.allwords.empty()) { stringToXapianQueries(sdata.allwords, stemlang, ndb, pqueries, opts); if (!pqueries.empty()) { xq = Xapian::Query(Xapian::Query::OP_AND, pqueries.begin(), pqueries.end()); pqueries.clear(); } } if (!sdata.orwords.empty()) { stringToXapianQueries(sdata.orwords, stemlang, ndb, pqueries, opts); if (!pqueries.empty()) { Xapian::Query nq; nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), pqueries.end()); xq = xq.empty() ? nq : Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, nq); pqueries.clear(); } } // We do no stem expansion on 'No' words. Should we ? if (!sdata.nowords.empty()) { stringToXapianQueries(sdata.nowords, stemlang, ndb, pqueries); if (!pqueries.empty()) { Xapian::Query nq; nq = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), pqueries.end()); xq = xq.empty() ? nq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, nq); pqueries.clear(); } } if (!sdata.phrase.empty()) { Xapian::Query nq; string s = string("\"") + sdata.phrase + string("\""); stringToXapianQueries(s, stemlang, ndb, pqueries); if (!pqueries.empty()) { // There should be a single list element phrase query. xq = xq.empty() ? *pqueries.begin() : Xapian::Query(Xapian::Query::OP_AND, xq, *pqueries.begin()); pqueries.clear(); } } if (!sdata.filetypes.empty()) { Xapian::Query tq; for (list::iterator it = sdata.filetypes.begin(); it != sdata.filetypes.end(); it++) { string term = "T" + *it; LOGDEB(("Adding file type term: [%s]\n", term.c_str())); tq = tq.empty() ? Xapian::Query(term) : Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term)); } xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq); } ndb->query = xq; delete ndb->enquire; ndb->enquire = new Xapian::Enquire(ndb->db); ndb->enquire->set_query(ndb->query); ndb->mset = Xapian::MSet(); // Get the query description and trim the "Xapian::Query" sdata.description = ndb->query.get_description(); if (sdata.description.find("Xapian::Query") == 0) sdata.description = sdata.description.substr(strlen("Xapian::Query")); LOGDEB(("Rcl::Db::SetQuery: Q: %s\n", sdata.description.c_str())); return true; } bool Rcl::Db::getQueryTerms(list& terms) { Native *ndb = (Native *)pdata; if (!ndb) return false; terms.clear(); Xapian::TermIterator it; for (it = ndb->query.get_terms_begin(); it != ndb->query.get_terms_end(); it++) { terms.push_back(*it); } return true; } int Rcl::Db::getResCnt() { Native *ndb = (Native *)pdata; if (!ndb || !ndb->enquire) { LOGERR(("Rcl::Db::getResCnt: no query opened\n")); return -1; } if (ndb->mset.size() <= 0) return -1; return ndb->mset.get_matches_lower_bound(); } // This class (friend to RclDb) exists so that we can have functions that // access private RclDb data and have Xapian-specific parameters (so that we // don't want them to appear in the public rcldb.h). class Rcl::DbPops { public: static bool filterMatch(Rcl::Db *rdb, Xapian::Document &xdoc) { // Parse xapian document's data and populate doc fields string data = xdoc.get_data(); ConfSimple parms(&data); // The only filtering for now is on file path (subtree) string url; parms.get(string("url"), url); url = url.substr(7); if (url.find(rdb->asdata.topdir) == 0) return true; return false; } }; bool Rcl::Db::dbDataToRclDoc(std::string &data, Doc &doc) { LOGDEB1(("Rcl::Db::dbDataToRclDoc: data: %s\n", data.c_str())); ConfSimple parms(&data); if (!parms.ok()) return false; parms.get(string("url"), doc.url); parms.get(string("mtype"), doc.mimetype); parms.get(string("fmtime"), doc.fmtime); parms.get(string("dmtime"), doc.dmtime); parms.get(string("origcharset"), doc.origcharset); parms.get(string("caption"), doc.title); parms.get(string("keywords"), doc.keywords); parms.get(string("abstract"), doc.abstract); parms.get(string("ipath"), doc.ipath); return true; } // Get document at rank i in query (i is the index in the whole result // set, as in the enquire class. We check if the current mset has the // doc, else ask for an other one. We use msets of 10 documents. Don't // know if the whole thing makes sense at all but it seems to work. // // If there is a postquery filter (ie: file names), we have to // maintain a correspondance from the sequential external index // sequence to the internal Xapian hole-y one (the holes being the documents // that dont match the filter). bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent) { const int qquantum = 30; LOGDEB1(("Rcl::Db::getDoc: exti %d\n", exti)); Native *ndb = (Native *)pdata; if (!ndb || !ndb->enquire) { LOGERR(("Rcl::Db::getDoc: no query opened\n")); return false; } // For now the only post-query filter is on dir subtree bool postqfilter = !asdata.topdir.empty(); LOGDEB1(("Topdir %s postqflt %d\n", asdata.topdir.c_str(), postqfilter)); int xapi; if (postqfilter) { // There is a postquery filter, does this fall in already known area ? if (exti >= (int)dbindices.size()) { // Have to fetch xapian docs and filter until we get // enough or fail dbindices.reserve(exti+1); // First xapian doc we fetch is the one after last stored int first = dbindices.size() > 0 ? dbindices.back() + 1 : 0; // Loop until we get enough docs while (exti >= (int)dbindices.size()) { LOGDEB(("Rcl::Db::getDoc: fetching %d starting at %d\n", qquantum, first)); try { ndb->mset = ndb->enquire->get_mset(first, qquantum); } catch (const Xapian::DatabaseModifiedError &error) { ndb->db.reopen(); ndb->mset = ndb->enquire->get_mset(first, qquantum); } catch (const Xapian::Error & error) { LOGERR(("enquire->get_mset: exception: %s\n", error.get_msg().c_str())); abort(); } if (ndb->mset.empty()) { LOGDEB(("Rcl::Db::getDoc: got empty mset\n")); return false; } first = ndb->mset.get_firstitem(); for (unsigned int i = 0; i < ndb->mset.size() ; i++) { LOGDEB(("Rcl::Db::getDoc: [%d]\n", i)); Xapian::Document xdoc = ndb->mset[i].get_document(); if (Rcl::DbPops::filterMatch(this, xdoc)) { dbindices.push_back(first + i); } } first = first + ndb->mset.size(); } } xapi = dbindices[exti]; } else { xapi = exti; } // From there on, we work with a xapian enquire item number. Fetch it int first = ndb->mset.get_firstitem(); int last = first + ndb->mset.size() -1; if (!(xapi >= first && xapi <= last)) { LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum)); try { ndb->mset = ndb->enquire->get_mset(xapi, qquantum); } catch (const Xapian::DatabaseModifiedError &error) { ndb->db.reopen(); ndb->mset = ndb->enquire->get_mset(xapi, qquantum); } catch (const Xapian::Error & error) { LOGERR(("enquire->get_mset: exception: %s\n", error.get_msg().c_str())); abort(); } if (ndb->mset.empty()) return false; first = ndb->mset.get_firstitem(); last = first + ndb->mset.size() -1; } LOGDEB1(("Rcl::Db::getDoc: Qry [%s] win [%d-%d] Estimated results: %d", ndb->query.get_description().c_str(), first, last, ndb->mset.get_matches_lower_bound())); Xapian::Document xdoc = ndb->mset[xapi-first].get_document(); if (percent) *percent = ndb->mset.convert_to_percent(ndb->mset[xapi-first]); // Parse xapian document's data and populate doc fields string data = xdoc.get_data(); return dbDataToRclDoc(data, doc); } // Retrieve document defined by file name and internal path. Very inefficient, // used only for history display. We'd need to enter path+ipath terms in the // db if we wanted to make this more efficient. bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc) { LOGDEB(("Rcl::Db:getDoc: [%s] (%d) [%s]\n", fn.c_str(), fn.length(), ipath.c_str())); if (pdata == 0) return false; Native *ndb = (Native *)pdata; string hash; pathHash(fn, hash, PATHHASHLEN); string pathterm = "P" + hash; // Look for all documents with this path, searching for the one // with the appropriate ipath. This is very inefficient. const char *ermsg = ""; try { if (!ndb->db.term_exists(pathterm)) { char len[20]; sprintf(len, "%d", pathterm.length()); throw string("path inexistant: [") + pathterm +"] length " + len; } for (Xapian::PostingIterator docid = ndb->db.postlist_begin(pathterm); docid != ndb->db.postlist_end(pathterm); docid++) { Xapian::Document xdoc = ndb->db.get_document(*docid); string data = xdoc.get_data(); if (dbDataToRclDoc(data, doc) && doc.ipath == ipath) return true; } } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); } catch (const string &s) { ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } if (*ermsg) { LOGERR(("Rcl::Db::getDoc: %s\n", ermsg)); // Initialize what we can anyway. If this is history, caller // will make partial display doc.ipath = ipath; doc.url = string("file://") + fn; } return false; }