From 5dd8774b3c0edc199a1c01967ae7a5a424c967b1 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Tue, 14 Apr 2020 09:25:13 +0200
Subject: [PATCH] whitespace and indents only

---
 src/rcldb/expansiondbs.cpp |  148 ++--
 src/rcldb/rcldb.cpp        | 1706 ++++++++++++++++++------------------
 src/rcldb/termproc.h       |   75 +-
 3 files changed, 963 insertions(+), 966 deletions(-)

diff --git a/src/rcldb/expansiondbs.cpp b/src/rcldb/expansiondbs.cpp
index c6b3f18d..dbf6a36e 100644
--- a/src/rcldb/expansiondbs.cpp
+++ b/src/rcldb/expansiondbs.cpp
@@ -41,7 +41,7 @@ namespace Rcl {
  * We use Xapian synonyms subsets to store the expansions.
  */
 bool createExpansionDbs(Xapian::WritableDatabase& wdb,
-			const vector<string>& langs)
+                        const vector<string>& langs)
 {
     LOGDEB("StemDb::createExpansionDbs: languages: " <<
            stringsToString(langs) << "\n");
@@ ... @@
     string ermsg;
     try {
-	// Stem dbs
-	vector<XapWritableComputableSynFamMember> stemdbs;
-	// Note: tried to make this to work with stack-allocated objects, couldn't.
-	// Looks like a bug in copy constructors somewhere, can't guess where
-	vector<std::shared_ptr<SynTermTransStem> > stemmers;
-	for (unsigned int i = 0; i < langs.size(); i++) {
-	    stemmers.push_back(std::shared_ptr<SynTermTransStem>
-			       (new SynTermTransStem(langs[i])));
-	    stemdbs.push_back(
-		XapWritableComputableSynFamMember(wdb, synFamStem, langs[i],
-						  stemmers.back().get()));
-	    stemdbs.back().recreate();
-	}
+        // Stem dbs
+        vector<XapWritableComputableSynFamMember> stemdbs;
+        // Note: tried to make this to work with stack-allocated objects, couldn't.
+        // Looks like a bug in copy constructors somewhere, can't guess where
+        vector<std::shared_ptr<SynTermTransStem> > stemmers;
+        for (unsigned int i = 0; i < langs.size(); i++) {
+            stemmers.push_back(std::shared_ptr<SynTermTransStem>
+                               (new SynTermTransStem(langs[i])));
+            stemdbs.push_back(
+                XapWritableComputableSynFamMember(wdb, synFamStem, langs[i],
+                                                  stemmers.back().get()));
+            stemdbs.back().recreate();
+        }
 
-	// Unaccented stem dbs
-	vector<XapWritableComputableSynFamMember> unacstemdbs;
-	// We can reuse the same stemmer pointers, the objects are stateless.
-	if (!o_index_stripchars) {
-	    for (unsigned int i = 0; i < langs.size(); i++) {
-		unacstemdbs.push_back(
-		    XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
-						      stemmers.back().get()));
-		unacstemdbs.back().recreate();
-	    }
-	}
-	SynTermTransUnac transunac(UNACOP_UNACFOLD);
-	XapWritableComputableSynFamMember
-	    diacasedb(wdb, synFamDiCa, "all", &transunac);
-	if (!o_index_stripchars)
-	    diacasedb.recreate();
+        // Unaccented stem dbs
+        vector<XapWritableComputableSynFamMember> unacstemdbs;
+        // We can reuse the same stemmer pointers, the objects are stateless.
+        if (!o_index_stripchars) {
+            for (unsigned int i = 0; i < langs.size(); i++) {
+                unacstemdbs.push_back(
+                    XapWritableComputableSynFamMember(
+                        wdb, synFamStemUnac, langs[i], stemmers.back().get()));
+                unacstemdbs.back().recreate();
+            }
+        }
+        SynTermTransUnac transunac(UNACOP_UNACFOLD);
+        XapWritableComputableSynFamMember
+            diacasedb(wdb, synFamDiCa, "all", &transunac);
+        if (!o_index_stripchars)
+            diacasedb.recreate();
 
-	Xapian::TermIterator it = wdb.allterms_begin();
-	// We'd want to skip to the first non-prefixed term, but this is a bit
-	// complicated, so we just jump over most of the prefixed term and then
-	// skip the rest one by one.
-	it.skip_to(wrap_prefix("Z"));
+        Xapian::TermIterator it = wdb.allterms_begin();
+        // We'd want to skip to the first non-prefixed term, but this is a bit
+        // complicated, so we just jump over most of the prefixed term and then
+        // skip the rest one by one.
+        it.skip_to(wrap_prefix("Z"));
         for ( ;it != wdb.allterms_end(); it++) {
             const string term{*it};
-	    if (has_prefix(term))
-		continue;
+            if (has_prefix(term))
+                continue;
 
-	    // Detect and skip CJK terms.
-	    Utf8Iter utfit(term);
+            // Detect and skip CJK terms.
+            Utf8Iter utfit(term);
             if (utfit.eof()) // Empty term?? Seems to happen.
                 continue;
-	    if (TextSplit::isCJK(*utfit)) {
-		// LOGDEB("stemskipped: Skipping CJK\n");
-		continue;
-	    }
+            if (TextSplit::isCJK(*utfit)) {
+                // LOGDEB("stemskipped: Skipping CJK\n");
+                continue;
+            }
 
-	    string lower = term;
-	    // If the index is raw, compute the case-folded term which
-	    // is the input to the stem db, and add a synonym from the
-	    // stripped term to the cased and accented one, for accent
-	    // and case expansion at query time
-	    if (!o_index_stripchars) {
-		unacmaybefold(term, lower, "UTF-8", UNACOP_FOLD);
-		diacasedb.addSynonym(term);
-	    }
+            string lower = term;
+            // If the index is raw, compute the case-folded term which
+            // is the input to the stem db, and add a synonym from the
+            // stripped term to the cased and accented one, for accent
+            // and case expansion at query time
+            if (!o_index_stripchars) {
+                unacmaybefold(term, lower, "UTF-8", UNACOP_FOLD);
+                diacasedb.addSynonym(term);
+            }
 
-	    // Dont' apply stemming to terms which don't look like
-	    // natural language words.
+            // Dont' apply stemming to terms which don't look like
+            // natural language words.
             if (!Db::isSpellingCandidate(term)) {
                 LOGDEB1("createExpansionDbs: skipped: [" << term << "]\n");
                 continue;
             }
 
-	    // Create stemming synonym for every language. The input is the
-	    // lowercase accented term
-	    for (unsigned int i = 0; i < langs.size(); i++) {
-		stemdbs[i].addSynonym(lower);
-	    }
+            // Create stemming synonym for every language. The input is the
+            // lowercase accented term
+            for (unsigned int i = 0; i < langs.size(); i++) {
+                stemdbs[i].addSynonym(lower);
+            }
 
-	    // For a raw index, also maybe create a stem expansion for
-	    // the unaccented term. While this may be incorrect, it is
-	    // also necessary for searching in a diacritic-unsensitive
-	    // way on a raw index
-	    if (!o_index_stripchars) {
-		string unac;
-		unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
-		if (unac != lower) {
-		    for (unsigned int i = 0; i < langs.size(); i++) {
-			unacstemdbs[i].addSynonym(unac);
-		    }
-		}
-	    }
+            // For a raw index, also maybe create a stem expansion for
+            // the unaccented term. While this may be incorrect, it is
+            // also necessary for searching in a diacritic-unsensitive
+            // way on a raw index
+            if (!o_index_stripchars) {
+                string unac;
+                unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
+                if (unac != lower) {
+                    for (unsigned int i = 0; i < langs.size(); i++) {
+                        unacstemdbs[i].addSynonym(unac);
+                    }
+                }
+            }
         }
     } XCATCHERROR(ermsg);
     if (!ermsg.empty()) {
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 43a7f584..2c65ac53 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -65,11 +65,11 @@ using namespace std;
 
 #ifndef XAPIAN_AT_LEAST
 // Added in Xapian 1.4.2.
Define it here for older versions -#define XAPIAN_AT_LEAST(A,B,C) \ - (XAPIAN_MAJOR_VERSION > (A) || \ - (XAPIAN_MAJOR_VERSION == (A) && \ - (XAPIAN_MINOR_VERSION > (B) || \ - (XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C))))) +#define XAPIAN_AT_LEAST(A,B,C) \ + (XAPIAN_MAJOR_VERSION > (A) || \ + (XAPIAN_MAJOR_VERSION == (A) && \ + (XAPIAN_MINOR_VERSION > (B) || \ + (XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C))))) #endif @@ -148,7 +148,7 @@ Db::Native::Native(Db *db) m_noversionwrite(false) #ifdef IDX_THREADS , m_wqueue("DbUpd", - m_rcldb->m_config->getThrConf(RclConfig::ThrDbWrite).first), + m_rcldb->m_config->getThrConf(RclConfig::ThrDbWrite).first), m_totalworkns(0LL), m_havewriteq(false) #endif // IDX_THREADS { @@ -160,7 +160,7 @@ Db::Native::~Native() LOGDEB1("Native::~Native: me " << this << "\n"); #ifdef IDX_THREADS if (m_havewriteq) { - void *status = m_wqueue.setTerminateAndWait(); + void *status = m_wqueue.setTerminateAndWait(); if (status) { LOGDEB1("Native::~Native: worker status " << status << "\n"); } @@ -177,37 +177,37 @@ void *DbUpdWorker(void* vdbp) DbUpdTask *tsk = 0; for (;;) { - size_t qsz = -1; - if (!tqp->take(&tsk, &qsz)) { - tqp->workerExit(); - return (void*)1; - } - bool status = false; - switch (tsk->op) { - case DbUpdTask::AddOrUpdate: - LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n"); - status = ndbp->addOrUpdateWrite( + size_t qsz = -1; + if (!tqp->take(&tsk, &qsz)) { + tqp->workerExit(); + return (void*)1; + } + bool status = false; + switch (tsk->op) { + case DbUpdTask::AddOrUpdate: + LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n"); + status = ndbp->addOrUpdateWrite( tsk->udi, tsk->uniterm, tsk->doc, tsk->txtlen, tsk->rawztext); - break; - case DbUpdTask::Delete: - LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n"); - status = ndbp->purgeFileWrite(false, tsk->udi, tsk->uniterm); - break; - case DbUpdTask::PurgeOrphans: - LOGDEB("DbUpdWorker: got orphans purge task, ql " << qsz << "\n"); - status = ndbp->purgeFileWrite(true, tsk->udi, tsk->uniterm); - break; - default: - LOGERR("DbUpdWorker: unknown op " << tsk->op << " !!\n"); - break; - } - if (!status) { - LOGERR("DbUpdWorker: xxWrite failed\n"); - tqp->workerExit(); - delete tsk; - return (void*)0; - } - delete tsk; + break; + case DbUpdTask::Delete: + LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n"); + status = ndbp->purgeFileWrite(false, tsk->udi, tsk->uniterm); + break; + case DbUpdTask::PurgeOrphans: + LOGDEB("DbUpdWorker: got orphans purge task, ql " << qsz << "\n"); + status = ndbp->purgeFileWrite(true, tsk->udi, tsk->uniterm); + break; + default: + LOGERR("DbUpdWorker: unknown op " << tsk->op << " !!\n"); + break; + } + if (!status) { + LOGERR("DbUpdWorker: xxWrite failed\n"); + tqp->workerExit(); + delete tsk; + return (void*)0; + } + delete tsk; } } @@ -218,15 +218,15 @@ void Db::Native::maybeStartThreads() int writeqlen = cnf->getThrConf(RclConfig::ThrDbWrite).first; int writethreads = cnf->getThrConf(RclConfig::ThrDbWrite).second; if (writethreads > 1) { - LOGINFO("RclDb: write threads count was forced down to 1\n"); - writethreads = 1; + LOGINFO("RclDb: write threads count was forced down to 1\n"); + writethreads = 1; } if (writeqlen >= 0 && writethreads > 0) { - if (!m_wqueue.start(writethreads, DbUpdWorker, this)) { - LOGERR("Db::Db: Worker start failed\n"); - return; - } - m_havewriteq = true; + if (!m_wqueue.start(writethreads, DbUpdWorker, this)) { + LOGERR("Db::Db: Worker start failed\n"); + return; + } + m_havewriteq = 
true; } LOGDEB("RclDb:: threads: haveWriteQ " << m_havewriteq << ", wqlen " << writeqlen << " wqts " << writethreads << "\n"); @@ -344,24 +344,24 @@ void Db::Native::openRead(const string& dir) /* See comment in class declaration: return all subdocuments of a * document given by its unique id. */ bool Db::Native::subDocs(const string &udi, int idxi, - vector& docids) + vector& docids) { LOGDEB2("subDocs: [" << uniterm << "]\n"); string pterm = make_parentterm(udi); vector candidates; XAPTRY(docids.clear(); candidates.insert(candidates.begin(), xrdb.postlist_begin(pterm), - xrdb.postlist_end(pterm)), + xrdb.postlist_end(pterm)), xrdb, m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { LOGERR("Rcl::Db::subDocs: " << m_rcldb->m_reason << "\n"); return false; } else { - for (unsigned int i = 0; i < candidates.size(); i++) { - if (whatDbIdx(candidates[i]) == (size_t)idxi) { - docids.push_back(candidates[i]); - } - } + for (unsigned int i = 0; i < candidates.size(); i++) { + if (whatDbIdx(candidates[i]) == (size_t)idxi) { + docids.push_back(candidates[i]); + } + } LOGDEB0("Db::Native::subDocs: returning " << docids.size() << " ids\n"); return true; } @@ -371,18 +371,18 @@ bool Db::Native::xdocToUdi(Xapian::Document& xdoc, string &udi) { Xapian::TermIterator xit; XAPTRY(xit = xdoc.termlist_begin(); - xit.skip_to(wrap_prefix(udi_prefix)), + xit.skip_to(wrap_prefix(udi_prefix)), xrdb, m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { - LOGERR("xdocToUdi: xapian error: " << m_rcldb->m_reason << "\n"); - return false; + LOGERR("xdocToUdi: xapian error: " << m_rcldb->m_reason << "\n"); + return false; } if (xit != xdoc.termlist_end()) { - udi = *xit; - if (!udi.empty()) { - udi = udi.substr(wrap_prefix(udi_prefix).size()); - return true; - } + udi = *xit; + if (!udi.empty()) { + udi = udi.substr(wrap_prefix(udi_prefix).size()); + return true; + } } return false; } @@ -397,27 +397,27 @@ bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term) // Find the term Xapian::TermIterator xit; XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);, - xrdb, m_rcldb->m_reason); + xrdb, m_rcldb->m_reason); if (!m_rcldb->m_reason.empty()) { - LOGERR("Db::clearDocTerm...: [" << term << "] skip failed: " << + LOGERR("Db::clearDocTerm...: [" << term << "] skip failed: " << m_rcldb->m_reason << "\n"); - return false; + return false; } if (xit == xdoc.termlist_end() || term.compare(*xit)) { - LOGDEB0("Db::clearDocTermIFWdf0: term [" << term << + LOGDEB0("Db::clearDocTermIFWdf0: term [" << term << "] not found. xit: [" << (xit == xdoc.termlist_end() ? "EOL": *xit) << "]\n"); - return false; + return false; } // Clear the term if its frequency is 0 if (xit.get_wdf() == 0) { - LOGDEB1("Db::clearDocTermIfWdf0: clearing [" << term << "]\n"); - XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason); - if (!m_rcldb->m_reason.empty()) { - LOGDEB0("Db::clearDocTermIfWdf0: failed [" << term << "]: " << + LOGDEB1("Db::clearDocTermIfWdf0: clearing [" << term << "]\n"); + XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason); + if (!m_rcldb->m_reason.empty()) { + LOGDEB0("Db::clearDocTermIfWdf0: failed [" << term << "]: " << m_rcldb->m_reason << "\n"); - } + } } return true; } @@ -425,7 +425,7 @@ bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term) // Holder for term + pos struct DocPosting { DocPosting(string t, Xapian::termpos ps) - : term(t), pos(ps) {} + : term(t), pos(ps) {} string term; Xapian::termpos pos; }; @@ -435,7 +435,7 @@ struct DocPosting { // prefix. 
We also remove the postings for the unprefixed terms (that // is, we undo what we did when indexing). bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx, - Xapian::termcount wdfdec) + Xapian::termcount wdfdec) { LOGDEB1("Db::clearField: clearing prefix [" << pfx << "] for docid " << xdoc.get_docid() << "\n"); @@ -446,48 +446,48 @@ bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx, m_rcldb->m_reason.clear(); for (int tries = 0; tries < 2; tries++) { - try { - Xapian::TermIterator xit; - xit = xdoc.termlist_begin(); - xit.skip_to(wrapd); - while (xit != xdoc.termlist_end() && - !(*xit).compare(0, wrapd.size(), wrapd)) { - LOGDEB1("Db::clearfield: erasing for [" << *xit << "]\n"); - Xapian::PositionIterator posit; - for (posit = xit.positionlist_begin(); - posit != xit.positionlist_end(); posit++) { - eraselist.push_back(DocPosting(*xit, *posit)); - eraselist.push_back(DocPosting(strip_prefix(*xit), *posit)); - } - xit++; - } - } catch (const Xapian::DatabaseModifiedError &e) { - m_rcldb->m_reason = e.get_msg(); - xrdb.reopen(); - continue; - } XCATCHERROR(m_rcldb->m_reason); - break; + try { + Xapian::TermIterator xit; + xit = xdoc.termlist_begin(); + xit.skip_to(wrapd); + while (xit != xdoc.termlist_end() && + !(*xit).compare(0, wrapd.size(), wrapd)) { + LOGDEB1("Db::clearfield: erasing for [" << *xit << "]\n"); + Xapian::PositionIterator posit; + for (posit = xit.positionlist_begin(); + posit != xit.positionlist_end(); posit++) { + eraselist.push_back(DocPosting(*xit, *posit)); + eraselist.push_back(DocPosting(strip_prefix(*xit), *posit)); + } + xit++; + } + } catch (const Xapian::DatabaseModifiedError &e) { + m_rcldb->m_reason = e.get_msg(); + xrdb.reopen(); + continue; + } XCATCHERROR(m_rcldb->m_reason); + break; } if (!m_rcldb->m_reason.empty()) { - LOGERR("Db::clearField: failed building erase list: " << + LOGERR("Db::clearField: failed building erase list: " << m_rcldb->m_reason << "\n"); - return false; + return false; } // Now remove the found positions, and the terms if the wdf is 0 for (vector::const_iterator it = eraselist.begin(); - it != eraselist.end(); it++) { - LOGDEB1("Db::clearField: remove posting: [" << it->term << "] pos [" << + it != eraselist.end(); it++) { + LOGDEB1("Db::clearField: remove posting: [" << it->term << "] pos [" << it->pos << "]\n"); - XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);, - xwdb,m_rcldb->m_reason); - if (!m_rcldb->m_reason.empty()) { - // Not that this normally fails for non-prefixed XXST and - // ND, don't make a fuss - LOGDEB1("Db::clearFiedl: remove_posting failed for [" << it->term << + XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);, + xwdb,m_rcldb->m_reason); + if (!m_rcldb->m_reason.empty()) { + // Not that this normally fails for non-prefixed XXST and + // ND, don't make a fuss + LOGDEB1("Db::clearFiedl: remove_posting failed for [" << it->term << "]," << it->pos << ": " << m_rcldb->m_reason << "\n"); - } - clearDocTermIfWdf0(xdoc, it->term); + } + clearDocTermIfWdf0(xdoc, it->term); } return true; } @@ -498,17 +498,17 @@ bool Db::Native::hasTerm(const string& udi, int idxi, const string& term) LOGDEB2("Native::hasTerm: udi [" << udi << "] term [" << term << "]\n"); Xapian::Document xdoc; if (getDoc(udi, idxi, xdoc)) { - Xapian::TermIterator xit; - XAPTRY(xit = xdoc.termlist_begin(); - xit.skip_to(term);, - xrdb, m_rcldb->m_reason); - if (!m_rcldb->m_reason.empty()) { - LOGERR("Rcl::Native::hasTerm: " << m_rcldb->m_reason << "\n"); - return false; - } - if (xit != 
xdoc.termlist_end() && !term.compare(*xit)) { - return true; - } + Xapian::TermIterator xit; + XAPTRY(xit = xdoc.termlist_begin(); + xit.skip_to(term);, + xrdb, m_rcldb->m_reason); + if (!m_rcldb->m_reason.empty()) { + LOGERR("Rcl::Native::hasTerm: " << m_rcldb->m_reason << "\n"); + return false; + } + if (xit != xdoc.termlist_end() && !term.compare(*xit)) { + return true; + } } return false; } @@ -516,25 +516,25 @@ bool Db::Native::hasTerm(const string& udi, int idxi, const string& term) // Retrieve Xapian document, given udi. There may be several identical udis // if we are using multiple indexes. Xapian::docid Db::Native::getDoc(const string& udi, int idxi, - Xapian::Document& xdoc) + Xapian::Document& xdoc) { string uniterm = make_uniterm(udi); for (int tries = 0; tries < 2; tries++) { - try { + try { Xapian::PostingIterator docid; - for (docid = xrdb.postlist_begin(uniterm); - docid != xrdb.postlist_end(uniterm); docid++) { - xdoc = xrdb.get_document(*docid); - if (whatDbIdx(*docid) == (size_t)idxi) - return *docid; - } - // Udi not in Db. - return 0; - } catch (const Xapian::DatabaseModifiedError &e) { + for (docid = xrdb.postlist_begin(uniterm); + docid != xrdb.postlist_end(uniterm); docid++) { + xdoc = xrdb.get_document(*docid); + if (whatDbIdx(*docid) == (size_t)idxi) + return *docid; + } + // Udi not in Db. + return 0; + } catch (const Xapian::DatabaseModifiedError &e) { m_rcldb->m_reason = e.get_msg(); - xrdb.reopen(); + xrdb.reopen(); continue; - } XCATCHERROR(m_rcldb->m_reason); + } XCATCHERROR(m_rcldb->m_reason); break; } LOGERR("Db::Native::getDoc: Xapian error: " << m_rcldb->m_reason << "\n"); @@ -543,12 +543,12 @@ Xapian::docid Db::Native::getDoc(const string& udi, int idxi, // Turn data record from db into document fields bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, - Doc &doc, bool fetchtext) + Doc &doc, bool fetchtext) { LOGDEB2("Db::dbDataToRclDoc: data:\n" << data << "\n"); ConfSimple parms(data); if (!parms.ok()) - return false; + return false; doc.xdocid = docid; doc.haspages = hasPages(docid); @@ -557,20 +557,20 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, string dbdir = m_rcldb->m_basedir; doc.idxi = 0; if (!m_rcldb->m_extraDbs.empty()) { - int idxi = int(whatDbIdx(docid)); + int idxi = int(whatDbIdx(docid)); - // idxi is in [0, extraDbs.size()]. 0 is for the main index, - // idxi-1 indexes into the additional dbs array. - if (idxi) { - dbdir = m_rcldb->m_extraDbs[idxi - 1]; - doc.idxi = idxi; - } + // idxi is in [0, extraDbs.size()]. 0 is for the main index, + // idxi-1 indexes into the additional dbs array. + if (idxi) { + dbdir = m_rcldb->m_extraDbs[idxi - 1]; + doc.idxi = idxi; + } } parms.get(Doc::keyurl, doc.idxurl); doc.url = doc.idxurl; m_rcldb->m_config->urlrewrite(dbdir, doc.url); if (!doc.url.compare(doc.idxurl)) - doc.idxurl.clear(); + doc.idxurl.clear(); // Special cases: parms.get(Doc::keytp, doc.mimetype); @@ -584,9 +584,9 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, // used to index the beginning of the text as abstract). 
doc.syntabs = false; if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) { - doc.meta[Doc::keyabs] = - doc.meta[Doc::keyabs].substr(cstr_syntAbs.length()); - doc.syntabs = true; + doc.meta[Doc::keyabs] = + doc.meta[Doc::keyabs].substr(cstr_syntAbs.length()); + doc.syntabs = true; } parms.get(Doc::keyipt, doc.ipath); parms.get(Doc::keypcs, doc.pcbytes); @@ -597,9 +597,9 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, // Normal key/value pairs: vector keys = parms.getNames(string()); for (vector::const_iterator it = keys.begin(); - it != keys.end(); it++) { - if (doc.meta.find(*it) == doc.meta.end()) - parms.get(*it, doc.meta[*it]); + it != keys.end(); it++) { + if (doc.meta.find(*it) == doc.meta.end()) + parms.get(*it, doc.meta[*it]); } doc.meta[Doc::keyurl] = doc.url; doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime; @@ -614,12 +614,12 @@ bool Db::Native::hasPages(Xapian::docid docid) string ermsg; Xapian::PositionIterator pos; XAPTRY(pos = xrdb.positionlist_begin(docid, page_break_term); - if (pos != xrdb.positionlist_end(docid, page_break_term)) { - return true; - }, - xrdb, ermsg); + if (pos != xrdb.positionlist_end(docid, page_break_term)) { + return true; + }, + xrdb, ermsg); if (!ermsg.empty()) { - LOGERR("Db::Native::hasPages: xapian error: " << ermsg << "\n"); + LOGERR("Db::Native::hasPages: xapian error: " << ermsg << "\n"); } return false; } @@ -632,46 +632,46 @@ bool Db::Native::getPagePositions(Xapian::docid docid, vector& vpos) // that we store there for lack of better place map mbreaksmap; try { - Xapian::Document xdoc = xrdb.get_document(docid); - string data = xdoc.get_data(); - Doc doc; - string mbreaks; - if (dbDataToRclDoc(docid, data, doc) && - doc.getmeta(cstr_mbreaks, &mbreaks)) { - vector values; - stringToTokens(mbreaks, values, ","); - for (unsigned int i = 0; i < values.size() - 1; i += 2) { - int pos = atoi(values[i].c_str()) + baseTextPosition; - int incr = atoi(values[i+1].c_str()); - mbreaksmap[pos] = incr; - } - } + Xapian::Document xdoc = xrdb.get_document(docid); + string data = xdoc.get_data(); + Doc doc; + string mbreaks; + if (dbDataToRclDoc(docid, data, doc) && + doc.getmeta(cstr_mbreaks, &mbreaks)) { + vector values; + stringToTokens(mbreaks, values, ","); + for (unsigned int i = 0; i < values.size() - 1; i += 2) { + int pos = atoi(values[i].c_str()) + baseTextPosition; + int incr = atoi(values[i+1].c_str()); + mbreaksmap[pos] = incr; + } + } } catch (...) { } string qterm = page_break_term; Xapian::PositionIterator pos; try { - for (pos = xrdb.positionlist_begin(docid, qterm); - pos != xrdb.positionlist_end(docid, qterm); pos++) { - int ipos = *pos; - if (ipos < int(baseTextPosition)) { - LOGDEB("getPagePositions: got page position " << ipos + for (pos = xrdb.positionlist_begin(docid, qterm); + pos != xrdb.positionlist_end(docid, qterm); pos++) { + int ipos = *pos; + if (ipos < int(baseTextPosition)) { + LOGDEB("getPagePositions: got page position " << ipos << " not in body\n"); - // Not in text body. Strange... - continue; - } - map::iterator it = mbreaksmap.find(ipos); - if (it != mbreaksmap.end()) { - LOGDEB1("getPagePositions: found multibreak at " << ipos << + // Not in text body. Strange... 
+ continue; + } + map::iterator it = mbreaksmap.find(ipos); + if (it != mbreaksmap.end()) { + LOGDEB1("getPagePositions: found multibreak at " << ipos << " incr " << it->second << "\n"); - for (int i = 0 ; i < it->second; i++) - vpos.push_back(ipos); - } - vpos.push_back(ipos); - } + for (int i = 0 ; i < it->second; i++) + vpos.push_back(ipos); + } + vpos.push_back(ipos); + } } catch (...) { - // Term does not occur. No problem. + // Term does not occur. No problem. } return true; } @@ -679,9 +679,9 @@ bool Db::Native::getPagePositions(Xapian::docid docid, vector& vpos) int Db::Native::getPageNumberForPosition(const vector& pbreaks, int pos) { if (pos < int(baseTextPosition)) // Not in text body - return -1; + return -1; vector::const_iterator it = - upper_bound(pbreaks.begin(), pbreaks.end(), pos); + upper_bound(pbreaks.begin(), pbreaks.end(), pos); return int(it - pbreaks.begin() + 1); } @@ -734,17 +734,17 @@ bool Db::Native::addOrUpdateWrite( // to do this after having prepared the document, but it needs to be in // the single-threaded section. if (m_rcldb->m_maxFsOccupPc > 0 && - (m_rcldb->m_occFirstCheck || - (m_rcldb->m_curtxtsz - m_rcldb->m_occtxtsz) / MB >= 1)) { - LOGDEB("Db::add: checking file system usage\n"); - int pc; - m_rcldb->m_occFirstCheck = 0; - if (fsocc(m_rcldb->m_basedir, &pc) && pc >= m_rcldb->m_maxFsOccupPc) { - LOGERR("Db::add: stop indexing: file system " << pc << " %" << + (m_rcldb->m_occFirstCheck || + (m_rcldb->m_curtxtsz - m_rcldb->m_occtxtsz) / MB >= 1)) { + LOGDEB("Db::add: checking file system usage\n"); + int pc; + m_rcldb->m_occFirstCheck = 0; + if (fsocc(m_rcldb->m_basedir, &pc) && pc >= m_rcldb->m_maxFsOccupPc) { + LOGERR("Db::add: stop indexing: file system " << pc << " %" << " full > max " << m_rcldb->m_maxFsOccupPc << " %" << "\n"); - return false; - } - m_rcldb->m_occtxtsz = m_rcldb->m_curtxtsz; + return false; + } + m_rcldb->m_occtxtsz = m_rcldb->m_curtxtsz; } const char *fnc = udi.c_str(); @@ -753,30 +753,30 @@ bool Db::Native::addOrUpdateWrite( // Add db entry or update existing entry: Xapian::docid did = 0; try { - did = xwdb.replace_document(uniterm, *newdocument_ptr); - if (did < m_rcldb->updated.size()) { + did = xwdb.replace_document(uniterm, *newdocument_ptr); + if (did < m_rcldb->updated.size()) { // This is necessary because only the file-level docs are tested // by needUpdate(), so the subdocs existence flags are only set // here. - m_rcldb->updated[did] = true; - LOGINFO("Db::add: docid " << did << " updated [" << fnc << "]\n"); - } else { - LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n"); - } + m_rcldb->updated[did] = true; + LOGINFO("Db::add: docid " << did << " updated [" << fnc << "]\n"); + } else { + LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n"); + } } XCATCHERROR(ermsg); if (!ermsg.empty()) { - LOGERR("Db::add: replace_document failed: " << ermsg << "\n"); - ermsg.erase(); - // FIXME: is this ever actually needed? - try { - xwdb.add_document(*newdocument_ptr); - LOGDEB("Db::add: " << fnc << + LOGERR("Db::add: replace_document failed: " << ermsg << "\n"); + ermsg.erase(); + // FIXME: is this ever actually needed? 
+ try { + xwdb.add_document(*newdocument_ptr); + LOGDEB("Db::add: " << fnc << " added (failed re-seek for duplicate)\n"); - } XCATCHERROR(ermsg); - if (!ermsg.empty()) { - LOGERR("Db::add: add_document failed: " << ermsg << "\n"); - return false; - } + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR("Db::add: add_document failed: " << ermsg << "\n"); + return false; + } } XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext), @@ -796,7 +796,7 @@ bool Db::Native::addOrUpdateWrite( } bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi, - const string& uniterm) + const string& uniterm) { #if defined(IDX_THREADS) // We need a mutex even if we have a write queue (so we can only @@ -808,54 +808,54 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi, string ermsg; try { - Xapian::PostingIterator docid = xwdb.postlist_begin(uniterm); - if (docid == xwdb.postlist_end(uniterm)) { - return true; + Xapian::PostingIterator docid = xwdb.postlist_begin(uniterm); + if (docid == xwdb.postlist_end(uniterm)) { + return true; } - if (m_rcldb->m_flushMb > 0) { - Xapian::termcount trms = xwdb.get_doclength(*docid); - m_rcldb->maybeflush(trms * 5); - } - string sig; - if (orphansOnly) { - Xapian::Document doc = xwdb.get_document(*docid); - sig = doc.get_value(VALUE_SIG); - if (sig.empty()) { - LOGINFO("purgeFileWrite: got empty sig\n"); - return false; - } - } else { - LOGDEB("purgeFile: delete docid " << *docid << "\n"); + if (m_rcldb->m_flushMb > 0) { + Xapian::termcount trms = xwdb.get_doclength(*docid); + m_rcldb->maybeflush(trms * 5); + } + string sig; + if (orphansOnly) { + Xapian::Document doc = xwdb.get_document(*docid); + sig = doc.get_value(VALUE_SIG); + if (sig.empty()) { + LOGINFO("purgeFileWrite: got empty sig\n"); + return false; + } + } else { + LOGDEB("purgeFile: delete docid " << *docid << "\n"); deleteDocument(*docid); - } - vector docids; - subDocs(udi, 0, docids); - LOGDEB("purgeFile: subdocs cnt " << docids.size() << "\n"); - for (vector::iterator it = docids.begin(); - it != docids.end(); it++) { - if (m_rcldb->m_flushMb > 0) { - Xapian::termcount trms = xwdb.get_doclength(*it); - m_rcldb->maybeflush(trms * 5); - } - string subdocsig; - if (orphansOnly) { - Xapian::Document doc = xwdb.get_document(*it); - subdocsig = doc.get_value(VALUE_SIG); - if (subdocsig.empty()) { - LOGINFO("purgeFileWrite: got empty sig for subdoc??\n"); - continue; - } - } - - if (!orphansOnly || sig != subdocsig) { - LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n"); - deleteDocument(*it); - } - } - return true; + } + vector docids; + subDocs(udi, 0, docids); + LOGDEB("purgeFile: subdocs cnt " << docids.size() << "\n"); + for (vector::iterator it = docids.begin(); + it != docids.end(); it++) { + if (m_rcldb->m_flushMb > 0) { + Xapian::termcount trms = xwdb.get_doclength(*it); + m_rcldb->maybeflush(trms * 5); + } + string subdocsig; + if (orphansOnly) { + Xapian::Document doc = xwdb.get_document(*it); + subdocsig = doc.get_value(VALUE_SIG); + if (subdocsig.empty()) { + LOGINFO("purgeFileWrite: got empty sig for subdoc??\n"); + continue; + } + } + + if (!orphansOnly || sig != subdocsig) { + LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n"); + deleteDocument(*it); + } + } + return true; } XCATCHERROR(ermsg); if (!ermsg.empty()) { - LOGERR("Db::purgeFileWrite: " << ermsg << "\n"); + LOGERR("Db::purgeFileWrite: " << ermsg << "\n"); } return false; } @@ -873,21 +873,21 @@ Db::Db(const RclConfig *cfp) { m_config = new RclConfig(*cfp); if (start_of_field_term.empty()) { 
- if (o_index_stripchars) { - start_of_field_term = "XXST"; - end_of_field_term = "XXND"; - } else { - start_of_field_term = "XXST/"; - end_of_field_term = "XXND/"; - } + if (o_index_stripchars) { + start_of_field_term = "XXST"; + end_of_field_term = "XXND"; + } else { + start_of_field_term = "XXST/"; + end_of_field_term = "XXND/"; + } } m_ndb = new Native(this); if (m_config) { - m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc); - m_config->getConfParam("idxflushmb", &m_flushMb); - m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen); - m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen); + m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc); + m_config->getConfParam("idxflushmb", &m_flushMb); + m_config->getConfParam("idxmetastoredlen", &m_idxMetaStoredLen); + m_config->getConfParam("idxtexttruncatelen", &m_idxTextTruncateLen); } } @@ -895,7 +895,7 @@ Db::~Db() { LOGDEB2("Db::~Db\n"); if (m_ndb == 0) - return; + return; LOGDEB("Db::~Db: isopen " << m_ndb->m_isopen << " m_iswritable " << m_ndb->m_iswritable << "\n"); i_close(true); @@ -916,29 +916,29 @@ vector Db::getStemmerNames() bool Db::open(OpenMode mode, OpenError *error) { if (error) - *error = DbOpenMainDb; + *error = DbOpenMainDb; if (m_ndb == 0 || m_config == 0) { - m_reason = "Null configuration or Xapian Db"; - return false; + m_reason = "Null configuration or Xapian Db"; + return false; } LOGDEB("Db::open: m_isopen " << m_ndb->m_isopen << " m_iswritable " << m_ndb->m_iswritable << " mode " << mode << "\n"); if (m_ndb->m_isopen) { - // We used to return an error here but I see no reason to - if (!close()) - return false; + // We used to return an error here but I see no reason to + if (!close()) + return false; } if (!m_config->getStopfile().empty()) - m_stops.setFile(m_config->getStopfile()); + m_stops.setFile(m_config->getStopfile()); string dir = m_config->getDbDir(); string ermsg; try { - switch (mode) { - case DbUpd: - case DbTrunc: + switch (mode) { + case DbUpd: + case DbTrunc: m_ndb->openWrite(dir, mode); updated = vector(m_ndb->xwdb.get_lastdocid() + 1, false); // We used to open a readonly object in addition to the @@ -950,41 +950,41 @@ bool Db::open(OpenMode mode, OpenError *error) // so the query db is now a clone of the update one. m_ndb->xrdb = m_ndb->xwdb; LOGDEB("Db::open: lastdocid: " <xwdb.get_lastdocid()<<"\n"); - break; - case DbRO: - default: + break; + case DbRO: + default: m_ndb->openRead(dir); for (auto& db : m_extraDbs) { - if (error) - *error = DbOpenExtraDb; - LOGDEB("Db::Open: adding query db [" << &db << "]\n"); + if (error) + *error = DbOpenExtraDb; + LOGDEB("Db::Open: adding query db [" << &db << "]\n"); // An error here used to be non-fatal (1.13 and older) // but I can't see why m_ndb->xrdb.add_database(Xapian::Database(db)); - } - break; - } - if (error) - *error = DbOpenMainDb; + } + break; + } + if (error) + *error = DbOpenMainDb; - // Check index format version. Must not try to check a just created or - // truncated db - if (mode != DbTrunc && m_ndb->xrdb.get_doccount() > 0) { - string version = m_ndb->xrdb.get_metadata(cstr_RCL_IDX_VERSION_KEY); - if (version.compare(cstr_RCL_IDX_VERSION)) { - m_ndb->m_noversionwrite = true; - LOGERR("Rcl::Db::open: file index [" << version << + // Check index format version. 
Must not try to check a just created or + // truncated db + if (mode != DbTrunc && m_ndb->xrdb.get_doccount() > 0) { + string version = m_ndb->xrdb.get_metadata(cstr_RCL_IDX_VERSION_KEY); + if (version.compare(cstr_RCL_IDX_VERSION)) { + m_ndb->m_noversionwrite = true; + LOGERR("Rcl::Db::open: file index [" << version << "], software [" << cstr_RCL_IDX_VERSION << "]\n"); - throw Xapian::DatabaseError("Recoll index version mismatch", - "", ""); - } - } - m_mode = mode; - m_ndb->m_isopen = true; - m_basedir = dir; - if (error) - *error = DbOpenNoError; - return true; + throw Xapian::DatabaseError("Recoll index version mismatch", + "", ""); + } + } + m_mode = mode; + m_ndb->m_isopen = true; + m_basedir = dir; + if (error) + *error = DbOpenNoError; + return true; } XCATCHERROR(ermsg); m_reason = ermsg; @@ -1019,36 +1019,36 @@ bool Db::close() bool Db::i_close(bool final) { if (m_ndb == 0) - return false; + return false; LOGDEB("Db::i_close(" << final << "): m_isopen " << m_ndb->m_isopen << " m_iswritable " << m_ndb->m_iswritable << "\n"); if (m_ndb->m_isopen == false && !final) - return true; + return true; string ermsg; try { - bool w = m_ndb->m_iswritable; - if (w) { + bool w = m_ndb->m_iswritable; + if (w) { #ifdef IDX_THREADS - waitUpdIdle(); + waitUpdIdle(); #endif - if (!m_ndb->m_noversionwrite) - m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, - cstr_RCL_IDX_VERSION); - LOGDEB("Rcl::Db:close: xapian will close. May take some time\n"); - } - deleteZ(m_ndb); - if (w) - LOGDEB("Rcl::Db:close() xapian close done.\n"); - if (final) { - return true; - } - m_ndb = new Native(this); - if (m_ndb) { - return true; - } - LOGERR("Rcl::Db::close(): cant recreate db object\n"); - return false; + if (!m_ndb->m_noversionwrite) + m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, + cstr_RCL_IDX_VERSION); + LOGDEB("Rcl::Db:close: xapian will close. 
May take some time\n"); + } + deleteZ(m_ndb); + if (w) + LOGDEB("Rcl::Db:close() xapian close done.\n"); + if (final) { + return true; + } + m_ndb = new Native(this); + if (m_ndb) { + return true; + } + LOGERR("Rcl::Db::close(): cant recreate db object\n"); + return false; } XCATCHERROR(ermsg); LOGERR("Db:close: exception while deleting db: " << ermsg << "\n"); return false; @@ -1062,11 +1062,11 @@ bool Db::adjustdbs() return false; } if (m_ndb && m_ndb->m_isopen) { - if (!close()) - return false; - if (!open(m_mode)) { - return false; - } + if (!close()) + return false; + if (!open(m_mode)) { + return false; + } } return true; } @@ -1094,14 +1094,14 @@ int Db::termDocCnt(const string& _term) string term = _term; if (o_index_stripchars) - if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { - LOGINFO("Db::termDocCnt: unac failed for [" << _term << "]\n"); - return 0; - } + if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { + LOGINFO("Db::termDocCnt: unac failed for [" << _term << "]\n"); + return 0; + } if (m_stops.isStop(term)) { - LOGDEB1("Db::termDocCnt [" << term << "] in stop list\n"); - return 0; + LOGDEB1("Db::termDocCnt [" << term << "] in stop list\n"); + return 0; } XAPTRY(res = m_ndb->xrdb.get_termfreq(term), m_ndb->xrdb, m_reason); @@ -1119,12 +1119,12 @@ bool Db::addQueryDb(const string &_dir) LOGDEB0("Db::addQueryDb: ndb " << m_ndb << " iswritable " << ((m_ndb)?m_ndb->m_iswritable:0) << " db [" << dir << "]\n"); if (!m_ndb) - return false; + return false; if (m_ndb->m_iswritable) - return false; + return false; dir = path_canon(dir); if (find(m_extraDbs.begin(), m_extraDbs.end(), dir) == m_extraDbs.end()) { - m_extraDbs.push_back(dir); + m_extraDbs.push_back(dir); } return adjustdbs(); } @@ -1132,17 +1132,17 @@ bool Db::addQueryDb(const string &_dir) bool Db::rmQueryDb(const string &dir) { if (!m_ndb) - return false; + return false; if (m_ndb->m_iswritable) - return false; + return false; if (dir.empty()) { - m_extraDbs.clear(); + m_extraDbs.clear(); } else { - vector::iterator it = find(m_extraDbs.begin(), - m_extraDbs.end(), dir); - if (it != m_extraDbs.end()) { - m_extraDbs.erase(it); - } + vector::iterator it = find(m_extraDbs.begin(), + m_extraDbs.end(), dir); + if (it != m_extraDbs.end()) { + m_extraDbs.erase(it); + } } return adjustdbs(); } @@ -1177,9 +1177,9 @@ size_t Db::Native::whatDbIdx(Xapian::docid id) LOGDEB1("Db::whatDbIdx: xdocid " << id << ", " << m_rcldb->m_extraDbs.size() << " extraDbs\n"); if (id == 0) - return (size_t)-1; + return (size_t)-1; if (m_rcldb->m_extraDbs.size() == 0) - return 0; + return 0; return (id - 1) % (m_rcldb->m_extraDbs.size() + 1); } @@ -1187,7 +1187,7 @@ size_t Db::Native::whatDbIdx(Xapian::docid id) Xapian::docid Db::Native::whatDbDocid(Xapian::docid docid_combined) { if (m_rcldb->m_extraDbs.size() == 0) - return docid_combined; + return docid_combined; return (docid_combined - 1) / (m_rcldb->m_extraDbs.size() + 1) + 1; } @@ -1197,26 +1197,26 @@ bool Db::testDbDir(const string &dir, bool *stripped_p) bool mstripped = true; LOGDEB("Db::testDbDir: [" << dir << "]\n"); try { - Xapian::Database db(dir); - // If the prefix for mimetype is wrapped, it's an unstripped - // index. T has been in use in recoll since the beginning and - // all documents have a T field (possibly empty). - Xapian::TermIterator term = db.allterms_begin(":T:"); - if (term == db.allterms_end()) { - mstripped = true; + Xapian::Database db(dir); + // If the prefix for mimetype is wrapped, it's an unstripped + // index. 
T has been in use in recoll since the beginning and + // all documents have a T field (possibly empty). + Xapian::TermIterator term = db.allterms_begin(":T:"); + if (term == db.allterms_end()) { + mstripped = true; } else { - mstripped = false; + mstripped = false; } LOGDEB("testDbDir: " << dir << " is a " << (mstripped ? "stripped" : "raw") << " index\n"); } XCATCHERROR(aerr); if (!aerr.empty()) { - LOGERR("Db::Open: error while trying to open database from [" << + LOGERR("Db::Open: error while trying to open database from [" << dir << "]: " << aerr << "\n"); - return false; + return false; } if (stripped_p) - *stripped_p = mstripped; + *stripped_p = mstripped; return true; } @@ -1224,7 +1224,7 @@ bool Db::testDbDir(const string &dir, bool *stripped_p) bool Db::isopen() { if (m_ndb == 0) - return false; + return false; return m_ndb->m_isopen; } @@ -1233,7 +1233,7 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp, bool isquery) { if (m_config && m_config->getFieldTraits(fld, ftpp, isquery)) - return true; + return true; *ftpp = 0; return false; @@ -1243,7 +1243,7 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp, // document. We use a single object to split all of the document // fields and position jumps to separate fields class TextSplitDb : public TextSplitP { - public: +public: Xapian::Document &doc; // Xapian document // Base for document section. Gets large increment when we change // sections, to avoid cross-section proximity matches. @@ -1259,43 +1259,43 @@ class TextSplitDb : public TextSplitP { TextSplitDb(Xapian::WritableDatabase& _wdb, Xapian::Document &d, TermProc *prc) - : TextSplitP(prc), doc(d), basepos(1), curpos(0), wdb(_wdb) + : TextSplitP(prc), doc(d), basepos(1), curpos(0), wdb(_wdb) {} // Reimplement text_to_words to insert the begin and end anchor terms. virtual bool text_to_words(const string &in) { - string ermsg; + string ermsg; - try { - // Index the possibly prefixed start term. - doc.add_posting(ft.pfx + start_of_field_term, basepos, ft.wdfinc); - ++basepos; - } XCATCHERROR(ermsg); - if (!ermsg.empty()) { - LOGERR("Db: xapian add_posting error " << ermsg << "\n"); - goto out; - } + try { + // Index the possibly prefixed start term. + doc.add_posting(ft.pfx + start_of_field_term, basepos, ft.wdfinc); + ++basepos; + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR("Db: xapian add_posting error " << ermsg << "\n"); + goto out; + } - if (!TextSplitP::text_to_words(in)) { - LOGDEB("TextSplitDb: TextSplit::text_to_words failed\n"); - goto out; - } + if (!TextSplitP::text_to_words(in)) { + LOGDEB("TextSplitDb: TextSplit::text_to_words failed\n"); + goto out; + } - try { - // Index the possibly prefixed end term. - doc.add_posting(ft.pfx + end_of_field_term, basepos + curpos + 1, - ft.wdfinc); - ++basepos; - } XCATCHERROR(ermsg); - if (!ermsg.empty()) { - LOGERR("Db: xapian add_posting error " << ermsg << "\n"); - goto out; - } + try { + // Index the possibly prefixed end term. + doc.add_posting(ft.pfx + end_of_field_term, basepos + curpos + 1, + ft.wdfinc); + ++basepos; + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR("Db: xapian add_posting error " << ermsg << "\n"); + goto out; + } out: - basepos += curpos + 100; - return true; + basepos += curpos + 100; + return true; } void setTraits(const FieldTraits& ftp) @@ -1318,74 +1318,74 @@ public: bool takeword(const std::string &term, int pos, int, int) { - // Compute absolute position (pos is relative to current segment), - // and remember relative. 
- m_ts->curpos = pos; - pos += m_ts->basepos; - // Don't try to add empty term Xapian doesnt like it... Safety check - // this should not happen. - if (term.empty()) - return true; - string ermsg; - try { - // Index without prefix, using the field-specific weighting - LOGDEB1("Emitting term at " << pos << " : [" << term << "]\n"); + // Compute absolute position (pos is relative to current segment), + // and remember relative. + m_ts->curpos = pos; + pos += m_ts->basepos; + // Don't try to add empty term Xapian doesnt like it... Safety check + // this should not happen. + if (term.empty()) + return true; + string ermsg; + try { + // Index without prefix, using the field-specific weighting + LOGDEB1("Emitting term at " << pos << " : [" << term << "]\n"); if (!m_ts->ft.pfxonly) m_ts->doc.add_posting(term, pos, m_ts->ft.wdfinc); #ifdef TESTING_XAPIAN_SPELL - if (Db::isSpellingCandidate(term, false)) { - m_ts->wdb.add_spelling(term); - } + if (Db::isSpellingCandidate(term, false)) { + m_ts->wdb.add_spelling(term); + } #endif - // Index the prefixed term. - if (!m_ts->ft.pfx.empty()) { - m_ts->doc.add_posting(m_ts->ft.pfx + term, pos, + // Index the prefixed term. + if (!m_ts->ft.pfx.empty()) { + m_ts->doc.add_posting(m_ts->ft.pfx + term, pos, m_ts->ft.wdfinc); - } - return true; - } XCATCHERROR(ermsg); - LOGERR("Db: xapian add_posting error " << ermsg << "\n"); - return false; + } + return true; + } XCATCHERROR(ermsg); + LOGERR("Db: xapian add_posting error " << ermsg << "\n"); + return false; } void newpage(int pos) { - pos += m_ts->basepos; - if (pos < int(baseTextPosition)) { - LOGDEB("newpage: not in body: " << pos << "\n"); - return; - } + pos += m_ts->basepos; + if (pos < int(baseTextPosition)) { + LOGDEB("newpage: not in body: " << pos << "\n"); + return; + } - m_ts->doc.add_posting(m_ts->ft.pfx + page_break_term, pos); - if (pos == m_lastpagepos) { - m_pageincr++; - LOGDEB2("newpage: same pos, pageincr " << m_pageincr << + m_ts->doc.add_posting(m_ts->ft.pfx + page_break_term, pos); + if (pos == m_lastpagepos) { + m_pageincr++; + LOGDEB2("newpage: same pos, pageincr " << m_pageincr << " lastpagepos " << m_lastpagepos << "\n"); - } else { - LOGDEB2("newpage: pos change, pageincr " << m_pageincr << + } else { + LOGDEB2("newpage: pos change, pageincr " << m_pageincr << " lastpagepos " << m_lastpagepos << "\n"); - if (m_pageincr > 0) { - // Remember the multiple page break at this position - unsigned int relpos = m_lastpagepos - baseTextPosition; - LOGDEB2("Remembering multiple page break. Relpos " << relpos << + if (m_pageincr > 0) { + // Remember the multiple page break at this position + unsigned int relpos = m_lastpagepos - baseTextPosition; + LOGDEB2("Remembering multiple page break. Relpos " << relpos << " cnt " << m_pageincr << "\n"); - m_pageincrvec.push_back(pair(relpos, m_pageincr)); - } - m_pageincr = 0; - } - m_lastpagepos = pos; + m_pageincrvec.push_back(pair(relpos, m_pageincr)); + } + m_pageincr = 0; + } + m_lastpagepos = pos; } virtual bool flush() { - if (m_pageincr > 0) { - unsigned int relpos = m_lastpagepos - baseTextPosition; - LOGDEB2("Remembering multiple page break. Position " << relpos << + if (m_pageincr > 0) { + unsigned int relpos = m_lastpagepos - baseTextPosition; + LOGDEB2("Remembering multiple page break. 
Position " << relpos << " cnt " << m_pageincr << "\n"); - m_pageincrvec.push_back(pair(relpos, m_pageincr)); - m_pageincr = 0; - } - return TermProc::flush(); + m_pageincrvec.push_back(pair(relpos, m_pageincr)); + m_pageincr = 0; + } + return TermProc::flush(); } TextSplitDb *m_ts; @@ -1405,7 +1405,7 @@ bool Db::getSpellingSuggestions(const string& word, vector& suggs) LOGDEB("Db::getSpellingSuggestions:[" << word << "]\n"); suggs.clear(); if (nullptr == m_ndb) { - return false; + return false; } string term = word; @@ -1473,11 +1473,11 @@ void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen) LOGDEB1("Db::setAbstractParams: trunc " << idxtrunc << " syntlen " << syntlen << " ctxlen " << syntctxlen << "\n"); if (idxtrunc >= 0) - m_idxAbsTruncLen = idxtrunc; + m_idxAbsTruncLen = idxtrunc; if (syntlen > 0) - m_synthAbsLen = syntlen; + m_synthAbsLen = syntlen; if (syntctxlen > 0) - m_synthAbsWordCtxLen = syntctxlen; + m_synthAbsWordCtxLen = syntctxlen; } bool Db::setSynGroupsFile(const string& fn) @@ -1496,7 +1496,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) { LOGDEB("Db::add: udi [" << udi << "] parent [" << parent_udi << "]\n"); if (m_ndb == 0) - return false; + return false; // This document is potentially going to be passed to the index // update thread. The reference counters are not mt-safe, so we @@ -1513,7 +1513,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) TermProcPrep tpprep(nxt); if (o_index_stripchars) - nxt = &tpprep; + nxt = &tpprep; TextSplitDb splitter(m_ndb->xwdb, newdocument, nxt); tpidx.setTSD(&splitter); @@ -1524,14 +1524,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) string rawztext; // Doc compressed text if (doc.onlyxattr) { - // Only updating an existing doc with new extended attributes - // data. Need to read the old doc and its data record - // first. This is so different from the normal processing that - // it uses a fully separate code path (with some duplication - // unfortunately) - if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument)) { + // Only updating an existing doc with new extended attributes + // data. Need to read the old doc and its data record + // first. This is so different from the normal processing that + // it uses a fully separate code path (with some duplication + // unfortunately) + if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument)) { delete newdocument_ptr; - return false; + return false; } } else { @@ -1539,24 +1539,24 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) doc.text = truncate_to_word(doc.text, m_idxTextTruncateLen); } - // If the ipath is like a path, index the last element. This is - // for compound documents like zip and chm for which the filter - // uses the file path as ipath. - if (!doc.ipath.empty() && - doc.ipath.find_first_not_of("0123456789") != string::npos) { - string utf8ipathlast; - // There is no way in hell we could have an idea of the - // charset here, so let's hope it's ascii or utf-8. We call - // transcode to strip the bad chars and pray - if (transcode(path_getsimple(doc.ipath), utf8ipathlast, - "UTF-8", "UTF-8")) { - splitter.text_to_words(utf8ipathlast); - } - } + // If the ipath is like a path, index the last element. This is + // for compound documents like zip and chm for which the filter + // uses the file path as ipath. 
+ if (!doc.ipath.empty() && + doc.ipath.find_first_not_of("0123456789") != string::npos) { + string utf8ipathlast; + // There is no way in hell we could have an idea of the + // charset here, so let's hope it's ascii or utf-8. We call + // transcode to strip the bad chars and pray + if (transcode(path_getsimple(doc.ipath), utf8ipathlast, + "UTF-8", "UTF-8")) { + splitter.text_to_words(utf8ipathlast); + } + } - // Split and index the path from the url for path-based filtering - { - string path = url_gpathS(doc.url); + // Split and index the path from the url for path-based filtering + { + string path = url_gpathS(doc.url); #ifdef _WIN32 // Windows file names are case-insensitive, so we @@ -1565,36 +1565,36 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) unacmaybefold(upath, path, "UTF-8", UNACOP_FOLD); #endif - vector vpath; - stringToTokens(path, vpath, "/"); - // If vpath is not /, the last elt is the file/dir name, not a - // part of the path. - if (vpath.size()) - vpath.resize(vpath.size()-1); - splitter.curpos = 0; - newdocument.add_posting(wrap_prefix(pathelt_prefix), - splitter.basepos + splitter.curpos++); - for (vector::iterator it = vpath.begin(); - it != vpath.end(); it++){ - if (it->length() > 230) { - // Just truncate it. May still be useful because - // of wildcards - *it = it->substr(0, 230); - } - newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, - splitter.basepos + splitter.curpos++); - } + vector vpath; + stringToTokens(path, vpath, "/"); + // If vpath is not /, the last elt is the file/dir name, not a + // part of the path. + if (vpath.size()) + vpath.resize(vpath.size()-1); + splitter.curpos = 0; + newdocument.add_posting(wrap_prefix(pathelt_prefix), + splitter.basepos + splitter.curpos++); + for (vector::iterator it = vpath.begin(); + it != vpath.end(); it++){ + if (it->length() > 230) { + // Just truncate it. May still be useful because + // of wildcards + *it = it->substr(0, 230); + } + newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, + splitter.basepos + splitter.curpos++); + } splitter.basepos += splitter.curpos + 100; - } + } - // Index textual metadata. These are all indexed as text with - // positions, as we may want to do phrase searches with them (this - // makes no sense for keywords by the way). - // - // The order has no importance, and we set a position gap of 100 - // between fields to avoid false proximity matches. - for (const auto& entry: doc.meta) { - if (entry.second.empty()) { + // Index textual metadata. These are all indexed as text with + // positions, as we may want to do phrase searches with them (this + // makes no sense for keywords by the way). + // + // The order has no importance, and we set a position gap of 100 + // between fields to avoid false proximity matches. 
+ for (const auto& entry: doc.meta) { + if (entry.second.empty()) { continue; } const FieldTraits *ftp{nullptr}; @@ -1623,22 +1623,22 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) LOGDEB0("Db::add: no prefix for field [" << entry.first << "], no indexing\n"); } - } + } // Reset to no prefix and default params splitter.setTraits(FieldTraits()); - if (splitter.curpos < baseTextPosition) - splitter.basepos = baseTextPosition; + if (splitter.curpos < baseTextPosition) + splitter.basepos = baseTextPosition; - // Split and index body text - LOGDEB2("Db::add: split body: [" << doc.text << "]\n"); + // Split and index body text + LOGDEB2("Db::add: split body: [" << doc.text << "]\n"); #ifdef TEXTSPLIT_STATS - splitter.resetStats(); + splitter.resetStats(); #endif - if (!splitter.text_to_words(doc.text)) { - LOGDEB("Db::addOrUpdate: split failed for main text\n"); + if (!splitter.text_to_words(doc.text)) { + LOGDEB("Db::addOrUpdate: split failed for main text\n"); } else { if (m_ndb->m_storetext) { ZLibUtBuf buf; @@ -1648,142 +1648,142 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) } #ifdef TEXTSPLIT_STATS - // Reject bad data. unrecognized base64 text is characterized by - // high avg word length and high variation (because there are - // word-splitters like +/ inside the data). - TextSplit::Stats::Values v = splitter.getStats(); - // v.avglen > 15 && v.sigma > 12 - if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) { - LOGINFO("RclDb::addOrUpdate: rejecting doc for bad stats count " << + // Reject bad data. unrecognized base64 text is characterized by + // high avg word length and high variation (because there are + // word-splitters like +/ inside the data). + TextSplit::Stats::Values v = splitter.getStats(); + // v.avglen > 15 && v.sigma > 12 + if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) { + LOGINFO("RclDb::addOrUpdate: rejecting doc for bad stats count " << v.count << " avglen " << v.avglen << " sigma " << v.sigma << " url [" << doc.url << "] ipath [" << doc.ipath << "] text " << doc.text << "\n"); delete newdocument_ptr; - return true; - } + return true; + } #endif - ////// Special terms for other metadata. No positions for these. - // Mime type - newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype); + ////// Special terms for other metadata. No positions for these. + // Mime type + newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype); - // Simple file name indexed unsplit for specific "file name" - // searches. This is not the same as a filename: clause inside the - // query language. - // We also add a term for the filename extension if any. - string utf8fn; - if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) { - string fn; - if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) { - // We should truncate after extracting the extension, - // but this is a pathological case anyway - if (fn.size() > 230) - utf8truncate(fn, 230); - string::size_type pos = fn.rfind('.'); - if (pos != string::npos && pos != fn.length() - 1) { - newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + - fn.substr(pos + 1)); - } - newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn,0); - } - } + // Simple file name indexed unsplit for specific "file name" + // searches. This is not the same as a filename: clause inside the + // query language. + // We also add a term for the filename extension if any. 
+ string utf8fn; + if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) { + string fn; + if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) { + // We should truncate after extracting the extension, + // but this is a pathological case anyway + if (fn.size() > 230) + utf8truncate(fn, 230); + string::size_type pos = fn.rfind('.'); + if (pos != string::npos && pos != fn.length() - 1) { + newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + + fn.substr(pos + 1)); + } + newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn,0); + } + } - newdocument.add_boolean_term(uniterm); - // Parent term. This is used to find all descendents, mostly - // to delete them when the parent goes away - if (!parent_udi.empty()) { - newdocument.add_boolean_term(make_parentterm(parent_udi)); - } + newdocument.add_boolean_term(uniterm); + // Parent term. This is used to find all descendents, mostly + // to delete them when the parent goes away + if (!parent_udi.empty()) { + newdocument.add_boolean_term(make_parentterm(parent_udi)); + } - // Fields used for selecting by date. Note that this only - // works for years AD 0-9999 (no crash elsewhere, but things - // won't work). - time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : - doc.dmtime.c_str()); + // Fields used for selecting by date. Note that this only + // works for years AD 0-9999 (no crash elsewhere, but things + // won't work). + time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : + doc.dmtime.c_str()); struct tm tmb; localtime_r(&mtime, &tmb); char buf[50]; // It's actually 9, but use 50 to suppress warnings. snprintf(buf, 50, "%04d%02d%02d", tmb.tm_year+1900, tmb.tm_mon + 1, tmb.tm_mday); - // Date (YYYYMMDD) - newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); - // Month (YYYYMM) - buf[6] = '\0'; - newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf)); - // Year (YYYY) - buf[4] = '\0'; - newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); + // Date (YYYYMMDD) + newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); + // Month (YYYYMM) + buf[6] = '\0'; + newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf)); + // Year (YYYY) + buf[4] = '\0'; + newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); - ////////////////////////////////////////////////////////////////// - // Document data record. omindex has the following nl separated fields: - // - url - // - sample - // - caption (title limited to 100 chars) - // - mime type - // - // The title, author, abstract and keywords fields are special, - // they always get stored in the document data - // record. Configurable other fields can be, too. - // - // We truncate stored fields abstract, title and keywords to - // reasonable lengths and suppress newlines (so that the data - // record can keep a simple syntax) + ////////////////////////////////////////////////////////////////// + // Document data record. omindex has the following nl separated fields: + // - url + // - sample + // - caption (title limited to 100 chars) + // - mime type + // + // The title, author, abstract and keywords fields are special, + // they always get stored in the document data + // record. Configurable other fields can be, too. 
-        //////////////////////////////////////////////////////////////////
-        // Document data record. omindex has the following nl separated fields:
-        // - url
-        // - sample
-        // - caption (title limited to 100 chars)
-        // - mime type
-        //
-        // The title, author, abstract and keywords fields are special,
-        // they always get stored in the document data
-        // record. Configurable other fields can be, too.
-        //
-        // We truncate stored fields abstract, title and keywords to
-        // reasonable lengths and suppress newlines (so that the data
-        // record can keep a simple syntax)
+        //////////////////////////////////////////////////////////////////
+        // Document data record. omindex has the following nl separated fields:
+        // - url
+        // - sample
+        // - caption (title limited to 100 chars)
+        // - mime type
+        //
+        // The title, author, abstract and keywords fields are special,
+        // they always get stored in the document data
+        // record. Configurable other fields can be, too.
+        //
+        // We truncate stored fields abstract, title and keywords to
+        // reasonable lengths and suppress newlines (so that the data
+        // record can keep a simple syntax)
 
-        string record;
-        RECORD_APPEND(record, Doc::keyurl, doc.url);
-        RECORD_APPEND(record, Doc::keytp, doc.mimetype);
-        // We left-zero-pad the times so that they are lexico-sortable
-        leftzeropad(doc.fmtime, 11);
-        RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
-        if (!doc.dmtime.empty()) {
-            leftzeropad(doc.dmtime, 11);
-            RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
-        }
-        RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
+        string record;
+        RECORD_APPEND(record, Doc::keyurl, doc.url);
+        RECORD_APPEND(record, Doc::keytp, doc.mimetype);
+        // We left-zero-pad the times so that they are lexico-sortable
+        leftzeropad(doc.fmtime, 11);
+        RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
+        if (!doc.dmtime.empty()) {
+            leftzeropad(doc.dmtime, 11);
+            RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
+        }
+        RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
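The left-zero-padding above exists so that the decimal time strings stored in the record compare correctly as plain strings. A sketch of the idea (the helper below is a hypothetical stand-in, not the Recoll implementation):

    #include <iostream>
    #include <string>

    // Pad a decimal string on the left with zeros so that lexicographic
    // order matches numeric order for all values up to the given width.
    static void leftzeropad(std::string& s, unsigned width)
    {
        if (s.length() < width)
            s = std::string(width - s.length(), '0') + s;
    }

    int main()
    {
        std::string a{"987654321"}, b{"1586848000"};
        std::cout << (a < b) << "\n";   // 0: raw string compare is wrong
        leftzeropad(a, 11);
        leftzeropad(b, 11);
        std::cout << (a < b) << "\n";   // 1: padded compare matches numbers
        return 0;
    }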
-        if (doc.fbytes.empty())
-            doc.fbytes = doc.pcbytes;
+        if (doc.fbytes.empty())
+            doc.fbytes = doc.pcbytes;
 
-        if (!doc.fbytes.empty()) {
-            RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
-            leftzeropad(doc.fbytes, 12);
-            newdocument.add_value(VALUE_SIZE, doc.fbytes);
-        }
-        if (doc.haschildren) {
-            newdocument.add_boolean_term(has_children_term);
-        }
-        if (!doc.pcbytes.empty())
-            RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
-        char sizebuf[30];
-        sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
-        RECORD_APPEND(record, Doc::keyds, sizebuf);
+        if (!doc.fbytes.empty()) {
+            RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
+            leftzeropad(doc.fbytes, 12);
+            newdocument.add_value(VALUE_SIZE, doc.fbytes);
+        }
+        if (doc.haschildren) {
+            newdocument.add_boolean_term(has_children_term);
+        }
+        if (!doc.pcbytes.empty())
+            RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
+        char sizebuf[30];
+        sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
+        RECORD_APPEND(record, Doc::keyds, sizebuf);
 
-        // Note that we add the signature both as a value and in the data record
-        if (!doc.sig.empty()) {
-            RECORD_APPEND(record, Doc::keysig, doc.sig);
-            newdocument.add_value(VALUE_SIG, doc.sig);
-        }
+        // Note that we add the signature both as a value and in the data record
+        if (!doc.sig.empty()) {
+            RECORD_APPEND(record, Doc::keysig, doc.sig);
+            newdocument.add_value(VALUE_SIG, doc.sig);
+        }
 
-        if (!doc.ipath.empty())
-            RECORD_APPEND(record, Doc::keyipt, doc.ipath);
+        if (!doc.ipath.empty())
+            RECORD_APPEND(record, Doc::keyipt, doc.ipath);
 
         // Fields from the Meta array. Handle title specially because it has a
         // different name inside the data record (history...)
         string& ttref = doc.meta[Doc::keytt];
         ttref = neutchars(truncate_to_word(ttref, m_idxMetaStoredLen), cstr_nc);
-        if (!ttref.empty()) {
-            RECORD_APPEND(record, cstr_caption, ttref);
+        if (!ttref.empty()) {
+            RECORD_APPEND(record, cstr_caption, ttref);
             ttref.clear();
         }
 
-        // If abstract is empty, we make up one with the beginning of the
-        // document. This is then not indexed, but part of the doc data so
-        // that we can return it to a query without having to decode the
-        // original file.
-        // Note that the map accesses by operator[] create empty entries if they
-        // don't exist yet.
+        // If abstract is empty, we make up one with the beginning of the
+        // document. This is then not indexed, but part of the doc data so
+        // that we can return it to a query without having to decode the
+        // original file.
+        // Note that the map accesses by operator[] create empty entries if they
+        // don't exist yet.
         if (m_idxAbsTruncLen > 0) {
             string& absref = doc.meta[Doc::keyabs];
             trimstring(absref, " \t\r\n");
@@ -1805,17 +1805,17 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
         }
 
         // Append all regular "stored" meta fields
-        const set<string>& stored = m_config->getStoredFields();
-        for (set<string>::const_iterator it = stored.begin();
-             it != stored.end(); it++) {
-            string nm = m_config->fieldCanon(*it);
-            if (!doc.meta[nm].empty()) {
-                string value =
-                    neutchars(truncate_to_word(doc.meta[nm],
+        const set<string>& stored = m_config->getStoredFields();
+        for (set<string>::const_iterator it = stored.begin();
+             it != stored.end(); it++) {
+            string nm = m_config->fieldCanon(*it);
+            if (!doc.meta[nm].empty()) {
+                string value =
+                    neutchars(truncate_to_word(doc.meta[nm],
                                                m_idxMetaStoredLen), cstr_nc);
-                RECORD_APPEND(record, nm, value);
-            }
-        }
+                RECORD_APPEND(record, nm, value);
+            }
+        }
 
         // At this point, if the document "filename" field was empty,
         // try to store the "container file name" value. This is done
@@ -1825,64 +1825,64 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
     const string *fnp = 0;
     if (!doc.peekmeta(Rcl::Doc::keyfn, &fnp) || fnp->empty()) {
         if (doc.peekmeta(Rcl::Doc::keytcfn, &fnp) && !fnp->empty()) {
-            string value =
-                neutchars(truncate_to_word(*fnp,
+            string value =
+                neutchars(truncate_to_word(*fnp,
                                            m_idxMetaStoredLen), cstr_nc);
-            RECORD_APPEND(record, Rcl::Doc::keyfn, value);
+            RECORD_APPEND(record, Rcl::Doc::keyfn, value);
         }
     }
 
-        // If empty pages (multiple break at same pos) were recorded, save
-        // them (this is because we have no way to record them in the
-        // Xapian list
-        if (!tpidx.m_pageincrvec.empty()) {
-            ostringstream multibreaks;
-            for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
-                if (i != 0)
-                    multibreaks << ",";
-                multibreaks << tpidx.m_pageincrvec[i].first << "," <<
-                    tpidx.m_pageincrvec[i].second;
-            }
-            RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
-        }
+        // If empty pages (multiple break at same pos) were recorded, save
+        // them (this is because we have no way to record them in the
+        // Xapian list
+        if (!tpidx.m_pageincrvec.empty()) {
+            ostringstream multibreaks;
+            for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
+                if (i != 0)
+                    multibreaks << ",";
+                multibreaks << tpidx.m_pageincrvec[i].first << "," <<
+                    tpidx.m_pageincrvec[i].second;
+            }
+            RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
+        }
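The cstr_mbreaks value built above is a flat comma-separated list of (position, increment) pairs, e.g. "12,2,40,1". A hypothetical decoder for that format, just to make the encoding concrete (decodeMBreaks() is illustrative, not a Recoll function):

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    static std::vector<std::pair<int,int>> decodeMBreaks(const std::string& s)
    {
        std::vector<std::pair<int,int>> out;
        std::istringstream in(s);
        std::string tok;
        int pos = 0;
        bool havepos = false;
        while (std::getline(in, tok, ',')) {
            if (!havepos) {
                pos = std::stoi(tok);   // first token of the pair: position
                havepos = true;
            } else {
                out.push_back({pos, std::stoi(tok)});  // second: increment
                havepos = false;
            }
        }
        return out;
    }

    int main()
    {
        for (const auto& [p, incr] : decodeMBreaks("12,2,40,1"))
            std::cout << "page break at pos " << p << ", incr " << incr << "\n";
        return 0;
    }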
-        // If the file's md5 was computed, add value and term.
-        // The value is optionally used for query result duplicate elimination,
-        // and the term to find the duplicates.
-        // We don't do this for empty docs.
-        const string *md5;
-        if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
-            md5->compare(cstr_md5empty)) {
-            string digest;
-            MD5HexScan(*md5, digest);
-            newdocument.add_value(VALUE_MD5, digest);
-            newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
-        }
+        // If the file's md5 was computed, add value and term.
+        // The value is optionally used for query result duplicate elimination,
+        // and the term to find the duplicates.
+        // We don't do this for empty docs.
+        const string *md5;
+        if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
+            md5->compare(cstr_md5empty)) {
+            string digest;
+            MD5HexScan(*md5, digest);
+            newdocument.add_value(VALUE_MD5, digest);
+            newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
+        }
 
-        LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
-        newdocument.set_data(record);
+        LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
+        newdocument.set_data(record);
     }
 
 #ifdef IDX_THREADS
     if (m_ndb->m_havewriteq) {
-        DbUpdTask *tp = new DbUpdTask(
+        DbUpdTask *tp = new DbUpdTask(
             DbUpdTask::AddOrUpdate, udi, uniterm, newdocument_ptr,
             doc.text.length(), rawztext);
-        if (!m_ndb->m_wqueue.put(tp)) {
-            LOGERR("Db::addOrUpdate:Cant queue task\n");
+        if (!m_ndb->m_wqueue.put(tp)) {
+            LOGERR("Db::addOrUpdate:Cant queue task\n");
             delete newdocument_ptr;
-            return false;
-        } else {
-            return true;
-        }
+            return false;
+        } else {
+            return true;
+        }
     }
 #endif
     return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
-                                   doc.text.length(), rawztext);
+                                   doc.text.length(), rawztext);
 }
 
 bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
-                                    Doc &doc, Xapian::Document& xdoc)
+                                    Doc &doc, Xapian::Document& xdoc)
 {
     LOGDEB0("Db::docToXdocXattrOnly\n");
 #ifdef IDX_THREADS
@@ -1891,8 +1891,8 @@ bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
 
     // Read existing document and its data record
     if (getDoc(udi, 0, xdoc) == 0) {
-        LOGERR("docToXdocXattrOnly: existing doc not found\n");
-        return false;
+        LOGERR("docToXdocXattrOnly: existing doc not found\n");
+        return false;
     }
     string data;
     XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason);
@@ -1904,20 +1904,20 @@ bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
 
     // Clear the term lists for the incoming fields and index the new values
     map<string, string>::iterator meta_it;
     for (const auto& ent : doc.meta) {
-        const FieldTraits *ftp;
-        if (!m_rcldb->fieldToTraits(ent.first, &ftp) || ftp->pfx.empty()) {
-            LOGDEB0("Db::xattrOnly: no prefix for field [" <<
+        const FieldTraits *ftp;
+        if (!m_rcldb->fieldToTraits(ent.first, &ftp) || ftp->pfx.empty()) {
+            LOGDEB0("Db::xattrOnly: no prefix for field [" <<
                     ent.first << "], skipped\n");
-            continue;
-        }
-        // Clear the previous terms for the field
-        clearField(xdoc, ftp->pfx, ftp->wdfinc);
-        LOGDEB0("Db::xattrOnly: field [" << ent.first << "] pfx [" <<
+            continue;
+        }
+        // Clear the previous terms for the field
+        clearField(xdoc, ftp->pfx, ftp->wdfinc);
+        LOGDEB0("Db::xattrOnly: field [" << ent.first << "] pfx [" <<
                 ftp->pfx << "] inc " << ftp->wdfinc << ": [" <<
                 ent.second << "]\n");
-        splitter->setTraits(*ftp);
-        if (!splitter->text_to_words(ent.second)) {
-            LOGDEB("Db::xattrOnly: split failed for " << ent.first << "\n");
+        splitter->setTraits(*ftp);
+        if (!splitter->text_to_words(ent.second)) {
+            LOGDEB("Db::xattrOnly: split failed for " << ent.first << "\n");
         }
     }
     xdoc.add_value(VALUE_SIG, doc.sig);
@@ -1925,22 +1925,22 @@ bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
 
     // Parse current data record into a dict for ease of processing
     ConfSimple datadic(data);
     if (!datadic.ok()) {
-        LOGERR("db::docToXdocXattrOnly: failed turning data rec to dict\n");
-        return false;
+        LOGERR("db::docToXdocXattrOnly: failed turning data rec to dict\n");
+        return false;
     }
 
     // For each "stored" field, check if set in doc metadata and
     // update the value if it is
     const set<string>& stored = m_rcldb->m_config->getStoredFields();
     for (set<string>::const_iterator it = stored.begin();
-         it != stored.end(); it++) {
-        string nm = m_rcldb->m_config->fieldCanon(*it);
-        if (doc.getmeta(nm, 0)) {
-            string value = neutchars(
+         it != stored.end(); it++) {
+        string nm = m_rcldb->m_config->fieldCanon(*it);
+        if (doc.getmeta(nm, 0)) {
+            string value = neutchars(
                 truncate_to_word(doc.meta[nm], m_rcldb->m_idxMetaStoredLen),
                 cstr_nc);
-            datadic.set(nm, value, "");
-        }
+            datadic.set(nm, value, "");
+        }
     }
 
     // Recreate the record. We want to do this with the local RECORD_APPEND
@@ -1948,10 +1948,10 @@ bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
     vector<string> names = datadic.getNames("");
     data.clear();
     for (vector<string>::const_iterator it = names.begin();
-         it != names.end(); it++) {
-        string value;
-        datadic.get(*it, value, "");
-        RECORD_APPEND(data, *it, value);
+         it != names.end(); it++) {
+        string value;
+        datadic.get(*it, value, "");
+        RECORD_APPEND(data, *it, value);
     }
     RECORD_APPEND(data, Doc::keysig, doc.sig);
     xdoc.set_data(data);
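The data record rebuilt above is a simple name-value text format which ConfSimple can parse back into a dictionary. A minimal sketch of parsing such a record, assuming one "name = value" line per field (the exact record syntax is not shown in this patch, so treat the format below as an assumption):

    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>

    static std::map<std::string, std::string> parseRecord(const std::string& rec)
    {
        std::map<std::string, std::string> dict;
        std::istringstream in(rec);
        std::string line;
        while (std::getline(in, line)) {
            std::string::size_type eq = line.find('=');
            if (eq == std::string::npos)
                continue;
            std::string nm = line.substr(0, eq);
            std::string val = line.substr(eq + 1);
            // Trim the single spaces around '=' used by the assumed format
            while (!nm.empty() && nm.back() == ' ') nm.pop_back();
            while (!val.empty() && val.front() == ' ') val.erase(0, 1);
            dict[nm] = val;
        }
        return dict;
    }

    int main()
    {
        std::string rec = "url = file:///tmp/a.txt\nmtype = text/plain\n";
        for (const auto& e : parseRecord(rec))
            std::cout << e.first << " -> " << e.second << "\n";
        return 0;
    }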
@@ -1962,18 +1962,18 @@ bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
 void Db::waitUpdIdle()
 {
     if (m_ndb->m_iswritable && m_ndb->m_havewriteq) {
-        Chrono chron;
-        m_ndb->m_wqueue.waitIdle();
-        // We flush here just for correct measurement of the thread work time
-        string ermsg;
-        try {
-            m_ndb->xwdb.commit();
-        } XCATCHERROR(ermsg);
-        if (!ermsg.empty()) {
-            LOGERR("Db::waitUpdIdle: flush() failed: " << ermsg << "\n");
-        }
-        m_ndb->m_totalworkns += chron.nanos();
-        LOGINFO("Db::waitUpdIdle: total xapian work " <<
+        Chrono chron;
+        m_ndb->m_wqueue.waitIdle();
+        // We flush here just for correct measurement of the thread work time
+        string ermsg;
+        try {
+            m_ndb->xwdb.commit();
+        } XCATCHERROR(ermsg);
+        if (!ermsg.empty()) {
+            LOGERR("Db::waitUpdIdle: flush() failed: " << ermsg << "\n");
+        }
+        m_ndb->m_totalworkns += chron.nanos();
+        LOGINFO("Db::waitUpdIdle: total xapian work " <<
                 lltodecstr(m_ndb->m_totalworkns/1000000) << " mS\n");
     }
 }
 
@@ -1983,12 +1983,12 @@ void Db::waitUpdIdle()
 bool Db::maybeflush(int64_t moretext)
 {
     if (m_flushMb > 0) {
-        m_curtxtsz += moretext;
-        if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) {
-            LOGINF("Db::add/delete: txt size >= " << m_flushMb <<
+        m_curtxtsz += moretext;
+        if ((m_curtxtsz - m_flushtxtsz) / MB >= m_flushMb) {
+            LOGINF("Db::add/delete: txt size >= " << m_flushMb <<
                    " Mb, flushing\n");
-            return doFlush();
-        }
+            return doFlush();
+        }
     }
     return true;
 }
 
@@ -1996,16 +1996,16 @@ bool Db::maybeflush(int64_t moretext)
 bool Db::doFlush()
 {
     if (!m_ndb) {
-        LOGERR("Db::doFLush: no ndb??\n");
-        return false;
+        LOGERR("Db::doFLush: no ndb??\n");
+        return false;
     }
     string ermsg;
     try {
-        m_ndb->xwdb.commit();
+        m_ndb->xwdb.commit();
     } XCATCHERROR(ermsg);
     if (!ermsg.empty()) {
-        LOGERR("Db::doFlush: flush() failed: " << ermsg << "\n");
-        return false;
+        LOGERR("Db::doFlush: flush() failed: " << ermsg << "\n");
+        return false;
     }
     m_flushtxtsz = m_curtxtsz;
     return true;
 }
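Db::maybeflush() only commits once the text indexed since the last flush exceeds m_flushMb megabytes, which amortizes Xapian commit cost over many documents. A toy model of that accounting (FlushModel is illustrative; the commit is replaced by bookkeeping, and the threshold value is assumed):

    #include <cstdint>
    #include <iostream>

    struct FlushModel {
        int64_t flushMb{10};        // configured threshold (assumed value)
        int64_t curtxtsz{0};        // total text seen so far
        int64_t flushtxtsz{0};      // total at the time of the last flush
        static constexpr int64_t MB = 1024 * 1024;

        bool maybeflush(int64_t moretext) {
            if (flushMb <= 0)
                return false;
            curtxtsz += moretext;
            if ((curtxtsz - flushtxtsz) / MB >= flushMb) {
                flushtxtsz = curtxtsz;   // stands in for xwdb.commit()
                return true;
            }
            return false;
        }
    };

    int main()
    {
        FlushModel m;
        std::cout << m.maybeflush(9 * FlushModel::MB) << "\n"; // 0: below limit
        std::cout << m.maybeflush(2 * FlushModel::MB) << "\n"; // 1: 11 MB pending
        return 0;
    }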
@@ -2072,12 +2072,12 @@ bool Db::needUpdate(const string &udi, const string& sig,
 
     // If we are doing an in place or full reset, no need to test.
     if (o_inPlaceReset || m_mode == DbTrunc) {
-        // For in place reset, pretend the doc existed, to enable
-        // subdoc purge. The value is only used as a boolean in this case.
-        if (docidp && o_inPlaceReset) {
-            *docidp = -1;
+        // For in place reset, pretend the doc existed, to enable
+        // subdoc purge. The value is only used as a boolean in this case.
+        if (docidp && o_inPlaceReset) {
+            *docidp = -1;
         }
-        return true;
+        return true;
     }
 
     string uniterm = make_uniterm(udi);
@@ -2149,7 +2149,7 @@ vector<string> Db::getStemLangs()
     LOGDEB("Db::getStemLang\n");
     vector<string> langs;
     if (m_ndb == 0 || m_ndb->m_isopen == false)
-        return langs;
+        return langs;
     StemDb db(m_ndb->xrdb);
     db.getMembers(langs);
     return langs;
@@ -2162,7 +2162,7 @@ bool Db::deleteStemDb(const string& lang)
 {
     LOGDEB("Db::deleteStemDb(" << lang << ")\n");
     if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable)
-        return false;
+        return false;
     XapWritableSynFamily db(m_ndb->xwdb, synFamStem);
     return db.deleteMember(lang);
 }
@@ -2177,8 +2177,8 @@ bool Db::createStemDbs(const vector<string>& langs)
 {
     LOGDEB("Db::createStemDbs\n");
     if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) {
-        LOGERR("createStemDb: db not open or not writable\n");
-        return false;
+        LOGERR("createStemDb: db not open or not writable\n");
+        return false;
     }
 
     return createExpansionDbs(m_ndb->xwdb, langs);
@@ -2194,16 +2194,16 @@ bool Db::purge()
 {
     LOGDEB("Db::purge\n");
     if (m_ndb == 0)
-        return false;
+        return false;
     LOGDEB("Db::purge: m_isopen " << m_ndb->m_isopen << " m_iswritable " <<
            m_ndb->m_iswritable << "\n");
     if (m_ndb->m_isopen == false || m_ndb->m_iswritable == false)
-        return false;
+        return false;
 
 #ifdef IDX_THREADS
     // If we manage our own write queue, make sure it's drained and closed
     if (m_ndb->m_havewriteq)
-        m_ndb->m_wqueue.setTerminateAndWait();
+        m_ndb->m_wqueue.setTerminateAndWait();
     // else we need to lock out other top level threads. This is just
     // a precaution as they should have been waited for by the top
     // level actor at this point
@@ -2229,39 +2229,39 @@ bool Db::purge()
     // flag is not set (we did not see its source during indexing).
     int purgecount = 0;
     for (Xapian::docid docid = 1; docid < updated.size(); ++docid) {
-        if (!updated[docid]) {
-            if ((purgecount+1) % 100 == 0) {
-                try {
-                    CancelCheck::instance().checkCancel();
-                } catch(CancelExcept) {
-                    LOGINFO("Db::purge: partially cancelled\n");
-                    break;
-                }
-            }
+        if (!updated[docid]) {
+            if ((purgecount+1) % 100 == 0) {
+                try {
+                    CancelCheck::instance().checkCancel();
+                } catch(CancelExcept) {
+                    LOGINFO("Db::purge: partially cancelled\n");
+                    break;
+                }
+            }
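The purge loop above only checks for user cancellation once per hundred deletions, keeping the common path cheap. The same batching pattern in isolation (userRequestedCancel() is a boolean stand-in for the exception-based CancelCheck used by the real code):

    #include <iostream>

    // Stand-in for CancelCheck::instance().checkCancel()
    static bool userRequestedCancel()
    {
        return false;
    }

    int main()
    {
        int purgecount = 0;
        for (int docid = 1; docid < 1000; docid++) {
            // Only probe for cancellation every 100 deletions
            if ((purgecount + 1) % 100 == 0 && userRequestedCancel()) {
                std::cout << "partially cancelled\n";
                break;
            }
            // ... delete document docid here ...
            purgecount++;
        }
        std::cout << "purged " << purgecount << " docs\n";
        return 0;
    }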
-            try {
-                if (m_flushMb > 0) {
-                    // We use an average term length of 5 for
-                    // estimating the doc sizes which is probably not
-                    // accurate but gives rough consistency with what
-                    // we do for add/update. I should fetch the doc
-                    // size from the data record, but this would be
-                    // bad for performance.
-                    Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
-                    maybeflush(trms * 5);
-                }
-                m_ndb->deleteDocument(docid);
-                LOGDEB("Db::purge: deleted document #" << docid << "\n");
-            } catch (const Xapian::DocNotFoundError &) {
-                LOGDEB0("Db::purge: document #" << docid << " not found\n");
-            } catch (const Xapian::Error &e) {
-                LOGERR("Db::purge: document #" << docid << ": " <<
+            try {
+                if (m_flushMb > 0) {
+                    // We use an average term length of 5 for
+                    // estimating the doc sizes which is probably not
+                    // accurate but gives rough consistency with what
+                    // we do for add/update. I should fetch the doc
+                    // size from the data record, but this would be
+                    // bad for performance.
+                    Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
+                    maybeflush(trms * 5);
+                }
+                m_ndb->deleteDocument(docid);
+                LOGDEB("Db::purge: deleted document #" << docid << "\n");
+            } catch (const Xapian::DocNotFoundError &) {
+                LOGDEB0("Db::purge: document #" << docid << " not found\n");
+            } catch (const Xapian::Error &e) {
+                LOGERR("Db::purge: document #" << docid << ": " <<
                        e.get_msg() << "\n");
-            } catch (...) {
-                LOGERR("Db::purge: document #" << docid << ": unknown error\n");
-            }
-            purgecount++;
-        }
+            } catch (...) {
+                LOGERR("Db::purge: document #" << docid << ": unknown error\n");
+            }
+            purgecount++;
+        }
     }
 
     m_reason.clear();
@@ -2285,15 +2285,15 @@ bool Db::docExists(const string& uniterm)
 
     string ermsg;
     try {
-        Xapian::PostingIterator docid = m_ndb->xrdb.postlist_begin(uniterm);
-        if (docid == m_ndb->xrdb.postlist_end(uniterm)) {
-            return false;
+        Xapian::PostingIterator docid = m_ndb->xrdb.postlist_begin(uniterm);
+        if (docid == m_ndb->xrdb.postlist_end(uniterm)) {
+            return false;
         } else {
-            return true;
-        }
+            return true;
+        }
     } XCATCHERROR(ermsg);
     if (!ermsg.empty()) {
-        LOGERR("Db::docExists(" << uniterm << ") " << ermsg << "\n");
+        LOGERR("Db::docExists(" << uniterm << ") " << ermsg << "\n");
     }
     return false;
 }
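Db::docExists() reduces existence testing to "is the posting list of the document's unique term non-empty". A minimal standalone version against the real Xapian::Database::postlist_begin()/postlist_end() API (the database path and term value below are placeholders):

    #include <iostream>
    #include <string>
    #include <xapian.h>

    int main()
    {
        try {
            Xapian::Database db("/path/to/xapiandb");   // placeholder path
            std::string uniterm = "Qexample-udi";       // placeholder unique term
            // Non-empty posting list <=> a document carries the term
            bool exists = db.postlist_begin(uniterm) != db.postlist_end(uniterm);
            std::cout << (exists ? "present" : "absent") << "\n";
        } catch (const Xapian::Error& e) {
            std::cerr << e.get_msg() << "\n";
            return 1;
        }
        return 0;
    }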
@@ -2303,26 +2303,26 @@ bool Db::purgeFile(const string &udi, bool *existed)
 {
     LOGDEB("Db:purgeFile: [" << udi << "]\n");
     if (m_ndb == 0 || !m_ndb->m_iswritable)
-        return false;
+        return false;
 
     string uniterm = make_uniterm(udi);
     bool exists = docExists(uniterm);
     if (existed)
-        *existed = exists;
+        *existed = exists;
     if (!exists)
-        return true;
+        return true;
 
 #ifdef IDX_THREADS
     if (m_ndb->m_havewriteq) {
         string rztxt;
-        DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm,
-                                      0, (size_t)-1, rztxt);
-        if (!m_ndb->m_wqueue.put(tp)) {
-            LOGERR("Db::purgeFile:Cant queue task\n");
-            return false;
-        } else {
-            return true;
-        }
+        DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm,
+                                      0, (size_t)-1, rztxt);
+        if (!m_ndb->m_wqueue.put(tp)) {
+            LOGERR("Db::purgeFile:Cant queue task\n");
+            return false;
+        } else {
+            return true;
+        }
     }
 #endif
     /* We get there if IDX_THREADS is not defined or there is no queue */
@@ -2336,21 +2336,21 @@ bool Db::purgeOrphans(const string &udi)
 {
     LOGDEB("Db:purgeOrphans: [" << udi << "]\n");
     if (m_ndb == 0 || !m_ndb->m_iswritable)
-        return false;
+        return false;
 
     string uniterm = make_uniterm(udi);
 
 #ifdef IDX_THREADS
     if (m_ndb->m_havewriteq) {
         string rztxt;
-        DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm,
-                                      0, (size_t)-1, rztxt);
-        if (!m_ndb->m_wqueue.put(tp)) {
-            LOGERR("Db::purgeFile:Cant queue task\n");
-            return false;
-        } else {
-            return true;
-        }
+        DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm,
+                                      0, (size_t)-1, rztxt);
+        if (!m_ndb->m_wqueue.put(tp)) {
+            LOGERR("Db::purgeFile:Cant queue task\n");
+            return false;
+        } else {
+            return true;
+        }
     }
 #endif
     /* We get there if IDX_THREADS is not defined or there is no queue */
@@ -2360,14 +2360,14 @@ bool Db::dbStats(DbStats& res, bool listfailed)
 {
     if (!m_ndb || !m_ndb->m_isopen)
-        return false;
+        return false;
     Xapian::Database xdb = m_ndb->xrdb;
     XAPTRY(res.dbdoccount = xdb.get_doccount();
            res.dbavgdoclen = xdb.get_avlength();
-           res.mindoclen = xdb.get_doclength_lower_bound();
-           res.maxdoclen = xdb.get_doclength_upper_bound();
-           , xdb, m_reason);
+           res.mindoclen = xdb.get_doclength_lower_bound();
+           res.maxdoclen = xdb.get_doclength_upper_bound();
+           , xdb, m_reason);
     if (!m_reason.empty())
         return false;
     if (!listfailed) {
@@ -2406,7 +2406,7 @@ bool Db::dbStats(DbStats& res, bool listfailed)
         }
     } XCATCHERROR(ermsg);
     if (!ermsg.empty()) {
-        LOGERR("Db::dbStats: " << ermsg << "\n");
+        LOGERR("Db::dbStats: " << ermsg << "\n");
         return false;
     }
     return true;
@@ -2450,34 +2450,34 @@ bool Db::getDoc(const string& udi, int idxi, Doc& doc)
 
     // Initialize what we can in any case. If this is history, caller
     // will make partial display in case of error
     if (m_ndb == 0)
-        return false;
+        return false;
     doc.meta[Rcl::Doc::keyrr] = "100%";
     doc.pc = 100;
 
     Xapian::Document xdoc;
     Xapian::docid docid;
     if (idxi >= 0 && (docid = m_ndb->getDoc(udi, idxi, xdoc))) {
-        string data = xdoc.get_data();
-        doc.meta[Rcl::Doc::keyudi] = udi;
-        return m_ndb->dbDataToRclDoc(docid, data, doc);
+        string data = xdoc.get_data();
+        doc.meta[Rcl::Doc::keyudi] = udi;
+        return m_ndb->dbDataToRclDoc(docid, data, doc);
     } else {
-        // Document found in history is no longer in the
-        // database. We return true (because there might be
-        // other ok docs further) but indicate the error with
-        // pc = -1
-        doc.pc = -1;
-        LOGINFO("Db:getDoc: no such doc in current index: [" << udi << "]\n");
-        return true;
+        // Document found in history is no longer in the
+        // database. We return true (because there might be
+        // other ok docs further) but indicate the error with
+        // pc = -1
+        doc.pc = -1;
+        LOGINFO("Db:getDoc: no such doc in current index: [" << udi << "]\n");
+        return true;
     }
 }
 
 bool Db::hasSubDocs(const Doc &idoc)
 {
     if (m_ndb == 0)
-        return false;
+        return false;
 
     string inudi;
     if (!idoc.getmeta(Doc::keyudi, &inudi) || inudi.empty()) {
-        LOGERR("Db::hasSubDocs: no input udi or empty\n");
-        return false;
+        LOGERR("Db::hasSubDocs: no input udi or empty\n");
+        return false;
     }
 
     LOGDEB1("Db::hasSubDocs: idxi " << idoc.idxi << " inudi [" <<
             inudi << "]\n");
     vector<Xapian::docid> docids;
     if (!m_ndb->subDocs(inudi, idoc.idxi, docids)) {
-        LOGDEB("Db::hasSubDocs: lower level subdocs failed\n");
-        return false;
+        LOGDEB("Db::hasSubDocs: lower level subdocs failed\n");
+        return false;
     }
     if (!docids.empty())
-        return true;
+        return true;
 
     // Check if doc has an "has_children" term
     if (m_ndb->hasTerm(inudi, idoc.idxi, has_children_term))
-        return true;
+        return true;
     return false;
 }
@@ -2506,12 +2506,12 @@ bool Db::hasSubDocs(const Doc &idoc)
 bool Db::getSubDocs(const Doc &idoc, vector<Doc>& subdocs)
 {
     if (m_ndb == 0)
-        return false;
+        return false;
 
     string inudi;
     if (!idoc.getmeta(Doc::keyudi, &inudi) || inudi.empty()) {
-        LOGERR("Db::getSubDocs: no input udi or empty\n");
-        return false;
+        LOGERR("Db::getSubDocs: no input udi or empty\n");
+        return false;
     }
 
     string rootudi;
@@ -2519,28 +2519,28 @@ bool Db::getSubDocs(const Doc &idoc, vector<Doc>& subdocs)
     LOGDEB0("Db::getSubDocs: idxi " << idoc.idxi << " inudi [" << inudi <<
             "] ipath [" << ipath << "]\n");
     if (ipath.empty()) {
-        // File-level doc. Use it as root
-        rootudi = inudi;
+        // File-level doc. Use it as root
+        rootudi = inudi;
     } else {
-        // See if we have a parent term
-        Xapian::Document xdoc;
-        if (!m_ndb->getDoc(inudi, idoc.idxi, xdoc)) {
-            LOGERR("Db::getSubDocs: can't get Xapian document\n");
-            return false;
-        }
-        Xapian::TermIterator xit;
-        XAPTRY(xit = xdoc.termlist_begin();
-               xit.skip_to(wrap_prefix(parent_prefix)),
-               m_ndb->xrdb, m_reason);
-        if (!m_reason.empty()) {
-            LOGERR("Db::getSubDocs: xapian error: " << m_reason << "\n");
-            return false;
-        }
-        if (xit == xdoc.termlist_end()) {
-            LOGERR("Db::getSubDocs: parent term not found\n");
-            return false;
-        }
-        rootudi = strip_prefix(*xit);
+        // See if we have a parent term
+        Xapian::Document xdoc;
+        if (!m_ndb->getDoc(inudi, idoc.idxi, xdoc)) {
+            LOGERR("Db::getSubDocs: can't get Xapian document\n");
+            return false;
+        }
+        Xapian::TermIterator xit;
+        XAPTRY(xit = xdoc.termlist_begin();
+               xit.skip_to(wrap_prefix(parent_prefix)),
+               m_ndb->xrdb, m_reason);
+        if (!m_reason.empty()) {
+            LOGERR("Db::getSubDocs: xapian error: " << m_reason << "\n");
+            return false;
+        }
+        if (xit == xdoc.termlist_end()) {
+            LOGERR("Db::getSubDocs: parent term not found\n");
+            return false;
+        }
+        rootudi = strip_prefix(*xit);
     }
 
     LOGDEB("Db::getSubDocs: root: [" << rootudi << "]\n");
@@ -2548,38 +2548,38 @@ bool Db::getSubDocs(const Doc &idoc, vector<Doc>& subdocs)
 
     // Retrieve all subdoc xapian ids for the root
     vector<Xapian::docid> docids;
     if (!m_ndb->subDocs(rootudi, idoc.idxi, docids)) {
-        LOGDEB("Db::getSubDocs: lower level subdocs failed\n");
-        return false;
+        LOGDEB("Db::getSubDocs: lower level subdocs failed\n");
+        return false;
     }
 
     // Retrieve doc, filter, and build output list
     for (int tries = 0; tries < 2; tries++) {
-        try {
-            for (vector<Xapian::docid>::const_iterator it = docids.begin();
-                 it != docids.end(); it++) {
-                Xapian::Document xdoc = m_ndb->xrdb.get_document(*it);
-                string data = xdoc.get_data();
-                string docudi;
-                m_ndb->xdocToUdi(xdoc, docudi);
-                Doc doc;
-                doc.meta[Doc::keyudi] = docudi;
-                doc.meta[Doc::keyrr] = "100%";
-                doc.pc = 100;
-                if (!m_ndb->dbDataToRclDoc(*it, data, doc)) {
-                    LOGERR("Db::getSubDocs: doc conversion error\n");
-                    return false;
-                }
+        try {
+            for (vector<Xapian::docid>::const_iterator it = docids.begin();
+                 it != docids.end(); it++) {
+                Xapian::Document xdoc = m_ndb->xrdb.get_document(*it);
+                string data = xdoc.get_data();
+                string docudi;
+                m_ndb->xdocToUdi(xdoc, docudi);
+                Doc doc;
+                doc.meta[Doc::keyudi] = docudi;
+                doc.meta[Doc::keyrr] = "100%";
+                doc.pc = 100;
+                if (!m_ndb->dbDataToRclDoc(*it, data, doc)) {
+                    LOGERR("Db::getSubDocs: doc conversion error\n");
+                    return false;
+                }
                 if (ipath.empty() ||
                     FileInterner::ipathContains(ipath, doc.ipath)) {
                     subdocs.push_back(doc);
                 }
-            }
-            return true;
-        } catch (const Xapian::DatabaseModifiedError &e) {
+            }
+            return true;
+        } catch (const Xapian::DatabaseModifiedError &e) {
             m_reason = e.get_msg();
-            m_ndb->xrdb.reopen();
+            m_ndb->xrdb.reopen();
             continue;
-        } XCATCHERROR(m_reason);
+        } XCATCHERROR(m_reason);
         break;
     }
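The two-pass loop above is the usual Xapian read pattern: if the database was committed to while we were reading, the snapshot becomes invalid, Xapian throws DatabaseModifiedError, and the reader calls reopen() and retries once. Skeleton of that idiom (work() is a placeholder for the document fetching done above, and the path is a placeholder):

    #include <iostream>
    #include <xapian.h>

    // Placeholder for the per-document work done inside the loop
    static void work(Xapian::Database& db)
    {
        std::cout << db.get_doccount() << " documents\n";
    }

    int main()
    {
        Xapian::Database db("/path/to/xapiandb");   // placeholder path
        for (int tries = 0; tries < 2; tries++) {
            try {
                work(db);
                break;                 // success: leave the retry loop
            } catch (const Xapian::DatabaseModifiedError&) {
                db.reopen();           // refresh the snapshot and retry once
                continue;
            }
        }
        return 0;
    }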
@@ -2590,12 +2590,12 @@ bool Db::getContainerDoc(const Doc &idoc, Doc& ctdoc)
 {
     if (m_ndb == 0)
-        return false;
+        return false;
 
     string inudi;
     if (!idoc.getmeta(Doc::keyudi, &inudi) || inudi.empty()) {
-        LOGERR("Db::getContainerDoc: no input udi or empty\n");
-        return false;
+        LOGERR("Db::getContainerDoc: no input udi or empty\n");
+        return false;
     }
 
     string rootudi;
@@ -2603,7 +2603,7 @@ bool Db::getContainerDoc(const Doc &idoc, Doc& ctdoc)
     LOGDEB0("Db::getContainerDoc: idxi " << idoc.idxi << " inudi [" << inudi <<
             "] ipath [" << ipath << "]\n");
     if (ipath.empty()) {
-        // File-level doc ??
+        // File-level doc ??
         ctdoc = idoc;
         return true;
     }
diff --git a/src/rcldb/termproc.h b/src/rcldb/termproc.h
index 9406de64..e0f5950d 100644
--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@@ -52,21 +52,18 @@ class TermProc {
 public:
     TermProc(TermProc* next) : m_next(next) {}
     virtual ~TermProc() {}
-    virtual bool takeword(const string &term, int pos, int bs, int be)
-    {
+    virtual bool takeword(const string &term, int pos, int bs, int be) {
         if (m_next)
             return m_next->takeword(term, pos, bs, be);
         else
             return true;
     }
     // newpage() is like takeword(), but for page breaks.
-    virtual void newpage(int pos)
-    {
+    virtual void newpage(int pos) {
         if (m_next)
             m_next->newpage(pos);
     }
-    virtual bool flush()
-    {
+    virtual bool flush() {
         if (m_next)
             return m_next->flush();
         else
@@ -137,7 +134,7 @@ public:
         // We don't generate a fatal error because of a bad term,
         // but one has to put the limit somewhere
         if (m_unacerrors > 500 &&
-            (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
+            (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
             // More than 1 error for every other term
             LOGERR("splitter::takeword: too many unac errors " <<
                    m_unacerrors << "/" << m_totalterms << "\n");
@@ -147,12 +144,12 @@ public:
         }
 
         if (otrm.empty()) {
-            // It may happen in some weird cases that the output from
-            // unac is empty (if the word actually consisted entirely
-            // of diacritics ...) The consequence is that a phrase
-            // search won't work without additional slack.
+            // It may happen in some weird cases that the output from
+            // unac is empty (if the word actually consisted entirely
+            // of diacritics ...) The consequence is that a phrase
+            // search won't work without additional slack.
             return true;
-        }
+        }
 
         // We should have a Japanese stemmer to handle this, but for
         // experimenting, let's do it here: remove 'prolonged sound
@@ -174,34 +171,34 @@ public:
             return true;
         }
 
-        // It may also occur that unac introduces spaces in the string
-        // (when removing isolated accents, may happen for Greek
-        // for example). This is a pathological situation. We
-        // index all the resulting terms at the same pos because
-        // the surrounding code is not designed to handle a pos
-        // change in here. This means that phrase searches and
-        // snippets will be wrong, but at least searching for the
-        // terms will work.
+        // It may also occur that unac introduces spaces in the string
+        // (when removing isolated accents, may happen for Greek
+        // for example). This is a pathological situation. We
+        // index all the resulting terms at the same pos because
+        // the surrounding code is not designed to handle a pos
+        // change in here. This means that phrase searches and
+        // snippets will be wrong, but at least searching for the
+        // terms will work.
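A standalone sketch of the fallback described in the comment above: when the transliterated term still contains spaces, every fragment is indexed at the original position, trading phrase accuracy for matchability (takeword_at() is a stand-in for the real takeword chain, and the whitespace split stands in for stringToTokens()):

    #include <iostream>
    #include <sstream>
    #include <string>

    static void takeword_at(const std::string& term, int pos)
    {
        std::cout << "index [" << term << "] at pos " << pos << "\n";
    }

    static void takeword_maybe_split(const std::string& otrm, int pos)
    {
        if (otrm.find(' ') == std::string::npos) {
            takeword_at(otrm, pos);      // normal case: one term, one pos
            return;
        }
        std::istringstream in(otrm);
        std::string frag;
        while (in >> frag)               // split on the introduced spaces
            takeword_at(frag, pos);      // same position for every fragment
    }

    int main()
    {
        takeword_maybe_split("abc def", 42);  // two terms, both at pos 42
        return 0;
    }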
+        bool hasspace = false;
+        for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
+            if (*it == ' ') {
+                hasspace=true;
+                break;
+            }
+        }
+        if (hasspace) {
             std::vector<string> terms;
-            stringToTokens(otrm, terms, " ", true);
-            for (std::vector<string>::const_iterator it = terms.begin();
-                 it < terms.end(); it++) {
-                if (!TermProc::takeword(*it, pos, bs, be)) {
-                    return false;
-                }
-            }
-            return true;
-        } else {
-            return TermProc::takeword(otrm, pos, bs, be);
-        }
+            stringToTokens(otrm, terms, " ", true);
+            for (std::vector<string>::const_iterator it = terms.begin();
+                 it < terms.end(); it++) {
+                if (!TermProc::takeword(*it, pos, bs, be)) {
+                    return false;
+                }
+            }
+            return true;
+        } else {
+            return TermProc::takeword(otrm, pos, bs, be);
+        }
     }
 
     virtual bool flush()