From a5810508ed9ae822c48f701e95933aee4a7f0d33 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 17 May 2019 09:39:26 +0200 Subject: [PATCH] abstract: optimize the way we retrieve the wdfs by sorting the list of terms we query for. Big difference on very big docs --- src/rcldb/rclabstract.cpp | 60 +++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp index eec666c5..63bd4b4a 100644 --- a/src/rcldb/rclabstract.cpp +++ b/src/rcldb/rclabstract.cpp @@ -39,6 +39,8 @@ using namespace std; namespace Rcl { +static Chrono chron; + // This is used as a marker inside the abstract frag lists, but // normally doesn't remain in final output (which is built with a // custom sep. by our caller). @@ -140,8 +142,9 @@ double Query::Native::qualityTerms(Xapian::docid docid, const vector& terms, multimap >& byQ) { - LOGABS("qualityTerms\n"); + LOGABS("qualityTerms: entry " << chron.millis() << "mS\n"); setDbWideQTermsFreqs(); + LOGABS("qualityTerms: setDbWide..: " << chron.millis() << "mS\n"); map termQcoefs; double totalweight = 0; @@ -154,6 +157,7 @@ double Query::Native::qualityTerms(Xapian::docid docid, if (m_q->m_sd) { m_q->m_sd->getTerms(hld); } + LOGABS("qualityTerms: m_sd->getTerms(): " << chron.millis() << "mS\n"); // Group the input terms by the user term they were possibly // expanded from (by stemming) @@ -181,28 +185,47 @@ double Query::Native::qualityTerms(Xapian::docid docid, } byRootstr.append("\n"); } - LOGABS("\nqualityTerms: uterms to terms: " << byRootstr << "\n"); + LOGABS("qualityTerms: uterms to terms: " << chron.millis() << "mS " << + byRootstr << endl); } #endif - // Compute in-document and global frequencies for the groups. - map grpwdfs; - map grptfreqs; + // Compute in-document and global frequencies for the groups. We + // used to call termlist_begin() for each term. This was very slow + // on big documents and long term lists. 
We now compute a sorted + // list of terms (with pointers back to their root through a map), + // and just call skip_to repeatedly + vector allterms; + unordered_map toRoot; for (const auto& group : byRoot) { for (const auto& term : group.second) { - Xapian::TermIterator xtermit = xrdb.termlist_begin(docid); - xtermit.skip_to(term); - if (xtermit != xrdb.termlist_end(docid) && *xtermit == term) { - if (grpwdfs.find(group.first) != grpwdfs.end()) { - grpwdfs[group.first] = xtermit.get_wdf() / doclen; - grptfreqs[group.first] = termfreqs[term]; - } else { - grpwdfs[group.first] += xtermit.get_wdf() / doclen; - grptfreqs[group.first] += termfreqs[term]; - } - } - } + allterms.push_back(term); + toRoot[term] = group.first; + } } + sort(allterms.begin(), allterms.end()); + allterms.erase(unique(allterms.begin(), allterms.end()), allterms.end()); + + map grpwdfs; + map grptfreqs; + Xapian::TermIterator xtermit = xrdb.termlist_begin(docid); + for (const auto& term : allterms) { + const string& root = toRoot[term]; + xtermit.skip_to(term); + if (xtermit != xrdb.termlist_end(docid) && *xtermit == term) { + if (grpwdfs.find(root) == grpwdfs.end()) { + grpwdfs[root] = xtermit.get_wdf() / doclen; + grptfreqs[root] = termfreqs[term]; + } else { + grpwdfs[root] += xtermit.get_wdf() / doclen; + grptfreqs[root] += termfreqs[term]; + } + } else { + LOGDEB("qualityTerms: term not found in doc term list: " << term << + endl); + } + } + LOGABS("qualityTerms: freqs compute: " << chron.millis() << "mS\n"); // Build a sorted by quality container for the groups for (const auto& group : byRoot) { @@ -237,6 +260,7 @@ double Query::Native::qualityTerms(Xapian::docid docid, int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term) { LOGDEB("Query::Native::getFirstMatchPage\n"); + chron.restart(); if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) { LOGERR("Query::getFirstMatchPage: no db\n"); return -1; @@ -601,7 +625,7 @@ int 
Query::Native::makeAbstract(Xapian::docid docid, vector& vabs, int imaxoccs, int ictxwords) { - Chrono chron; + chron.restart(); LOGABS("makeAbstract: docid " << docid << " imaxoccs " << imaxoccs << " ictxwords " << ictxwords << "\n");