From 217b7018d695f272c8d3fe2a93f01b1254a3565e Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 26 Nov 2009 07:17:40 +0000 Subject: [PATCH] use only match terms to build doc abstract, not all query terms (might save a little effort) --- src/rcldb/rcldb.cpp | 42 +++++++++++++++++++++++++++++------------- src/rcldb/rclquery.cpp | 6 +++++- src/rcldb/rclquery.h | 1 + 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 736c3233..13fb3eff 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -173,9 +173,8 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, } // Remove prefixes (caps) from a list of terms. -static list noPrefixList(const list& in) +static void noPrefixList(const list& in, list& out) { - list out; for (list::const_iterator qit = in.begin(); qit != in.end(); qit++) { if ('A' <= qit->at(0) && qit->at(0) <= 'Z') { @@ -189,7 +188,6 @@ static list noPrefixList(const list& in) out.push_back(*qit); } } - return out; } //#define DEBUGABSTRACT 1 @@ -198,6 +196,14 @@ static list noPrefixList(const list& in) #else #define LOGABS LOGDEB2 #endif +static void listList(const string& what, const list&l) +{ + string a; + for (list::const_iterator it = l.begin(); it != l.end(); it++) { + a = a + *it + " "; + } + LOGDEB(("%s: %s\n", what.c_str(), a.c_str())); +} // Build a document abstract by extracting text chunks around the query terms // This uses the db termlists, not the original document. @@ -210,22 +216,32 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query) LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(), m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen)); - list iterms; - query->getQueryTerms(iterms); + list terms; - list terms = noPrefixList(iterms); - if (terms.empty()) { - return string(); + { + list iterms; + query->getMatchTerms(docid, iterms); + noPrefixList(iterms, terms); + if (terms.empty()) { + LOGDEB(("makeAbstract::Empty term list\n")); + return string(); + } } +// listList("Match terms: ", terms); - // Retrieve db-wide frequencies for the query terms + // Retrieve db-wide frequencies for the query terms (we do this once per + // query, using all the query terms, not only the document match terms) if (query->m_nq->termfreqs.empty()) { + list iqterms, qterms; + query->getQueryTerms(iqterms); + noPrefixList(iqterms, qterms); +// listList("Query terms: ", qterms); double doccnt = xrdb.get_doccount(); if (doccnt == 0) doccnt = 1; - for (list::const_iterator qit = terms.begin(); - qit != terms.end(); qit++) { + for (list::const_iterator qit = qterms.begin(); + qit != qterms.end(); qit++) { query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt; - LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), + LOGDEB(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), query->m_nq->termfreqs[*qit])); } LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms())); @@ -450,7 +466,7 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query) } #endif - LOGDEB(("makeAbstract:%d: extracting\n", chron.millis())); + LOGABS(("makeAbstract:%d: extracting\n", chron.millis())); // Finally build the abstract by walking the map (in order of position) string abstract; diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index 5a7e2531..c8dcfc24 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -225,6 +225,10 @@ bool Query::getQueryTerms(list& terms) } bool Query::getMatchTerms(const Doc& doc, list& terms) +{ + return getMatchTerms(doc.xdocid, terms); +} +bool Query::getMatchTerms(unsigned long xdocid, list& terms) { if (ISNULL(m_nq) || !m_nq->xenquire) { LOGERR(("Query::getMatchTerms: no query opened\n")); @@ -233,7 +237,7 @@ bool Query::getMatchTerms(const Doc& doc, list& terms) terms.clear(); Xapian::TermIterator it; - Xapian::docid id = Xapian::docid(doc.xdocid); + Xapian::docid id = Xapian::docid(xdocid); XAPTRY(terms.insert(terms.begin(), m_nq->xenquire->get_matching_terms_begin(id), diff --git a/src/rcldb/rclquery.h b/src/rcldb/rclquery.h index 9208b9e5..e09d3a9b 100644 --- a/src/rcldb/rclquery.h +++ b/src/rcldb/rclquery.h @@ -77,6 +77,7 @@ class Query { /** Return a list of terms which matched for a specific result document */ bool getMatchTerms(const Doc& doc, list& terms); + bool getMatchTerms(unsigned long xdocid, list& terms); /** Expand query to look for documents like the one passed in */ list expand(const Doc &doc);