use only match terms to build doc abstract, not all query terms (might save a little effort)

This commit is contained in:
dockes 2009-11-26 07:17:40 +00:00
parent 58ba06eb9d
commit 217b7018d6
3 changed files with 35 additions and 14 deletions

View File

@ -173,9 +173,8 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
} }
// Remove prefixes (caps) from a list of terms. // Remove prefixes (caps) from a list of terms.
static list<string> noPrefixList(const list<string>& in) static void noPrefixList(const list<string>& in, list<string>& out)
{ {
list<string> out;
for (list<string>::const_iterator qit = in.begin(); for (list<string>::const_iterator qit = in.begin();
qit != in.end(); qit++) { qit != in.end(); qit++) {
if ('A' <= qit->at(0) && qit->at(0) <= 'Z') { if ('A' <= qit->at(0) && qit->at(0) <= 'Z') {
@ -189,7 +188,6 @@ static list<string> noPrefixList(const list<string>& in)
out.push_back(*qit); out.push_back(*qit);
} }
} }
return out;
} }
//#define DEBUGABSTRACT 1 //#define DEBUGABSTRACT 1
@ -198,6 +196,14 @@ static list<string> noPrefixList(const list<string>& in)
#else #else
#define LOGABS LOGDEB2 #define LOGABS LOGDEB2
#endif #endif
// Debug helper: log the contents of a string list on a single line.
//   what: label printed before the list contents
//   l:    strings to print; each one is followed by a single space
// Output goes through LOGDEB as "<what>: <term> <term> ... \n".
// NOTE(review): the call sites visible in makeAbstract() are commented out,
// so this may currently be unused -- confirm before relying on it.
static void listList(const string& what, const list<string>&l)
{
    string a;
    for (list<string>::const_iterator it = l.begin(); it != l.end(); ++it) {
        // Append in place: "a = a + *it + ..." would build temporaries and
        // copy the whole accumulated string on every iteration.
        a += *it;
        a += " ";
    }
    LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
}
// Build a document abstract by extracting text chunks around the query terms // Build a document abstract by extracting text chunks around the query terms
// This uses the db termlists, not the original document. // This uses the db termlists, not the original document.
@ -210,22 +216,32 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(), LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen)); m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
list<string> iterms; list<string> terms;
query->getQueryTerms(iterms);
list<string> terms = noPrefixList(iterms); {
if (terms.empty()) { list<string> iterms;
return string(); query->getMatchTerms(docid, iterms);
noPrefixList(iterms, terms);
if (terms.empty()) {
LOGDEB(("makeAbstract::Empty term list\n"));
return string();
}
} }
// listList("Match terms: ", terms);
// Retrieve db-wide frequencies for the query terms // Retrieve db-wide frequencies for the query terms (we do this once per
// query, using all the query terms, not only the document match terms)
if (query->m_nq->termfreqs.empty()) { if (query->m_nq->termfreqs.empty()) {
list<string> iqterms, qterms;
query->getQueryTerms(iqterms);
noPrefixList(iqterms, qterms);
// listList("Query terms: ", qterms);
double doccnt = xrdb.get_doccount(); double doccnt = xrdb.get_doccount();
if (doccnt == 0) doccnt = 1; if (doccnt == 0) doccnt = 1;
for (list<string>::const_iterator qit = terms.begin(); for (list<string>::const_iterator qit = qterms.begin();
qit != terms.end(); qit++) { qit != qterms.end(); qit++) {
query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt; query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), LOGDEB(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
query->m_nq->termfreqs[*qit])); query->m_nq->termfreqs[*qit]));
} }
LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms())); LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
@ -450,7 +466,7 @@ string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
} }
#endif #endif
LOGDEB(("makeAbstract:%d: extracting\n", chron.millis())); LOGABS(("makeAbstract:%d: extracting\n", chron.millis()));
// Finally build the abstract by walking the map (in order of position) // Finally build the abstract by walking the map (in order of position)
string abstract; string abstract;

View File

@ -225,6 +225,10 @@ bool Query::getQueryTerms(list<string>& terms)
} }
bool Query::getMatchTerms(const Doc& doc, list<string>& terms) bool Query::getMatchTerms(const Doc& doc, list<string>& terms)
{
return getMatchTerms(doc.xdocid, terms);
}
bool Query::getMatchTerms(unsigned long xdocid, list<string>& terms)
{ {
if (ISNULL(m_nq) || !m_nq->xenquire) { if (ISNULL(m_nq) || !m_nq->xenquire) {
LOGERR(("Query::getMatchTerms: no query opened\n")); LOGERR(("Query::getMatchTerms: no query opened\n"));
@ -233,7 +237,7 @@ bool Query::getMatchTerms(const Doc& doc, list<string>& terms)
terms.clear(); terms.clear();
Xapian::TermIterator it; Xapian::TermIterator it;
Xapian::docid id = Xapian::docid(doc.xdocid); Xapian::docid id = Xapian::docid(xdocid);
XAPTRY(terms.insert(terms.begin(), XAPTRY(terms.insert(terms.begin(),
m_nq->xenquire->get_matching_terms_begin(id), m_nq->xenquire->get_matching_terms_begin(id),

View File

@ -77,6 +77,7 @@ class Query {
/** Return a list of terms which matched for a specific result document */ /** Return a list of terms which matched for a specific result document */
bool getMatchTerms(const Doc& doc, list<string>& terms); bool getMatchTerms(const Doc& doc, list<string>& terms);
bool getMatchTerms(unsigned long xdocid, list<string>& terms);
/** Expand query to look for documents like the one passed in */ /** Expand query to look for documents like the one passed in */
list<string> expand(const Doc &doc); list<string> expand(const Doc &doc);