From 225c3e8b7d6cf9d19a3c7719679643d761e5aac6 Mon Sep 17 00:00:00 2001
From: dockes
Date: Mon, 13 Nov 2006 14:48:21 +0000
Subject: [PATCH] use wdfs for better selection of doc extracts in makeAbstract

---
 src/rcldb/rcldb.cpp | 137 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 102 insertions(+), 35 deletions(-)

diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 79431d9c..0d1efb87 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.91 2006-11-13 08:49:44 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.92 2006-11-13 14:48:21 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
  * This program is free software; you can redistribute it and/or modify
@@ -22,6 +22,7 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.91 2006-11-13 08:49:44 dockes Exp $
 #include 
 #include 
 #include 
+#include <math.h>
 #include 
 #include 
@@ -91,6 +92,9 @@ class Native {
     Xapian::Enquire *enquire; // Open query descriptor.
     Xapian::MSet     mset;    // Partial result set
 
+    // Term frequencies for current query. See makeAbstract, not used yet.
+    map<string, int> m_termfreqs;
+
     Native(Db *db)
 	: m_db(db),
 	  m_isopen(false), m_iswritable(false), enquire(0)
@@ -200,7 +204,6 @@ bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
     // If the option is set and the abstract is synthetic or empty, build
     // abstract from position data.
     if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
-	LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
 	if (doc.abstract.empty() || syntabs ||
 	    (qopts & Db::QO_REPLACE_ABSTRACT))
 	    doc.abstract = makeAbstract(docid, terms);
@@ -212,17 +215,63 @@ bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
     return true;
 }
 
-// We build a possibly full size but sparsely populated (only around
-// the search term occurrences) reconstruction of the document. It
-// would be possible to compress the array, by having only multiple
-// chunks around the terms, but this would seriously complicate the
-// data structure.
+// Build a document abstract by extracting text chunks around the query terms.
+// This uses the db termlists, not the original document.
 string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 {
-    LOGDEB(("Native::makeAbstract: maxlen %d wWidth %d\n",
+    Chrono chron;
+    LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
 	    m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
-    Chrono chron;
 
+    if (terms.empty()) {
+	return "";
+    }
+
+    // We may want to use the db-wide freqs to tune the abstracts one
+    // day but we currently don't
+#if 0
+    if (m_termfreqs.empty()) {
+	for (list<string>::const_iterator qit = terms.begin();
+	     qit != terms.end(); qit++) {
+	    m_termfreqs[*qit] = db.get_termfreq(*qit);
+	    LOGDEB2(("makeAbstract: [%s] db freq %d\n", qit->c_str(),
+		     m_termfreqs[*qit]));
+	}
+	LOGDEB2(("makeAbstract:%d: got termfreqs\n", chron.ms()));
+    }
+#endif
+
+    // Retrieve the term Within Document Frequencies. We are going to try
+    // and show text around the less common search terms.
+    map<string, int> termwdfs;
+    int totalqtermoccs = 0;
+    for (list<string>::const_iterator qit = terms.begin();
+	 qit != terms.end(); qit++) {
+	Xapian::TermIterator term = db.termlist_begin(docid);
+	term.skip_to(*qit);
+	if (term != db.termlist_end(docid) && *term == *qit) {
+	    int f = term.get_wdf();
+	    termwdfs[*qit] = f;
+	    totalqtermoccs += f;
+	    LOGDEB2(("makeAbstract: [%s] wdf %d\n", qit->c_str(),
+		     termwdfs[*qit]));
+	}
+    }
+    LOGDEB2(("makeAbstract:%d: got wdfs totalqtermoccs %d\n",
+	     chron.ms(), totalqtermoccs));
+    if (totalqtermoccs == 0) {
+	LOGERR(("makeAbstract: no term occurrences !\n"));
+	return "";
+    }
+
+    // Build a term list sorted by frequency: it seems reasonable to
+    // prefer sampling around the less frequent terms:
+    multimap<int, string> bywdf;
+    for (list<string>::const_iterator qit = terms.begin();
+	 qit != terms.end(); qit++) {
+	if (termwdfs.find(*qit) != termwdfs.end())
+	    bywdf.insert(pair<int, string>(termwdfs[*qit], *qit));
+    }
 
     // For each of the query terms, query xapian for its positions
     // list in the document. For each position entry, remember it in qtermposs
@@ -238,26 +287,43 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
     vector<unsigned int> qtermposs;
 
     // Limit the total number of slots we populate.
-    const unsigned int maxtotaloccs = 300;
-    // Max occurrences per term. We initially know nothing about the
-    // occurrences repartition (it would be possible that only one
-    // term in the list occurs, or that all do). So this is a rather
-    // arbitrary choice.
-    const unsigned int maxoccperterm = maxtotaloccs / 10;
-    unsigned int totaloccs = 0;
+    const unsigned int maxtotaloccs =
+	MAX(50, m_db->m_synthAbsLen /(4 * (m_db->m_synthAbsWordCtxLen+1)));
+    LOGDEB2(("makeAbstract:%d: ttlqtrms %d mxttloccs %d\n",
+	     chron.ms(), totalqtermoccs, maxtotaloccs));
+#if 0
+    for (multimap<int, string>::iterator qit = bywdf.begin();
+	 qit != bywdf.end(); qit++) {
+	LOGDEB(("%d->[%s]\n", qit->first, qit->second.c_str()));
+    }
+#endif
 
-    for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
-	 qit++) {
+    // Find the text positions which we will have to fill with terms
+    unsigned int totaloccs = 0;
+    for (multimap<int, string>::iterator qit = bywdf.begin();
+	 qit != bywdf.end(); qit++) {
+	string qterm = qit->second;
+	unsigned int maxoccs;
+	if (bywdf.size() == 1) {
+	    maxoccs = maxtotaloccs;
+	} else {
+	    float q = (1 - float(termwdfs[qterm]) / float(totalqtermoccs)) /
+		(bywdf.size() - 1);
+	    maxoccs = int(ceil(maxtotaloccs * q));
+	    LOGDEB2(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
+		     qterm.c_str(), maxoccs, q));
+	}
+	Xapian::PositionIterator pos;
 	// There may be query terms not in this doc. This raises an
 	// exception when requesting the position list, we catch it.
 	string emptys;
 	try {
 	    unsigned int occurrences = 0;
-	    for (pos = db.positionlist_begin(docid, *qit);
-		 pos != db.positionlist_end(docid, *qit); pos++) {
+	    for (pos = db.positionlist_begin(docid, qterm);
+		 pos != db.positionlist_end(docid, qterm); pos++) {
 		unsigned int ipos = *pos;
-		LOGDEB2(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
+		LOGDEB2(("makeAbstract: [%s] at %d\n", qit->c_str(), ipos));
 		// Remember the term position
 		qtermposs.push_back(ipos);
 		// Add adjacent slots to the set to populate at next step
@@ -265,13 +331,13 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 		unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
 		for (unsigned int ii = sta; ii <= sto; ii++) {
 		    if (ii == ipos)
-			sparseDoc[ii] = *qit;
+			sparseDoc[ii] = qterm;
 		    else
 			sparseDoc[ii] = emptys;
 		}
 		// Limit the number of occurrences we keep for each
 		// term. The abstract has a finite length anyway !
-		if (occurrences++ > maxoccperterm)
+		if (occurrences++ > maxoccs)
 		    break;
 	    }
 	} catch (...) {
@@ -282,7 +348,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 	    break;
 	}
 
-    LOGDEB(("Abstract:%d:chosen number of positions %d. Populating\n",
+    LOGDEB2(("makeAbstract:%d:chosen number of positions %d\n",
 	    chron.millis(), qtermposs.size()));
 
     // Walk the full document position list (for each term walk
@@ -296,7 +362,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
     for (term = db.termlist_begin(docid);
	 term != db.termlist_end(docid); term++) {
 	if (cutoff-- < 0) {
-	    LOGDEB(("Abstract: max term count cutoff\n"));
+	    LOGDEB(("makeAbstract: max term count cutoff\n"));
 	    break;
 	}
 
@@ -304,7 +370,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 	for (pos = db.positionlist_begin(docid, *term);
 	     pos != db.positionlist_end(docid, *term); pos++) {
 	    if (cutoff-- < 0) {
-		LOGDEB(("Abstract: max term count cutoff\n"));
+		LOGDEB(("makeAbstract: max term count cutoff\n"));
 		break;
 	    }
 	    map<unsigned int, string>::iterator vit;
@@ -314,7 +380,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 	    // at the same position, we want to keep only the
 	    // first one (ie: dockes and dockes@wanadoo.fr)
 	    if (vit->second.empty()) {
-		LOGDEB2(("Abstract: populating: [%s] at %d\n",
+		LOGDEB2(("makeAbstract: populating: [%s] at %d\n",
 			 (*term).c_str(), *pos));
 		sparseDoc[*pos] = *term;
 	    }
@@ -332,16 +398,16 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 	 it++, ipos++) {
 	if (it->empty()) {
 	    if (!epty)
-		LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
+		LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
 	    epty=true;
 	} else {
 	    epty = false;
-	    LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
+	    LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
 	}
     }
 #endif
 
-    LOGDEB(("Abstract:%d: randomizing and extracting\n", chron.millis()));
+    LOGDEB2(("makeAbstract:%d: randomizing and extracting\n", chron.millis()));
 
     // We randomize the selection of term positions, from which we
     // shall pull, starting at the beginning, until the abstract is
@@ -363,7 +429,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 	unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
 	unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
 
-	LOGDEB2(("Abstract: %d<-%d->%d\n", sta, *pos, sto));
+	LOGDEB2(("makeAbstract: %d<-%d->%d\n", sta, *pos, sto));
 
 	for (unsigned int ii = sta; ii <= sto; ii++) {
 
@@ -372,12 +438,12 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 	    map<unsigned int, string>::const_iterator vit = sparseDoc.find(ii);
 	    if (vit != sparseDoc.end() && !vit->second.empty()) {
-		LOGDEB2(("Abstract: position %d -> [%s]\n",
+		LOGDEB2(("makeAbstract: position %d -> [%s]\n",
 			 ii, vit->second.c_str()));
 		mabs[ii] = vit->second;
 		abslen += vit->second.length();
 	    } else {
-		LOGDEB2(("Abstract: empty position at %d\n", ii));
+		LOGDEB2(("makeAbstract: empty position at %d\n", ii));
 	    }
 	}
 
@@ -394,7 +460,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 	LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
 	abstract += it->second + " ";
     }
-    LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
+    LOGDEB(("makeAbstract: done in %d mS\n", chron.millis()));
     return abstract;
 }
 
@@ -1164,7 +1230,7 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
 	}
 	// Limit the match count
 	if (names.size() > 1000) {
-	    LOGERR(("Db::SetQuery: too many matched file names\n"));
+	    LOGERR(("Db::filenameWildExp: too many matched file names\n"));
 	    break;
 	}
     }
@@ -1190,6 +1256,7 @@ bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
     m_filterTopDir = sdata->m_topdir;
     m_dbindices.clear();
     m_qOpts = opts;
+    m_ndb->m_termfreqs.clear();
 
     Xapian::Query xq;
     sdata->toNativeQuery(*this, &xq, (opts & Db::QO_STEM) ? stemlang : "");
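
Note (not part of the patch): the heart of the change is the per-term occurrence
quota that replaces the old fixed maxoccperterm. The standalone sketch below
only mirrors that arithmetic so its behaviour can be seen in isolation; the term
names, wdf values and the 50-slot cap are made-up example inputs, and nothing
here calls Xapian or Recoll code.

// quota_sketch.cpp -- illustrative only, hypothetical inputs.
// Each query term gets a share of the abstract slots weighted by
// (1 - wdf/total) / (nterms - 1), so rarer terms are sampled more.
#include <math.h>
#include <stdio.h>
#include <map>
#include <string>
using namespace std;

int main()
{
    // Hypothetical within-document frequencies for three query terms.
    map<string, int> termwdfs;
    termwdfs["recoll"] = 40;
    termwdfs["abstract"] = 8;
    termwdfs["xapian"] = 2;

    int totalqtermoccs = 0;
    for (map<string, int>::const_iterator it = termwdfs.begin();
	 it != termwdfs.end(); it++)
	totalqtermoccs += it->second;

    // Example cap on the total number of positions kept (the patch
    // derives this from the abstract length and word context width).
    const unsigned int maxtotaloccs = 50;

    for (map<string, int>::const_iterator it = termwdfs.begin();
	 it != termwdfs.end(); it++) {
	unsigned int maxoccs;
	if (termwdfs.size() == 1) {
	    maxoccs = maxtotaloccs;
	} else {
	    float q = (1 - float(it->second) / float(totalqtermoccs)) /
		(termwdfs.size() - 1);
	    maxoccs = (unsigned int)ceil(maxtotaloccs * q);
	}
	printf("%-10s wdf %2d -> keep up to %u positions\n",
	       it->first.c_str(), it->second, maxoccs);
    }
    return 0;
}

With these inputs the rarest term ("xapian", wdf 2) is allowed about half of
the 50 slots while the most frequent one ("recoll", wdf 40) gets only a
handful, which is the sampling bias makeAbstract now applies when it walks
the position lists.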