use wdfs for better selection of doc extracts in makeAbstract

dockes 2006-11-13 14:48:21 +00:00
parent 0c70448784
commit 225c3e8b7d
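The patch replaces the former fixed cap of maxtotaloccs / 10 occurrences per query term with a budget derived from each term's within-document frequency (wdf): the rarer a term is in the document, the larger its share of the sampling slots. Below is a minimal standalone sketch of the two computations, with illustrative names (maxTotalOccs, maxOccsForTerm); the actual code lives inside Native::makeAbstract() in the diff that follows.

    #include <algorithm>  // std::max
    #include <cmath>      // std::ceil

    // Total number of term occurrences sampled for the abstract, derived
    // from the target abstract length and the word context width kept
    // around each occurrence.
    static unsigned int maxTotalOccs(int synthAbsLen, int wordCtxLen)
    {
        return std::max(50, synthAbsLen / (4 * (wordCtxLen + 1)));
    }

    // Share of that total granted to one query term: a term with a low wdf
    // (rare in this document) gets more slots than a frequent one.
    static unsigned int maxOccsForTerm(int wdf, int totalQTermOccs,
                                       size_t nQTerms, unsigned int maxTotal)
    {
        if (nQTerms <= 1)
            return maxTotal;
        float q = (1.0f - float(wdf) / float(totalQTermOccs)) / (nQTerms - 1);
        return (unsigned int)std::ceil(maxTotal * q);
    }

For example, with two query terms of wdf 1 and 9 and a total budget of 100 slots, the rare term gets ceil(100 * 0.9) = 90 slots and the frequent one ceil(100 * 0.1) = 10.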


@@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.91 2006-11-13 08:49:44 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.92 2006-11-13 14:48:21 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@@ -22,6 +22,7 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.91 2006-11-13 08:49:44 dockes Exp $
#include <sys/stat.h>
#include <fnmatch.h>
#include <regex.h>
#include <math.h>
#include <iostream>
#include <string>
@@ -91,6 +92,9 @@ class Native {
Xapian::Enquire *enquire; // Open query descriptor.
Xapian::MSet mset; // Partial result set
// Term frequencies for current query. See makeAbstract, not used yet.
map<string, int> m_termfreqs;
Native(Db *db)
: m_db(db),
m_isopen(false), m_iswritable(false), enquire(0)
@@ -200,7 +204,6 @@ bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
// If the option is set and the abstract is synthetic or empty, build
// abstract from position data.
if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
if (doc.abstract.empty() || syntabs ||
(qopts & Db::QO_REPLACE_ABSTRACT))
doc.abstract = makeAbstract(docid, terms);
@@ -212,17 +215,63 @@ bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
return true;
}
// We build a possibly full-size but sparsely populated (only around
// the search term occurrences) reconstruction of the document. It
// would be possible to compress the array by keeping only chunks
// around the terms, but this would seriously complicate the data
// structure.
// Build a document abstract by extracting text chunks around the query terms.
// This uses the db termlists, not the original document.
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
{
LOGDEB(("Native::makeAbstract: maxlen %d wWidth %d\n",
Chrono chron;
LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
Chrono chron;
if (terms.empty()) {
return "";
}
// We may want to use the db-wide freqs to tune the abstracts one
// day but we currently don't
#if 0
if (m_termfreqs.empty()) {
for (list<string>::const_iterator qit = terms.begin();
qit != terms.end(); qit++) {
m_termfreqs[*qit] = db.get_termfreq(*qit);
LOGDEB2(("makeAbstract: [%s] db freq %d\n", qit->c_str(),
m_termfreqs[*qit]));
}
LOGDEB2(("makeAbstract:%d: got termfreqs\n", chron.ms()));
}
#endif
// Retrieve the terms' Within Document Frequencies (wdf). We are going
// to try to show text around the less common search terms.
map<string, int> termwdfs;
int totalqtermoccs = 0;
for (list<string>::const_iterator qit = terms.begin();
qit != terms.end(); qit++) {
Xapian::TermIterator term = db.termlist_begin(docid);
term.skip_to(*qit);
if (term != db.termlist_end(docid) && *term == *qit) {
int f = term.get_wdf();
termwdfs[*qit] = f;
totalqtermoccs += f;
LOGDEB2(("makeAbstract: [%s] wdf %d\n", qit->c_str(),
termwdfs[*qit]));
}
}
LOGDEB2(("makeAbstract:%d: got wdfs totalqtermoccs %d\n",
chron.ms(), totalqtermoccs));
if (totalqtermoccs == 0) {
LOGERR(("makeAbstract: no term occurrences !\n"));
return "";
}
// Build a term list sorted by frequency: it seems reasonable to
// prefer sampling around the less frequent terms.
multimap<int, string> bywdf;
for (list<string>::const_iterator qit = terms.begin();
qit != terms.end(); qit++) {
if (termwdfs.find(*qit) != termwdfs.end())
bywdf.insert(pair<int,string>(termwdfs[*qit], *qit));
}
// For each of the query terms, query xapian for its positions
// list in the document. For each position entry, remember it in qtermposs
@@ -238,26 +287,43 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
vector<unsigned int> qtermposs;
// Limit the total number of slots we populate.
const unsigned int maxtotaloccs = 300;
// Max occurrences per term. We initially know nothing about the
// distribution of occurrences (it could be that only one term in
// the list occurs, or that all do). So this is a rather arbitrary
// choice.
const unsigned int maxoccperterm = maxtotaloccs / 10;
unsigned int totaloccs = 0;
const unsigned int maxtotaloccs =
MAX(50, m_db->m_synthAbsLen /(4 * (m_db->m_synthAbsWordCtxLen+1)));
LOGDEB2(("makeAbstract:%d: ttlqtrms %d mxttloccs %d\n",
chron.ms(), totalqtermoccs, maxtotaloccs));
#if 0
for (multimap<int, string>::iterator qit = bywdf.begin();
qit != bywdf.end(); qit++) {
LOGDEB(("%d->[%s]\n", qit->first, qit->second.c_str()));
}
#endif
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
qit++) {
// Find the text positions which we will have to fill with terms
unsigned int totaloccs = 0;
for (multimap<int, string>::iterator qit = bywdf.begin();
qit != bywdf.end(); qit++) {
string qterm = qit->second;
unsigned int maxoccs;
if (bywdf.size() == 1) {
maxoccs = maxtotaloccs;
} else {
float q = (1 - float(termwdfs[qterm]) / float(totalqtermoccs)) /
(bywdf.size() - 1);
maxoccs = int(ceil(maxtotaloccs * q));
LOGDEB2(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
qterm.c_str(), maxoccs, q));
}
Xapian::PositionIterator pos;
// There may be query terms not in this doc. Requesting their
// position list raises an exception, which we catch.
string emptys;
try {
unsigned int occurrences = 0;
for (pos = db.positionlist_begin(docid, *qit);
pos != db.positionlist_end(docid, *qit); pos++) {
for (pos = db.positionlist_begin(docid, qterm);
pos != db.positionlist_end(docid, qterm); pos++) {
unsigned int ipos = *pos;
LOGDEB2(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
LOGDEB2(("makeAbstract: [%s] at %d\n", qit->c_str(), ipos));
// Remember the term position
qtermposs.push_back(ipos);
// Add adjacent slots to the set to populate at next step
@@ -265,13 +331,13 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
for (unsigned int ii = sta; ii <= sto; ii++) {
if (ii == ipos)
sparseDoc[ii] = *qit;
sparseDoc[ii] = qterm;
else
sparseDoc[ii] = emptys;
}
// Limit the number of occurrences we keep for each
// term. The abstract has a finite length anyway!
if (occurrences++ > maxoccperterm)
if (occurrences++ > maxoccs)
break;
}
} catch (...) {
@@ -282,7 +348,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
break;
}
LOGDEB(("Abstract:%d:chosen number of positions %d. Populating\n",
LOGDEB2(("makeAbstract:%d:chosen number of positions %d\n",
chron.millis(), qtermposs.size()));
// Walk the full document position list (for each term walk
@@ -296,7 +362,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
for (term = db.termlist_begin(docid);
term != db.termlist_end(docid); term++) {
if (cutoff-- < 0) {
LOGDEB(("Abstract: max term count cutoff\n"));
LOGDEB(("makeAbstract: max term count cutoff\n"));
break;
}
@@ -304,7 +370,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
for (pos = db.positionlist_begin(docid, *term);
pos != db.positionlist_end(docid, *term); pos++) {
if (cutoff-- < 0) {
LOGDEB(("Abstract: max term count cutoff\n"));
LOGDEB(("makeAbstract: max term count cutoff\n"));
break;
}
map<unsigned int, string>::iterator vit;
@@ -314,7 +380,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
// at the same position, we want to keep only the
// first one (ie: dockes and dockes@wanadoo.fr)
if (vit->second.empty()) {
LOGDEB2(("Abstract: populating: [%s] at %d\n",
LOGDEB2(("makeAbstract: populating: [%s] at %d\n",
(*term).c_str(), *pos));
sparseDoc[*pos] = *term;
}
@@ -332,16 +398,16 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
it++, ipos++) {
if (it->empty()) {
if (!epty)
LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
epty=true;
} else {
epty = false;
LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
}
}
#endif
LOGDEB(("Abstract:%d: randomizing and extracting\n", chron.millis()));
LOGDEB2(("makeAbstract:%d: randomizing and extracting\n", chron.millis()));
// We randomize the selection of term positions, from which we
// shall pull, starting at the beginning, until the abstract is
@@ -363,7 +429,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
LOGDEB2(("Abstract: %d<-%d->%d\n", sta, *pos, sto));
LOGDEB2(("makeAbstract: %d<-%d->%d\n", sta, *pos, sto));
for (unsigned int ii = sta; ii <= sto; ii++) {
@@ -372,12 +438,12 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
map<unsigned int, string>::const_iterator vit =
sparseDoc.find(ii);
if (vit != sparseDoc.end() && !vit->second.empty()) {
LOGDEB2(("Abstract: position %d -> [%s]\n",
LOGDEB2(("makeAbstract: position %d -> [%s]\n",
ii, vit->second.c_str()));
mabs[ii] = vit->second;
abslen += vit->second.length();
} else {
LOGDEB2(("Abstract: empty position at %d\n", ii));
LOGDEB2(("makeAbstract: empty position at %d\n", ii));
}
}
@@ -394,7 +460,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
abstract += it->second + " ";
}
LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
LOGDEB(("makeAbtract: done in %d mS\n", chron.millis()));
return abstract;
}
@@ -1164,7 +1230,7 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
}
// Limit the match count
if (names.size() > 1000) {
LOGERR(("Db::SetQuery: too many matched file names\n"));
LOGERR(("Db::filenameWildExp: too many matched file names\n"));
break;
}
}
@@ -1190,6 +1256,7 @@ bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
m_filterTopDir = sdata->m_topdir;
m_dbindices.clear();
m_qOpts = opts;
m_ndb->m_termfreqs.clear();
Xapian::Query xq;
sdata->toNativeQuery(*this, &xq, (opts & Db::QO_STEM) ? stemlang : "");
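
For reference, a self-contained sketch of the Xapian calls the new code relies on to fetch each query term's wdf from the document's term list. The function name queryTermWdfs and the standalone framing are assumptions for illustration, not part of the patch.

    #include <xapian.h>
    #include <list>
    #include <map>
    #include <string>

    // Collect the within-document frequency of each query term that actually
    // occurs in the document, mirroring the termlist walk done at the start
    // of makeAbstract() before the sampling budget is allocated.
    std::map<std::string, int>
    queryTermWdfs(Xapian::Database& db, Xapian::docid docid,
                  const std::list<std::string>& terms)
    {
        std::map<std::string, int> wdfs;
        for (std::list<std::string>::const_iterator it = terms.begin();
             it != terms.end(); ++it) {
            Xapian::TermIterator term = db.termlist_begin(docid);
            term.skip_to(*it);
            // skip_to() lands on the first term >= *it, so check for an
            // exact match before reading the wdf.
            if (term != db.termlist_end(docid) && *term == *it)
                wdfs[*it] = term.get_wdf();
        }
        return wdfs;
    }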