use wdfs for better selection of doc extracts in makeAbstract

dockes 2006-11-13 14:48:21 +00:00
parent 0c70448784
commit 225c3e8b7d

rcldb.cpp

@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.91 2006-11-13 08:49:44 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.92 2006-11-13 14:48:21 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
  * This program is free software; you can redistribute it and/or modify
@@ -22,6 +22,7 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.91 2006-11-13 08:49:44 dockes Exp $
 #include <sys/stat.h>
 #include <fnmatch.h>
 #include <regex.h>
+#include <math.h>
 
 #include <iostream>
 #include <string>
@@ -91,6 +92,9 @@ class Native {
     Xapian::Enquire *enquire; // Open query descriptor.
     Xapian::MSet mset; // Partial result set
 
+    // Term frequencies for current query. See makeAbstract, not used yet.
+    map<string, int> m_termfreqs;
+
     Native(Db *db)
         : m_db(db),
           m_isopen(false), m_iswritable(false), enquire(0)
@@ -200,7 +204,6 @@ bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
     // If the option is set and the abstract is synthetic or empty , build
     // abstract from position data.
     if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
-        LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
         if (doc.abstract.empty() || syntabs ||
             (qopts & Db::QO_REPLACE_ABSTRACT))
             doc.abstract = makeAbstract(docid, terms);
@@ -212,17 +215,63 @@ bool Native::dbDataToRclDoc(std::string &data, Doc &doc,
     return true;
 }
 
-// We build a possibly full size but sparsely populated (only around
-// the search term occurrences) reconstruction of the document. It
-// would be possible to compress the array, by having only multiple
-// chunks around the terms, but this would seriously complicate the
-// data structure.
+// Build a document abstract by extracting text chunks around the query terms
+// This uses the db termlists, not the original document.
 string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
 {
-    LOGDEB(("Native::makeAbstract: maxlen %d wWidth %d\n",
+    Chrono chron;
+    LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
             m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
-    Chrono chron;
+    if (terms.empty()) {
+        return "";
+    }
+
+    // We may want to use the db-wide freqs to tune the abstracts one
+    // day but we currently don't
+#if 0
+    if (m_termfreqs.empty()) {
+        for (list<string>::const_iterator qit = terms.begin();
+             qit != terms.end(); qit++) {
+            m_termfreqs[*qit] = db.get_termfreq(*qit);
+            LOGDEB2(("makeAbstract: [%s] db freq %d\n", qit->c_str(),
+                     m_termfreqs[*qit]));
+        }
+        LOGDEB2(("makeAbstract:%d: got termfreqs\n", chron.ms()));
+    }
+#endif
+
+    // Retrieve the term Within Document Frequencies. We are going to try
+    // and show text around the less common search terms.
+    map<string, int> termwdfs;
+    int totalqtermoccs = 0;
+    for (list<string>::const_iterator qit = terms.begin();
+         qit != terms.end(); qit++) {
+        Xapian::TermIterator term = db.termlist_begin(docid);
+        term.skip_to(*qit);
+        if (term != db.termlist_end(docid) && *term == *qit) {
+            int f = term.get_wdf();
+            termwdfs[*qit] = f;
+            totalqtermoccs += f;
+            LOGDEB2(("makeAbstract: [%s] wdf %d\n", qit->c_str(),
+                     termwdfs[*qit]));
+        }
+    }
+    LOGDEB2(("makeAbstract:%d: got wdfs totalqtermoccs %d\n",
+             chron.ms(), totalqtermoccs));
+    if (totalqtermoccs == 0) {
+        LOGERR(("makeAbstract: no term occurrences !\n"));
+        return "";
+    }
+
+    // Build a sorted by frequency term list: it seems reasonable to
+    // prefer sampling around the less frequent terms:
+    multimap<int, string> bywdf;
+    for (list<string>::const_iterator qit = terms.begin();
+         qit != terms.end(); qit++) {
+        if (termwdfs.find(*qit) != termwdfs.end())
+            bywdf.insert(pair<int,string>(termwdfs[*qit], *qit));
+    }
 
     // For each of the query terms, query xapian for its positions
     // list in the document. For each position entry, remember it in qtermposs
@@ -238,26 +287,43 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
     vector<unsigned int> qtermposs;
 
     // Limit the total number of slots we populate.
-    const unsigned int maxtotaloccs = 300;
-    // Max occurrences per term. We initially know nothing about the
-    // occurrences repartition (it would be possible that only one
-    // term in the list occurs, or that all do). So this is a rather
-    // arbitrary choice.
-    const unsigned int maxoccperterm = maxtotaloccs / 10;
-    unsigned int totaloccs = 0;
+    const unsigned int maxtotaloccs =
+        MAX(50, m_db->m_synthAbsLen /(4 * (m_db->m_synthAbsWordCtxLen+1)));
+    LOGDEB2(("makeAbstract:%d: ttlqtrms %d mxttloccs %d\n",
+             chron.ms(), totalqtermoccs, maxtotaloccs));
+#if 0
+    for (multimap<int, string>::iterator qit = bywdf.begin();
+         qit != bywdf.end(); qit++) {
+        LOGDEB(("%d->[%s]\n", qit->first, qit->second.c_str()));
+    }
+#endif
 
-    for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
-         qit++) {
+    // Find the text positions which we will have to fill with terms
+    unsigned int totaloccs = 0;
+    for (multimap<int, string>::iterator qit = bywdf.begin();
+         qit != bywdf.end(); qit++) {
+        string qterm = qit->second;
+        unsigned int maxoccs;
+        if (bywdf.size() == 1) {
+            maxoccs = maxtotaloccs;
+        } else {
+            float q = (1 - float(termwdfs[qterm]) / float(totalqtermoccs)) /
+                (bywdf.size() - 1);
+            maxoccs = int(ceil(maxtotaloccs * q));
+            LOGDEB2(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
+                     qterm.c_str(), maxoccs, q));
+        }
         Xapian::PositionIterator pos;
         // There may be query terms not in this doc. This raises an
         // exception when requesting the position list, we catch it.
         string emptys;
         try {
             unsigned int occurrences = 0;
-            for (pos = db.positionlist_begin(docid, *qit);
-                 pos != db.positionlist_end(docid, *qit); pos++) {
+            for (pos = db.positionlist_begin(docid, qterm);
+                 pos != db.positionlist_end(docid, qterm); pos++) {
                 unsigned int ipos = *pos;
-                LOGDEB2(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
+                LOGDEB2(("makeAbstract: [%s] at %d\n", qit->c_str(), ipos));
                 // Remember the term position
                 qtermposs.push_back(ipos);
                 // Add adjacent slots to the set to populate at next step
@@ -265,13 +331,13 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
                 unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
                 for (unsigned int ii = sta; ii <= sto; ii++) {
                     if (ii == ipos)
-                        sparseDoc[ii] = *qit;
+                        sparseDoc[ii] = qterm;
                     else
                         sparseDoc[ii] = emptys;
                 }
                 // Limit the number of occurences we keep for each
                 // term. The abstract has a finite length anyway !
-                if (occurrences++ > maxoccperterm)
+                if (occurrences++ > maxoccs)
                     break;
             }
         } catch (...) {
@@ -282,7 +348,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
             break;
     }
-    LOGDEB(("Abstract:%d:chosen number of positions %d. Populating\n",
+    LOGDEB2(("makeAbstract:%d:chosen number of positions %d\n",
             chron.millis(), qtermposs.size()));
 
     // Walk the full document position list (for each term walk
@@ -296,7 +362,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
     for (term = db.termlist_begin(docid);
          term != db.termlist_end(docid); term++) {
         if (cutoff-- < 0) {
-            LOGDEB(("Abstract: max term count cutoff\n"));
+            LOGDEB(("makeAbstract: max term count cutoff\n"));
             break;
         }
@@ -304,7 +370,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
         for (pos = db.positionlist_begin(docid, *term);
              pos != db.positionlist_end(docid, *term); pos++) {
             if (cutoff-- < 0) {
-                LOGDEB(("Abstract: max term count cutoff\n"));
+                LOGDEB(("makeAbstract: max term count cutoff\n"));
                 break;
             }
             map<unsigned int, string>::iterator vit;
@@ -314,7 +380,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
             // at the same position, we want to keep only the
             // first one (ie: dockes and dockes@wanadoo.fr)
             if (vit->second.empty()) {
-                LOGDEB2(("Abstract: populating: [%s] at %d\n",
+                LOGDEB2(("makeAbstract: populating: [%s] at %d\n",
                          (*term).c_str(), *pos));
                 sparseDoc[*pos] = *term;
             }
@@ -332,16 +398,16 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
          it++, ipos++) {
         if (it->empty()) {
             if (!epty)
-                LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
+                LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
             epty=true;
         } else {
             epty = false;
-            LOGDEB(("Abstract:vec[%d]: [%s]\n", ipos, it->c_str()));
+            LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
         }
     }
 #endif
-    LOGDEB(("Abstract:%d: randomizing and extracting\n", chron.millis()));
+    LOGDEB2(("makeAbstract:%d: randomizing and extracting\n", chron.millis()));
 
     // We randomize the selection of term positions, from which we
     // shall pull, starting at the beginning, until the abstract is
@@ -363,7 +429,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
         unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
         unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
-        LOGDEB2(("Abstract: %d<-%d->%d\n", sta, *pos, sto));
+        LOGDEB2(("makeAbstract: %d<-%d->%d\n", sta, *pos, sto));
         for (unsigned int ii = sta; ii <= sto; ii++) {
@@ -372,12 +438,12 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
             map<unsigned int, string>::const_iterator vit =
                 sparseDoc.find(ii);
             if (vit != sparseDoc.end() && !vit->second.empty()) {
-                LOGDEB2(("Abstract: position %d -> [%s]\n",
+                LOGDEB2(("makeAbstract: position %d -> [%s]\n",
                          ii, vit->second.c_str()));
                 mabs[ii] = vit->second;
                 abslen += vit->second.length();
             } else {
-                LOGDEB2(("Abstract: empty position at %d\n", ii));
+                LOGDEB2(("makeAbstract: empty position at %d\n", ii));
             }
         }
@@ -394,7 +460,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
         LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
         abstract += it->second + " ";
     }
-    LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
+    LOGDEB(("makeAbtract: done in %d mS\n", chron.millis()));
     return abstract;
 }
@@ -1164,7 +1230,7 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
         }
         // Limit the match count
         if (names.size() > 1000) {
-            LOGERR(("Db::SetQuery: too many matched file names\n"));
+            LOGERR(("Db::filenameWildExp: too many matched file names\n"));
             break;
         }
     }
@@ -1190,6 +1256,7 @@ bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
     m_filterTopDir = sdata->m_topdir;
     m_dbindices.clear();
    m_qOpts = opts;
+    m_ndb->m_termfreqs.clear();
 
     Xapian::Query xq;
     sdata->toNativeQuery(*this, &xq, (opts & Db::QO_STEM) ? stemlang : "");
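
The substance of the change is the per-term occurrence budget derived from the within-document frequencies (wdf): instead of a flat maxtotaloccs / 10 limit per query term, each term now gets a share of the total slot budget weighted by (1 - wdf/total) / (nterms - 1), so rarer terms contribute relatively more extract chunks. Below is a minimal standalone sketch of that allocation, not part of the commit; the term names, wdf values and tuning constants are made up for illustration:

    // Standalone illustration of the wdf-based occurrence quota.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <map>
    #include <string>

    int main()
    {
        // Hypothetical stand-ins for m_synthAbsLen and m_synthAbsWordCtxLen.
        const int synthAbsLen = 250;      // target abstract length, in words
        const int synthAbsWordCtxLen = 4; // words of context kept around a hit

        // Hypothetical wdf values for three query terms in one document.
        std::map<std::string, int> termwdfs;
        termwdfs["recoll"] = 40;
        termwdfs["xapian"] = 8;
        termwdfs["abstract"] = 2;
        int totalqtermoccs = 0;
        for (std::map<std::string, int>::const_iterator it = termwdfs.begin();
             it != termwdfs.end(); ++it)
            totalqtermoccs += it->second;

        // Overall budget of term occurrences to expand, as in the patch:
        // at least 50, otherwise proportional to the abstract size.
        const unsigned int maxtotaloccs =
            std::max(50, synthAbsLen / (4 * (synthAbsWordCtxLen + 1)));

        // Per-term quota: weight (1 - wdf/total) / (nterms - 1). The weights
        // sum to 1, so the quotas approximately add up to maxtotaloccs.
        for (std::map<std::string, int>::const_iterator it = termwdfs.begin();
             it != termwdfs.end(); ++it) {
            float q = (1 - float(it->second) / float(totalqtermoccs)) /
                (termwdfs.size() - 1);
            unsigned int maxoccs = (unsigned int)std::ceil(maxtotaloccs * q);
            std::printf("%-10s wdf %2d -> keep up to %u occurrences\n",
                        it->first.c_str(), it->second, maxoccs);
        }
        return 0;
    }

Because the weights sum to 1 over the query terms, the per-term quotas add up to roughly maxtotaloccs; the single-term case is handled separately in the patch (bywdf.size() == 1), where the lone term simply receives the whole budget.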