simplified and hopefully improved abstract generation

parent d58f028319
commit f6d8c44cc5
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.118 2007-06-22 06:14:04 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.119 2007-06-25 10:25:39 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
  * This program is free software; you can redistribute it and/or modify
@@ -56,6 +56,10 @@ using namespace std;
 #define MIN(A,B) (A<B?A:B)
 #endif
 
+// This is the word position offset at which we index the body text
+// (abstract, keywords, etc.. are stored before this)
+static const unsigned int baseTextPosition = 100000;
+
 #undef MTIME_IN_VALUE
 #ifdef MTIME_IN_VALUE
 // Omega compatible values
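The hunk above introduces baseTextPosition, which splits Xapian's single per-document position space in two: metadata terms (abstract, keywords, and so on) occupy positions below 100000, and body text starts at 100000, so later consumers can tell body occurrences from metadata occurrences. Below is a minimal standalone sketch of that idea; it does not use Xapian, and the variable names and sample terms are invented for illustration:

#include <iostream>
#include <map>
#include <string>

// Hypothetical layout: metadata terms are indexed at positions below
// baseTextPosition, body terms at baseTextPosition and above.
static const unsigned int baseTextPosition = 100000;

int main()
{
    // position -> term, mimicking one document's position data
    std::multimap<unsigned int, std::string> poslist;
    unsigned int metapos = 1;                // metadata region
    unsigned int bodypos = baseTextPosition; // body region
    poslist.insert(std::make_pair(metapos++, std::string("recoll")));
    poslist.insert(std::make_pair(metapos++, std::string("abstract")));
    poslist.insert(std::make_pair(bodypos++, std::string("the")));
    poslist.insert(std::make_pair(bodypos++, std::string("recoll")));
    poslist.insert(std::make_pair(bodypos++, std::string("indexer")));

    // A consumer such as makeAbstract() can now skip everything
    // below baseTextPosition and sample body text only.
    for (std::multimap<unsigned int, std::string>::const_iterator it =
             poslist.begin(); it != poslist.end(); ++it) {
        if (it->first >= baseTextPosition)
            std::cout << it->second << " at body offset "
                      << it->first - baseTextPosition << "\n";
    }
    return 0;
}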
@@ -103,8 +107,8 @@ class Native {
     Xapian::Enquire *enquire; // Open query descriptor.
     Xapian::MSet mset; // Partial result set
 
-    // Term frequencies for current query. See makeAbstract, not used yet.
-    map<string, int> m_termfreqs;
+    // Term frequencies for current query. See makeAbstract, setQuery
+    map<string, double> m_termfreqs;
 
     Native(Db *db)
         : m_db(db),
@@ -232,12 +236,19 @@ static list<string> noPrefixList(const list<string>& in)
     return out;
 }
 
+//#define DEBUGABSTRACT
+#ifdef DEBUGABSTRACT
+#define LOGABS LOGDEB
+#else
+#define LOGABS LOGDEB2
+#endif
+
 // Build a document abstract by extracting text chunks around the query terms
 // This uses the db termlists, not the original document.
 string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
 {
     Chrono chron;
-    LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
+    LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
             m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
 
     list<string> terms = noPrefixList(iterms);
@@ -245,90 +256,105 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
         return "";
     }
 
-    // We may want to use the db-wide freqs to tune the abstracts one
-    // day but we currently don't
-#if 0
+    // Retrieve db-wide frequencies for the query terms
     if (m_termfreqs.empty()) {
+        double doccnt = db.get_doccount();
+        if (doccnt == 0) doccnt = 1;
         for (list<string>::const_iterator qit = terms.begin();
              qit != terms.end(); qit++) {
-            m_termfreqs[*qit] = db.get_termfreq(*qit);
-            LOGDEB(("makeAbstract: [%s] db freq %d\n", qit->c_str(),
+            m_termfreqs[*qit] = db.get_termfreq(*qit) / doccnt;
+            LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
                     m_termfreqs[*qit]));
         }
-        LOGDEB(("makeAbstract:%d: got termfreqs\n", chron.ms()));
+        LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
     }
-#endif
 
-    // Retrieve the term Within Document Frequencies. We are going to try
+    // Compute a term quality coefficient by retrieving the term
+    // Within Document Frequencies and multiplying by overal term
+    // frequency, then using log-based thresholds. We are going to try
     // and show text around the less common search terms.
-    map<string, int> termwdfs;
-    int totalqtermoccs = 0;
+    map<string, double> termQcoefs;
+    double totalweight = 0;
+    double doclen = db.get_doclength(docid);
+    if (doclen == 0) doclen = 1;
     for (list<string>::const_iterator qit = terms.begin();
          qit != terms.end(); qit++) {
         Xapian::TermIterator term = db.termlist_begin(docid);
         term.skip_to(*qit);
         if (term != db.termlist_end(docid) && *term == *qit) {
-            int f = term.get_wdf();
-            termwdfs[*qit] = f;
-            totalqtermoccs += f;
-            LOGDEB2(("makeAbstract: [%s] wdf %d\n", qit->c_str(),
-                     termwdfs[*qit]));
+            double q = (term.get_wdf() / doclen) * m_termfreqs[*qit];
+            q = -log10(q);
+            if (q < 3) {
+                q = 0.05;
+            } else if (q < 4) {
+                q = 0.3;
+            } else if (q < 5) {
+                q = 0.7;
+            } else if (q < 6) {
+                q = 0.8;
+            } else {
+                q = 1;
+            }
+            termQcoefs[*qit] = q;
+            totalweight += q;
         }
     }
-    LOGDEB2(("makeAbstract:%d: got wdfs totalqtermoccs %d\n",
-             chron.ms(), totalqtermoccs));
-    if (totalqtermoccs == 0) {
-        LOGERR(("makeAbstract: no term occurrences !\n"));
-        return "";
-    }
+    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
 
-    // Build a sorted by frequency term list: it seems reasonable to
-    // prefer sampling around the less frequent terms:
-    multimap<int, string> bywdf;
+    // Build a sorted by quality term list.
+    multimap<double, string> byQ;
     for (list<string>::const_iterator qit = terms.begin();
          qit != terms.end(); qit++) {
-        if (termwdfs.find(*qit) != termwdfs.end())
-            bywdf.insert(pair<int,string>(termwdfs[*qit], *qit));
+        if (termQcoefs.find(*qit) != termQcoefs.end())
+            byQ.insert(pair<double,string>(termQcoefs[*qit], *qit));
     }
 
-    // For each of the query terms, query xapian for its positions
-    // list in the document. For each position entry, remember it in qtermposs
-    // and insert it and its neighbours in the set of 'interesting' positions
+#ifdef DEBUGABSTRACT
+    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
+         qit != byQ.rend(); qit++) {
+        LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
+    }
+#endif
+
+    // For each of the query terms, ask xapian for its positions list
+    // in the document. For each position entry, remember it in
+    // qtermposs and insert it and its neighbours in the set of
+    // 'interesting' positions
 
     // The terms 'array' that we partially populate with the document
    // terms, at their positions around the search terms positions:
     map<unsigned int, string> sparseDoc;
 
-    // All the query term positions. We remember this mainly because we are
-    // going to random-shuffle it for selecting the chunks that we actually
-    // print.
+    // All the chosen query term positions.
     vector<unsigned int> qtermposs;
 
-    // Limit the total number of slots we populate.
+    // Limit the total number of slots we populate. The 7 is taken as
+    // average word size. It was a mistake to have the user max
+    // abstract size parameter in characters, we basically only deal
+    // with words. We used to limit the character size at the end, but
+    // this damaged our careful selection of terms
     const unsigned int maxtotaloccs =
-        MAX(50, m_db->m_synthAbsLen /(4 * (m_db->m_synthAbsWordCtxLen+1)));
-    LOGDEB2(("makeAbstract:%d: ttlqtrms %d mxttloccs %d\n",
-             chron.ms(), totalqtermoccs, maxtotaloccs));
-#if 0
-    for (multimap<int, string>::iterator qit = bywdf.begin();
-         qit != bywdf.end(); qit++) {
-        LOGDEB(("%d->[%s]\n", qit->first, qit->second.c_str()));
-    }
-#endif
+        m_db->m_synthAbsLen /(7 * (m_db->m_synthAbsWordCtxLen+1));
+    LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
+    // This can't happen, but would crash us
+    if (totalweight == 0.0) {
+        LOGERR(("makeAbstract: 0 totalweight!\n"));
+        return "";
+    }
 
-    // Find the text positions which we will have to fill with terms
-    unsigned int totaloccs = 0;
-    for (multimap<int, string>::iterator qit = bywdf.begin();
-         qit != bywdf.end(); qit++) {
+    // Let's go populate
+    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
+         qit != byQ.rend(); qit++) {
         string qterm = qit->second;
         unsigned int maxoccs;
-        if (bywdf.size() == 1) {
+        if (byQ.size() == 1) {
             maxoccs = maxtotaloccs;
         } else {
-            float q = (1 - float(termwdfs[qterm]) / float(totalqtermoccs)) /
-                (bywdf.size() - 1);
+            // We give more slots to the better terms
+            float q = qit->first / totalweight;
             maxoccs = int(ceil(maxtotaloccs * q));
-            LOGDEB2(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
+            LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
                     qterm.c_str(), maxoccs, q));
         }
 
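The quality coefficient computed in this hunk combines how dense the term is in the document (wdf / doclen) with how common it is in the whole collection (termfreq / doccount), takes -log10 of the product, and buckets the result so that rarer terms end up with coefficients near 1. Here is a self-contained sketch of that arithmetic; qualityCoef is a made-up helper and the figures in main() are invented toy values, not measured data:

#include <math.h>
#include <stdio.h>

// Sketch of the patch's quality coefficient, outside of Xapian.
static double qualityCoef(double wdf, double doclen,
                          double termfreq, double doccnt)
{
    // Rare in the document and rare in the collection -> tiny product,
    // hence large -log10: rarer terms score higher.
    double q = (wdf / doclen) * (termfreq / doccnt);
    q = -log10(q);
    if (q < 3)
        return 0.05;
    else if (q < 4)
        return 0.3;
    else if (q < 5)
        return 0.7;
    else if (q < 6)
        return 0.8;
    else
        return 1.0;
}

int main()
{
    // A common word: 50 hits in a 1000-word doc, present in 900 of 1000 docs
    printf("common: %.2f\n", qualityCoef(50, 1000, 900, 1000));   // 0.05
    // A rare word: 1 hit in the same doc, present in 1 of 100000 docs
    printf("rare:   %.2f\n", qualityCoef(1, 1000, 1, 100000));    // 1.00
    return 0;
}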
@@ -341,7 +367,10 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
             for (pos = db.positionlist_begin(docid, qterm);
                  pos != db.positionlist_end(docid, qterm); pos++) {
                 unsigned int ipos = *pos;
-                LOGDEB2(("makeAbstract: [%s] at %d\n", qit->c_str(), ipos));
+                if (ipos < baseTextPosition) // Not in text body
+                    continue;
+                LOGABS(("makeAbstract: [%s] at %d occurrences %d maxoccs %d\n",
+                        qterm.c_str(), ipos, occurrences, maxoccs));
                 // Remember the term position
                 qtermposs.push_back(ipos);
                 // Add adjacent slots to the set to populate at next step
@@ -353,26 +382,30 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
                     else
                         sparseDoc[ii] = emptys;
                 }
-                // Limit the number of occurences we keep for each
-                // term. The abstract has a finite length anyway !
-                if (occurrences++ > maxoccs)
+                // Limit to allocated occurences and total size
+                if (++occurrences >= maxoccs ||
+                    qtermposs.size() >= maxtotaloccs)
                     break;
             }
         } catch (...) {
             // Term does not occur. No problem.
         }
-        // Limit total size
-        if (totaloccs++ > maxtotaloccs)
+        if (qtermposs.size() >= maxtotaloccs)
             break;
     }
-    LOGDEB2(("makeAbstract:%d:chosen number of positions %d\n",
+    LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
             chron.millis(), qtermposs.size()));
 
-    // Walk the full document position list (for each term walk
-    // position list) and populate slots around the query terms. We
-    // arbitrarily truncate the list to avoid taking forever. If we do
-    // cutoff, the abstract may be inconsistant, which is bad...
+    // This can happen if there are term occurences in the keywords
+    // etc. but not elsewhere ?
+    if (qtermposs.size() == 0)
+        return "";
+
+    // Walk all document's terms position lists and populate slots
+    // around the query terms. We arbitrarily truncate the list to
+    // avoid taking forever. If we do cutoff, the abstract may be
+    // inconsistant (missing words, potentially altering meaning),
+    // which is bad...
     {
         Xapian::TermIterator term;
         int cutoff = 500 * 1000;
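With the coefficients in hand, the populate loop hands each term a share of the total slot budget proportional to its quality, and this hunk additionally caps both the per-term count and the global total (qtermposs.size() >= maxtotaloccs). A toy sketch of the allocation arithmetic; the terms, coefficients and parameter values are invented:

#include <math.h>
#include <stdio.h>
#include <map>
#include <string>

int main()
{
    using namespace std;
    // quality -> term; the best terms are walked first (rbegin),
    // mirroring the patch. Values are made up.
    multimap<double, string> byQ;
    byQ.insert(make_pair(1.0,  string("xapian")));
    byQ.insert(make_pair(0.3,  string("index")));
    byQ.insert(make_pair(0.05, string("the")));
    double totalweight = 1.0 + 0.3 + 0.05;

    // Mirrors maxtotaloccs = synthAbsLen / (7 * (wordCtxLen + 1)),
    // here assuming a 250-char abstract and 4 words of context.
    const unsigned int maxtotaloccs = 250 / (7 * (4 + 1)); // == 7

    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
         qit != byQ.rend(); qit++) {
        unsigned int maxoccs =
            (unsigned int)ceil(maxtotaloccs * qit->first / totalweight);
        printf("%-7s gets up to %u occurrence slot(s)\n",
               qit->second.c_str(), maxoccs);
    }
    // Note the shares can sum past maxtotaloccs because ceil() rounds
    // up; this is why the patch also enforces the global size cap.
    return 0;
}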
@@ -401,7 +434,7 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
                 // at the same position, we want to keep only the
                 // first one (ie: dockes and dockes@wanadoo.fr)
                 if (vit->second.empty()) {
-                    LOGDEB2(("makeAbstract: populating: [%s] at %d\n",
+                    LOGABS(("makeAbstract: populating: [%s] at %d\n",
                             (*term).c_str(), *pos));
                     sparseDoc[*pos] = *term;
                 }
@@ -428,61 +461,29 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
     }
 #endif
 
-    LOGDEB2(("makeAbstract:%d: randomizing and extracting\n", chron.millis()));
+    LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));
 
-    // We randomize the selection of term positions, from which we
-    // shall pull, starting at the beginning, until the abstract is
-    // big enough. The abstract is finally built in correct position
-    // order, thanks to the position map.
-    random_shuffle(qtermposs.begin(), qtermposs.end());
-    map<unsigned int, string> mabs;
-    unsigned int abslen = 0;
-
-    // Extract data around the N first (in random order) query term
-    // positions, and store the terms in the map. Don't concatenate
-    // immediately into chunks because there might be overlaps
+    // Add "..." at ends of chunks
     for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
          pos != qtermposs.end(); pos++) {
-
-        if (int(abslen) > m_db->m_synthAbsLen)
-            break;
-
-        unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
         unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
-
-        LOGDEB2(("makeAbstract: %d<-%d->%d\n", sta, *pos, sto));
-
-        for (unsigned int ii = sta; ii <= sto; ii++) {
-
-            if (int(abslen) > m_db->m_synthAbsLen)
-                break;
-            map<unsigned int, string>::const_iterator vit =
-                sparseDoc.find(ii);
-            if (vit != sparseDoc.end() && !vit->second.empty()) {
-                LOGDEB2(("makeAbstract: position %d -> [%s]\n",
-                         ii, vit->second.c_str()));
-                mabs[ii] = vit->second;
-                abslen += vit->second.length();
-            } else {
-                LOGDEB2(("makeAbstract: empty position at %d\n", ii));
-            }
-        }
-
         // Possibly add a ... at the end of chunk if it's not
         // overlapping
-        if (mabs.find(sto+1) == mabs.end())
-            mabs[sto+1] = "...";
+        if (sparseDoc.find(sto) != sparseDoc.end() &&
+            sparseDoc.find(sto+1) == sparseDoc.end())
+            sparseDoc[sto+1] = "...";
     }
 
-    // Build the abstract by walking the map (in order of position)
+    // Finally build the abstract by walking the map (in order of position)
     string abstract;
-    for (map<unsigned int, string>::const_iterator it = mabs.begin();
-         it != mabs.end(); it++) {
+    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
+         it != sparseDoc.end(); it++) {
         LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
         abstract += it->second + " ";
     }
 
-    // This happens for docs with no terms (only filename) indexed. I'll fix
+    // This happens for docs with no terms (only filename) indexed? I'll fix
     // one day (yeah)
     if (!abstract.compare("... "))
         abstract.clear();
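This last makeAbstract hunk drops the old random-shuffle and character-budget extraction pass entirely: since sparseDoc now only ever receives the selected positions, the abstract is simply the map walked in position order, with "..." markers appended at chunk ends. A standalone sketch of that assembly step follows; the positions and words are invented, and the chunk-end test is a simplified version of the patch's sto/sto+1 check:

#include <stdio.h>
#include <map>
#include <string>

int main()
{
    using namespace std;
    // Sparse position -> term map, as left behind by the population
    // passes. Note the gap between the two chunks.
    map<unsigned int, string> sparseDoc;
    sparseDoc[100010] = "recoll"; sparseDoc[100011] = "builds";
    sparseDoc[100012] = "abstracts";
    sparseDoc[100100] = "around"; sparseDoc[100101] = "query";
    sparseDoc[100102] = "terms";

    // Mark chunk ends: a populated slot whose successor slot is empty
    // gets a "..." after it. Collect the insertions first so we never
    // iterate over entries added during the walk.
    map<unsigned int, string> ellipses;
    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
         it != sparseDoc.end(); it++) {
        if (sparseDoc.find(it->first + 1) == sparseDoc.end())
            ellipses[it->first + 1] = "...";
    }
    sparseDoc.insert(ellipses.begin(), ellipses.end());

    // Walk in position order to build the abstract.
    string abstract;
    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
         it != sparseDoc.end(); it++)
        abstract += it->second + " ";
    printf("%s\n", abstract.c_str());
    // -> "recoll builds abstracts ... around query terms ..."
    return 0;
}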
@@ -973,16 +974,18 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
         }
     }
 
-    // Split and index body text
+    if (splitData.curpos < baseTextPosition)
+        splitData.basepos = baseTextPosition;
+    else
+        splitData.basepos += splitData.curpos + 100;
+
+    // Finally: split and index body text
     LOGDEB2(("Db::add: split body\n"));
     if (!dumb_string(doc.text, noacc)) {
         LOGERR(("Db::add: dumb_string failed\n"));
         return false;
     }
     splitter.text_to_words(noacc);
-    splitData.basepos += splitData.curpos + 100;
 
 
     ////// Special terms for other metadata. No positions for these.
     // Mime type
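On the indexing side, this hunk makes Db::add jump the splitter's base position to baseTextPosition before indexing body text, instead of merely continuing past the metadata; the old basepos += curpos + 100 line moves up and becomes the fallback for metadata that somehow overran the reserved region. A tiny sketch of that arithmetic, with an invented metadata size and a simplified stand-in for the splitter's state struct:

#include <stdio.h>

static const unsigned int baseTextPosition = 100000;

// Simplified stand-in for the splitter state used in Db::add.
struct SplitDataSketch { unsigned int basepos; unsigned int curpos; };

int main()
{
    SplitDataSketch splitData;
    splitData.basepos = 0;
    splitData.curpos = 42; // invented: last position used by metadata

    if (splitData.curpos < baseTextPosition)
        splitData.basepos = baseTextPosition;        // the normal case
    else
        splitData.basepos += splitData.curpos + 100; // oversized metadata

    printf("body text will be indexed from position %u\n",
           splitData.basepos);
    return 0;
}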
@@ -1425,7 +1428,7 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
     return true;
 }
 
-// Prepare query out of "advanced search" data
+// Prepare query out of user search data
 bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
                   const string& stemlang)
 {
@@ -1447,7 +1450,6 @@ bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
         m_reason += sdata->getReason();
         return false;
     }
-
     m_ndb->query = xq;
     delete m_ndb->enquire;
     m_ndb->enquire = new Xapian::Enquire(m_ndb->db);