optimized abstract building: bye-bye big vector
This commit is contained in:
parent
3d88716b9a
commit
047c1ba1fa
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.88 2006-11-09 17:37:16 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.89 2006-11-10 17:18:01 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -1642,23 +1642,21 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
|||||||
|
|
||||||
Chrono chron;
|
Chrono chron;
|
||||||
|
|
||||||
// The terms array that we populate with the document terms, at
|
|
||||||
// their position:
|
|
||||||
vector<string> termsVec;
|
|
||||||
|
|
||||||
// For each of the query terms, query xapian for its positions
|
// For each of the query terms, query xapian for its positions
|
||||||
// list in the document. For each position entry, remember it in qtermposs
|
// list in the document. For each position entry, remember it in qtermposs
|
||||||
// and insert it and its neighbours in the set of 'interesting' positions
|
// and insert it and its neighbours in the set of 'interesting' positions
|
||||||
|
|
||||||
|
// The terms 'array' that we partially populate with the document
|
||||||
|
// terms, at their positions around the search terms positions:
|
||||||
|
map<unsigned int, string> sparseDoc;
|
||||||
|
|
||||||
// All the query term positions. We remember this mainly because we are
|
// All the query term positions. We remember this mainly because we are
|
||||||
// going to random-shuffle it for selecting the chunks that we actually
|
// going to random-shuffle it for selecting the chunks that we actually
|
||||||
// print.
|
// print.
|
||||||
vector<unsigned int> qtermposs;
|
vector<unsigned int> qtermposs;
|
||||||
// The set of all the positions we shall populate with the query terms and
|
|
||||||
// their neighbour words.
|
|
||||||
set<unsigned int> chunkposs;
|
|
||||||
// Limit the total number of slots we populate.
|
// Limit the total number of slots we populate.
|
||||||
const unsigned int maxtotaloccs = 200;
|
const unsigned int maxtotaloccs = 300;
|
||||||
// Max occurrences per term. We initially know nothing about the
|
// Max occurrences per term. We initially know nothing about the
|
||||||
// occurrences repartition (it would be possible that only one
|
// occurrences repartition (it would be possible that only one
|
||||||
// term in the list occurs, or that all do). So this is a rather
|
// term in the list occurs, or that all do). So this is a rather
|
||||||
@ -1670,24 +1668,24 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
|||||||
qit++) {
|
qit++) {
|
||||||
Xapian::PositionIterator pos;
|
Xapian::PositionIterator pos;
|
||||||
// There may be query terms not in this doc. This raises an
|
// There may be query terms not in this doc. This raises an
|
||||||
// exception when requesting the position list, we just catch it.
|
// exception when requesting the position list, we catch it.
|
||||||
|
string emptys;
|
||||||
try {
|
try {
|
||||||
unsigned int occurrences = 0;
|
unsigned int occurrences = 0;
|
||||||
for (pos = db.positionlist_begin(docid, *qit);
|
for (pos = db.positionlist_begin(docid, *qit);
|
||||||
pos != db.positionlist_end(docid, *qit); pos++) {
|
pos != db.positionlist_end(docid, *qit); pos++) {
|
||||||
unsigned int ipos = *pos;
|
unsigned int ipos = *pos;
|
||||||
LOGDEB2(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
LOGDEB2(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
||||||
// Possibly extend the array. Do it in big chunks
|
|
||||||
if (ipos + m_db->m_synthAbsWordCtxLen >= termsVec.size()) {
|
|
||||||
termsVec.resize(ipos + m_db->m_synthAbsWordCtxLen + 1000);
|
|
||||||
}
|
|
||||||
// Remember the term position
|
// Remember the term position
|
||||||
qtermposs.push_back(ipos);
|
qtermposs.push_back(ipos);
|
||||||
// Add adjacent slots to the set to populate at next step
|
// Add adjacent slots to the set to populate at next step
|
||||||
for (unsigned int ii = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
||||||
ii <= MIN(ipos+m_db->m_synthAbsWordCtxLen, termsVec.size()-1);
|
unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
|
||||||
ii++) {
|
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||||
chunkposs.insert(ii);
|
if (ii == ipos)
|
||||||
|
sparseDoc[ii] = *qit;
|
||||||
|
else
|
||||||
|
sparseDoc[ii] = emptys;
|
||||||
}
|
}
|
||||||
// Limit the number of occurrences we keep for each
|
// Limit the number of occurrences we keep for each
|
||||||
// term. The abstract has a finite length anyway !
|
// term. The abstract has a finite length anyway !
|
||||||
@ -1705,10 +1703,10 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
|||||||
LOGDEB(("Abstract:%d:chosen number of positions %d. Populating\n",
|
LOGDEB(("Abstract:%d:chosen number of positions %d. Populating\n",
|
||||||
chron.millis(), qtermposs.size()));
|
chron.millis(), qtermposs.size()));
|
||||||
|
|
||||||
// Walk the full document position list and populate slots around
|
// Walk the full document position list (for each term walk
|
||||||
// the query terms. We arbitrarily truncate the list to avoid
|
// position list) and populate slots around the query terms. We
|
||||||
// taking forever. If we do cutoff, the abstract may be
|
// arbitrarily truncate the list to avoid taking forever. If we do
|
||||||
// inconsistent, which is bad...
|
// cutoff, the abstract may be inconsistent, which is bad...
|
||||||
{
|
{
|
||||||
Xapian::TermIterator term;
|
Xapian::TermIterator term;
|
||||||
int cutoff = 500 * 1000;
|
int cutoff = 500 * 1000;
|
||||||
@ -1727,16 +1725,16 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
|||||||
LOGDEB(("Abstract: max term count cutoff\n"));
|
LOGDEB(("Abstract: max term count cutoff\n"));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
unsigned int ipos = *pos;
|
map<unsigned int, string>::iterator vit;
|
||||||
if (chunkposs.find(ipos) != chunkposs.end()) {
|
if ((vit=sparseDoc.find(*pos)) != sparseDoc.end()) {
|
||||||
// Don't replace a term: the terms list is in
|
// Don't replace a term: the terms list is in
|
||||||
// alphabetic order, and we may have several terms
|
// alphabetic order, and we may have several terms
|
||||||
// at the same position, we want to keep only the
|
// at the same position, we want to keep only the
|
||||||
// first one (ie: dockes and dockes@wanadoo.fr)
|
// first one (ie: dockes and dockes@wanadoo.fr)
|
||||||
if (termsVec[ipos].empty()) {
|
if (vit->second.empty()) {
|
||||||
LOGDEB2(("Abstract: populating: [%s] at %d\n",
|
LOGDEB2(("Abstract: populating: [%s] at %d\n",
|
||||||
(*term).c_str(), ipos));
|
(*term).c_str(), *pos));
|
||||||
termsVec[ipos] = *term;
|
sparseDoc[*pos] = *term;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1747,7 +1745,8 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
|||||||
// Debug only: output the full term[position] vector
|
// Debug only: output the full term[position] vector
|
||||||
bool epty = false;
|
bool epty = false;
|
||||||
int ipos = 0;
|
int ipos = 0;
|
||||||
for (vector<string>::iterator it = termsVec.begin(); it != termsVec.end();
|
for (map<unsigned int, string>::iterator it = sparseDoc.begin();
|
||||||
|
it != sparseDoc.end();
|
||||||
it++, ipos++) {
|
it++, ipos++) {
|
||||||
if (it->empty()) {
|
if (it->empty()) {
|
||||||
if (!epty)
|
if (!epty)
|
||||||
@ -1770,50 +1769,48 @@ string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
|||||||
map<unsigned int, string> mabs;
|
map<unsigned int, string> mabs;
|
||||||
unsigned int abslen = 0;
|
unsigned int abslen = 0;
|
||||||
|
|
||||||
// Extract data around the first (in random order) query term
|
// Extract data around the N first (in random order) query term
|
||||||
// positions, and store the terms in the map. Don't concatenate
|
// positions, and store the terms in the map. Don't concatenate
|
||||||
// immediately into chunks because there might be overlaps
|
// immediately into chunks because there might be overlaps
|
||||||
for (vector<unsigned int>::const_iterator it = qtermposs.begin();
|
for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
|
||||||
it != qtermposs.end(); it++) {
|
pos != qtermposs.end(); pos++) {
|
||||||
|
|
||||||
if (int(abslen) > m_db->m_synthAbsLen)
|
if (int(abslen) > m_db->m_synthAbsLen)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
unsigned int ipos = *it;
|
unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
|
||||||
unsigned int beg = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
|
||||||
unsigned int fin = MIN(ipos+m_db->m_synthAbsWordCtxLen,
|
|
||||||
termsVec.size()-1);
|
|
||||||
LOGDEB2(("Abstract: %d<-%d->%d\n", beg, ipos, fin));
|
|
||||||
|
|
||||||
for (unsigned int ii = beg; ii <= fin; ii++) {
|
LOGDEB2(("Abstract: %d<-%d->%d\n", sta, *pos, sto));
|
||||||
|
|
||||||
|
for (unsigned int ii = sta; ii <= sto; ii++) {
|
||||||
|
|
||||||
if (int(abslen) > m_db->m_synthAbsLen)
|
if (int(abslen) > m_db->m_synthAbsLen)
|
||||||
break;
|
break;
|
||||||
|
map<unsigned int, string>::const_iterator vit =
|
||||||
if (!termsVec[ii].empty()) {
|
sparseDoc.find(ii);
|
||||||
|
if (vit != sparseDoc.end() && !vit->second.empty()) {
|
||||||
LOGDEB2(("Abstract: position %d -> [%s]\n",
|
LOGDEB2(("Abstract: position %d -> [%s]\n",
|
||||||
ii, termsVec[ii].c_str()));
|
ii, vit->second.c_str()));
|
||||||
mabs[ii] = termsVec[ii];
|
mabs[ii] = vit->second;
|
||||||
abslen += termsVec[ii].length();
|
abslen += vit->second.length();
|
||||||
} else {
|
} else {
|
||||||
LOGDEB2(("Abstract: empty position at %d\n", ii));
|
LOGDEB2(("Abstract: empty position at %d\n", ii));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Possibly add a ... at the end of chunk if it's not
|
// Possibly add a ... at the end of chunk if it's not
|
||||||
// overlapping and not at the end of doc
|
// overlapping
|
||||||
if (fin != termsVec.size()-1) {
|
if (mabs.find(sto+1) == mabs.end())
|
||||||
if (mabs.find(fin+1) == mabs.end())
|
mabs[sto+1] = "...";
|
||||||
mabs[fin+1] = "...";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build the abstract by walking the map (in order of position)
|
// Build the abstract by walking the map (in order of position)
|
||||||
string abstract;
|
string abstract;
|
||||||
for (map<unsigned int, string>::const_iterator it = mabs.begin();
|
for (map<unsigned int, string>::const_iterator it = mabs.begin();
|
||||||
it != mabs.end(); it++) {
|
it != mabs.end(); it++) {
|
||||||
LOGDEB2(("Abtract: output [%s]\n", (*it).second.c_str()));
|
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
||||||
abstract += (*it).second + " ";
|
abstract += it->second + " ";
|
||||||
}
|
}
|
||||||
LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
|
LOGDEB(("Abtract: done in %d mS\n", chron.millis()));
|
||||||
return abstract;
|
return abstract;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user