rclabstract: fixed log call formats and indentation

This commit is contained in:
Jean-Francois Dockes 2017-12-07 10:51:10 +01:00
parent 653b1fb5a5
commit 207e3d5af7

View File

@ -50,7 +50,7 @@ static void listList(const string& what, const vector<string>&l)
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) { for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
a = a + *it + " "; a = a + *it + " ";
} }
LOGDEB("" << (what) << ": " << (a) << "\n" ); LOGDEB("" << what << ": " << a << "\n");
} }
#else #else
#define LOGABS LOGDEB2 #define LOGABS LOGDEB2
@ -67,12 +67,12 @@ static const bool prune_prefixed_terms = true;
static void noPrefixList(const vector<string>& in, vector<string>& out) static void noPrefixList(const vector<string>& in, vector<string>& out)
{ {
for (vector<string>::const_iterator qit = in.begin(); for (vector<string>::const_iterator qit = in.begin();
qit != in.end(); qit++) { qit != in.end(); qit++) {
if (prune_prefixed_terms) { if (prune_prefixed_terms) {
if (has_prefix(*qit)) if (has_prefix(*qit))
continue; continue;
} }
out.push_back(strip_prefix(*qit)); out.push_back(strip_prefix(*qit));
} }
sort(out.begin(), out.end()); sort(out.begin(), out.end());
vector<string>::iterator it = unique(out.begin(), out.end()); vector<string>::iterator it = unique(out.begin(), out.end());
@ -82,8 +82,8 @@ static void noPrefixList(const vector<string>& in, vector<string>& out)
bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms) bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
{ {
if (!xenquire) { if (!xenquire) {
LOGERR("Query::getMatchTerms: no query opened\n" ); LOGERR("Query::getMatchTerms: no query opened\n");
return false; return false;
} }
terms.clear(); terms.clear();
@ -95,8 +95,8 @@ bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
xenquire->get_matching_terms_end(id)), xenquire->get_matching_terms_end(id)),
m_q->m_db->m_ndb->xrdb, m_q->m_reason); m_q->m_db->m_ndb->xrdb, m_q->m_reason);
if (!m_q->m_reason.empty()) { if (!m_q->m_reason.empty()) {
LOGERR("getMatchTerms: xapian error: " << (m_q->m_reason) << "\n" ); LOGERR("getMatchTerms: xapian error: " << m_q->m_reason << "\n");
return false; return false;
} }
noPrefixList(iterms, terms); noPrefixList(iterms, terms);
return true; return true;
@ -109,25 +109,26 @@ void Query::Native::setDbWideQTermsFreqs()
{ {
// Do it once only for a given query. // Do it once only for a given query.
if (!termfreqs.empty()) if (!termfreqs.empty())
return; return;
vector<string> qterms; vector<string> qterms;
{ {
vector<string> iqterms; vector<string> iqterms;
m_q->getQueryTerms(iqterms); m_q->getQueryTerms(iqterms);
noPrefixList(iqterms, qterms); noPrefixList(iqterms, qterms);
} }
// listList("Query terms: ", qterms); // listList("Query terms: ", qterms);
Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb; Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
double doccnt = xrdb.get_doccount(); double doccnt = xrdb.get_doccount();
if (doccnt == 0) if (doccnt == 0)
doccnt = 1; doccnt = 1;
for (vector<string>::const_iterator qit = qterms.begin(); for (vector<string>::const_iterator qit = qterms.begin();
qit != qterms.end(); qit++) { qit != qterms.end(); qit++) {
termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt; termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
LOGABS("setDbWideQTermFreqs: [" << (qit) << "] db freq " << (termfreqs[*qit]) << "\n" ); LOGABS("setDbWideQTermFreqs: [" << *qit << "] db freq " <<
termfreqs[*qit] << "\n");
} }
} }
@ -143,10 +144,10 @@ void Query::Native::setDbWideQTermsFreqs()
// occurrences, and let the frequency for each group member be the // occurrences, and let the frequency for each group member be the
// aggregated frequency. // aggregated frequency.
double Query::Native::qualityTerms(Xapian::docid docid, double Query::Native::qualityTerms(Xapian::docid docid,
const vector<string>& terms, const vector<string>& terms,
multimap<double, vector<string> >& byQ) multimap<double, vector<string> >& byQ)
{ {
LOGABS("qualityTerms\n" ); LOGABS("qualityTerms\n");
setDbWideQTermsFreqs(); setDbWideQTermsFreqs();
map<string, double> termQcoefs; map<string, double> termQcoefs;
@ -155,46 +156,46 @@ double Query::Native::qualityTerms(Xapian::docid docid,
Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb; Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
double doclen = xrdb.get_doclength(docid); double doclen = xrdb.get_doclength(docid);
if (doclen == 0) if (doclen == 0)
doclen = 1; doclen = 1;
HighlightData hld; HighlightData hld;
if (m_q->m_sd) { if (m_q->m_sd) {
m_q->m_sd->getTerms(hld); m_q->m_sd->getTerms(hld);
} }
#ifdef DEBUGABSTRACT #ifdef DEBUGABSTRACT
{ {
string deb; string deb;
hld.toString(deb); hld.toString(deb);
LOGABS("qualityTerms: hld: " << (deb) << "\n" ); LOGABS("qualityTerms: hld: " << deb << "\n");
} }
#endif #endif
// Group the input terms by the user term they were possibly expanded from // Group the input terms by the user term they were possibly expanded from
map<string, vector<string> > byRoot; map<string, vector<string> > byRoot;
for (vector<string>::const_iterator qit = terms.begin(); for (vector<string>::const_iterator qit = terms.begin();
qit != terms.end(); qit++) { qit != terms.end(); qit++) {
map<string, string>::const_iterator eit = hld.terms.find(*qit); map<string, string>::const_iterator eit = hld.terms.find(*qit);
if (eit != hld.terms.end()) { if (eit != hld.terms.end()) {
byRoot[eit->second].push_back(*qit); byRoot[eit->second].push_back(*qit);
} else { } else {
LOGDEB0("qualityTerms: [" << ((*qit)) << "] not found in hld\n" ); LOGDEB0("qualityTerms: [" << *qit << "] not found in hld\n");
byRoot[*qit].push_back(*qit); byRoot[*qit].push_back(*qit);
} }
} }
#ifdef DEBUGABSTRACT #ifdef DEBUGABSTRACT
{ {
string byRootstr; string byRootstr;
for (map<string, vector<string> >::const_iterator debit = for (map<string, vector<string> >::const_iterator debit =
byRoot.begin(); debit != byRoot.end(); debit++) { byRoot.begin(); debit != byRoot.end(); debit++) {
byRootstr.append("[").append(debit->first).append("]->"); byRootstr.append("[").append(debit->first).append("]->");
for (vector<string>::const_iterator it = debit->second.begin(); for (vector<string>::const_iterator it = debit->second.begin();
it != debit->second.end(); it++) { it != debit->second.end(); it++) {
byRootstr.append("[").append(*it).append("] "); byRootstr.append("[").append(*it).append("] ");
} }
byRootstr.append("\n"); byRootstr.append("\n");
} }
LOGABS("\nqualityTerms: uterms to terms: " << (byRootstr) << "\n" ); LOGABS("\nqualityTerms: uterms to terms: " << byRootstr << "\n");
} }
#endif #endif
@ -202,51 +203,51 @@ double Query::Native::qualityTerms(Xapian::docid docid,
map<string, double> grpwdfs; map<string, double> grpwdfs;
map<string, double> grptfreqs; map<string, double> grptfreqs;
for (map<string, vector<string> >::const_iterator git = byRoot.begin(); for (map<string, vector<string> >::const_iterator git = byRoot.begin();
git != byRoot.end(); git++) { git != byRoot.end(); git++) {
for (vector<string>::const_iterator qit = git->second.begin(); for (vector<string>::const_iterator qit = git->second.begin();
qit != git->second.end(); qit++) { qit != git->second.end(); qit++) {
Xapian::TermIterator term = xrdb.termlist_begin(docid); Xapian::TermIterator term = xrdb.termlist_begin(docid);
term.skip_to(*qit); term.skip_to(*qit);
if (term != xrdb.termlist_end(docid) && *term == *qit) { if (term != xrdb.termlist_end(docid) && *term == *qit) {
if (grpwdfs.find(git->first) != grpwdfs.end()) { if (grpwdfs.find(git->first) != grpwdfs.end()) {
grpwdfs[git->first] = term.get_wdf() / doclen; grpwdfs[git->first] = term.get_wdf() / doclen;
grptfreqs[git->first] = termfreqs[*qit]; grptfreqs[git->first] = termfreqs[*qit];
} else { } else {
grpwdfs[git->first] += term.get_wdf() / doclen; grpwdfs[git->first] += term.get_wdf() / doclen;
grptfreqs[git->first] += termfreqs[*qit]; grptfreqs[git->first] += termfreqs[*qit];
} }
} }
} }
} }
// Build a sorted by quality container for the groups // Build a sorted by quality container for the groups
for (map<string, vector<string> >::const_iterator git = byRoot.begin(); for (map<string, vector<string> >::const_iterator git = byRoot.begin();
git != byRoot.end(); git++) { git != byRoot.end(); git++) {
double q = (grpwdfs[git->first]) * grptfreqs[git->first]; double q = (grpwdfs[git->first]) * grptfreqs[git->first];
q = -log10(q); q = -log10(q);
if (q < 3) { if (q < 3) {
q = 0.05; q = 0.05;
} else if (q < 4) { } else if (q < 4) {
q = 0.3; q = 0.3;
} else if (q < 5) { } else if (q < 5) {
q = 0.7; q = 0.7;
} else if (q < 6) { } else if (q < 6) {
q = 0.8; q = 0.8;
} else { } else {
q = 1; q = 1;
} }
totalweight += q; totalweight += q;
byQ.insert(pair<double, vector<string> >(q, git->second)); byQ.insert(pair<double, vector<string> >(q, git->second));
} }
#ifdef DEBUGABSTRACT #ifdef DEBUGABSTRACT
for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin(); for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();
mit != byQ.rend(); mit++) { mit != byQ.rend(); mit++) {
LOGABS("qualityTerms: group\n" ); LOGABS("qualityTerms: group\n");
for (vector<string>::const_iterator qit = mit->second.begin(); for (vector<string>::const_iterator qit = mit->second.begin();
qit != mit->second.end(); qit++) { qit != mit->second.end(); qit++) {
LOGABS("" << (mit->first) << "->[" << (qit) << "]\n" ); LOGABS("" << mit->first << "->[" << *qit << "]\n");
} }
} }
#endif #endif
return totalweight; return totalweight;
@ -257,8 +258,8 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
{ {
LOGDEB("Query::Native::getFirstMatchPage\n"); LOGDEB("Query::Native::getFirstMatchPage\n");
if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) { if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
LOGERR("Query::getFirstMatchPage: no db\n" ); LOGERR("Query::getFirstMatchPage: no db\n");
return -1; return -1;
} }
Rcl::Db::Native *ndb(m_q->m_db->m_ndb); Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
Xapian::Database& xrdb(ndb->xrdb); Xapian::Database& xrdb(ndb->xrdb);
@ -267,15 +268,15 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
getMatchTerms(docid, terms); getMatchTerms(docid, terms);
if (terms.empty()) { if (terms.empty()) {
LOGDEB("getFirstMatchPage: empty match term list (field match?)\n" ); LOGDEB("getFirstMatchPage: empty match term list (field match?)\n");
return -1; return -1;
} }
vector<int> pagepos; vector<int> pagepos;
ndb->getPagePositions(docid, pagepos); ndb->getPagePositions(docid, pagepos);
if (pagepos.empty()) if (pagepos.empty())
return -1; return -1;
setDbWideQTermsFreqs(); setDbWideQTermsFreqs();
// We try to use a page which matches the "best" term. Get a sorted list // We try to use a page which matches the "best" term. Get a sorted list
@ -283,25 +284,25 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
qualityTerms(docid, terms, byQ); qualityTerms(docid, terms, byQ);
for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
mit != byQ.rend(); mit++) { mit != byQ.rend(); mit++) {
for (vector<string>::const_iterator qit = mit->second.begin(); for (vector<string>::const_iterator qit = mit->second.begin();
qit != mit->second.end(); qit++) { qit != mit->second.end(); qit++) {
string qterm = *qit; string qterm = *qit;
Xapian::PositionIterator pos; Xapian::PositionIterator pos;
string emptys; string emptys;
try { try {
for (pos = xrdb.positionlist_begin(docid, qterm); for (pos = xrdb.positionlist_begin(docid, qterm);
pos != xrdb.positionlist_end(docid, qterm); pos++) { pos != xrdb.positionlist_end(docid, qterm); pos++) {
int pagenum = ndb->getPageNumberForPosition(pagepos, *pos); int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
if (pagenum > 0) { if (pagenum > 0) {
term = qterm; term = qterm;
return pagenum; return pagenum;
} }
} }
} catch (...) { } catch (...) {
// Term does not occur. No problem. // Term does not occur. No problem.
} }
} }
} }
return -1; return -1;
} }
@ -312,18 +313,19 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
// DatabaseModified and other general exceptions are catched and // DatabaseModified and other general exceptions are catched and
// possibly retried by our caller // possibly retried by our caller
int Query::Native::makeAbstract(Xapian::docid docid, int Query::Native::makeAbstract(Xapian::docid docid,
vector<Snippet>& vabs, vector<Snippet>& vabs,
int imaxoccs, int ictxwords) int imaxoccs, int ictxwords)
{ {
Chrono chron; Chrono chron;
LOGABS("makeAbstract: docid " << (long(docid)) << " imaxoccs " << (imaxoccs) << " ictxwords " << (ictxwords) << "\n" ); LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
imaxoccs << " ictxwords " << ictxwords << "\n");
// The (unprefixed) terms matched by this document // The (unprefixed) terms matched by this document
vector<string> matchedTerms; vector<string> matchedTerms;
getMatchTerms(docid, matchedTerms); getMatchTerms(docid, matchedTerms);
if (matchedTerms.empty()) { if (matchedTerms.empty()) {
LOGDEB("makeAbstract::Empty term list\n" ); LOGDEB("makeAbstract:"<<chron.millis()<<"mS:Empty term list\n");
return ABSRES_ERROR; return ABSRES_ERROR;
} }
listList("Match terms: ", matchedTerms); listList("Match terms: ", matchedTerms);
@ -339,11 +341,11 @@ int Query::Native::makeAbstract(Xapian::docid docid,
// aggregated by the qualityTerms() routine. // aggregated by the qualityTerms() routine.
multimap<double, vector<string> > byQ; multimap<double, vector<string> > byQ;
double totalweight = qualityTerms(docid, matchedTerms, byQ); double totalweight = qualityTerms(docid, matchedTerms, byQ);
LOGABS("makeAbstract:" << (chron.ms()) << ": computed Qcoefs.\n" ); LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
// This can't happen, but would crash us // This can't happen, but would crash us
if (totalweight == 0.0) { if (totalweight == 0.0) {
LOGERR("makeAbstract: totalweight == 0.0 !\n" ); LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
return ABSRES_ERROR; return ABSRES_ERROR;
} }
Rcl::Db::Native *ndb(m_q->m_db->m_ndb); Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
@ -374,124 +376,129 @@ int Query::Native::makeAbstract(Xapian::docid docid,
// with words. We used to limit the character size at the end, but // with words. We used to limit the character size at the end, but
// this damaged our careful selection of terms // this damaged our careful selection of terms
const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs : const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1)); m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords; int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
LOGABS("makeAbstract:" << (chron.ms()) << ": mxttloccs " << (maxtotaloccs) << " ctxwords " << (ctxwords) << "\n" ); LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
maxtotaloccs << " ctxwords " << ctxwords << "\n");
int ret = ABSRES_OK; int ret = ABSRES_OK;
// Let's go populate // Let's go populate
for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
mit != byQ.rend(); mit++) { mit != byQ.rend(); mit++) {
unsigned int maxgrpoccs; unsigned int maxgrpoccs;
double q; double q;
if (byQ.size() == 1) { if (byQ.size() == 1) {
maxgrpoccs = maxtotaloccs; maxgrpoccs = maxtotaloccs;
q = 1.0; q = 1.0;
} else { } else {
// We give more slots to the better term groups // We give more slots to the better term groups
q = mit->first / totalweight; q = mit->first / totalweight;
maxgrpoccs = int(ceil(maxtotaloccs * q)); maxgrpoccs = int(ceil(maxtotaloccs * q));
} }
unsigned int grpoccs = 0; unsigned int grpoccs = 0;
for (vector<string>::const_iterator qit = mit->second.begin(); for (vector<string>::const_iterator qit = mit->second.begin();
qit != mit->second.end(); qit++) { qit != mit->second.end(); qit++) {
// Group done ? // Group done ?
if (grpoccs >= maxgrpoccs) if (grpoccs >= maxgrpoccs)
break; break;
string qterm = *qit; string qterm = *qit;
LOGABS("makeAbstract: [" << (qterm) << "] " << (maxgrpoccs) << " max grp occs (coef " << (q) << ")\n" ); LOGABS("makeAbstract: [" << qterm << "] " << maxgrpoccs <<
" max grp occs (coef " << q << ")\n");
// The match term may span several words // The match term may span several words
int qtrmwrdcnt = int qtrmwrdcnt =
TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS); TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
Xapian::PositionIterator pos; Xapian::PositionIterator pos;
// There may be query terms not in this doc. This raises an // There may be query terms not in this doc. This raises an
// exception when requesting the position list, we catch it ?? // exception when requesting the position list, we catch it ??
// Not clear how this can happen because we are walking the // Not clear how this can happen because we are walking the
// match list returned by Xapian. Maybe something with the // match list returned by Xapian. Maybe something with the
// fields? // fields?
string emptys; string emptys;
try { try {
for (pos = xrdb.positionlist_begin(docid, qterm); for (pos = xrdb.positionlist_begin(docid, qterm);
pos != xrdb.positionlist_end(docid, qterm); pos++) { pos != xrdb.positionlist_end(docid, qterm); pos++) {
int ipos = *pos; int ipos = *pos;
if (ipos < int(baseTextPosition)) // Not in text body if (ipos < int(baseTextPosition)) // Not in text body
continue; continue;
LOGABS("makeAbstract: [" << (qterm) << "] at pos " << (ipos) << " grpoccs " << (grpoccs) << " maxgrpoccs " << (maxgrpoccs) << "\n" ); LOGABS("makeAbstract: [" << qterm << "] at pos " <<
ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
maxgrpoccs << "\n");
totaloccs++; totaloccs++;
grpoccs++; grpoccs++;
// Add adjacent slots to the set to populate at next // Add adjacent slots to the set to populate at next
// step by inserting empty strings. Special provisions // step by inserting empty strings. Special provisions
// for adding ellipsis and for positions overlapped by // for adding ellipsis and for positions overlapped by
// the match term. // the match term.
unsigned int sta = MAX(int(baseTextPosition), unsigned int sta = MAX(int(baseTextPosition),
ipos - ctxwords); ipos - ctxwords);
unsigned int sto = ipos + qtrmwrdcnt-1 + unsigned int sto = ipos + qtrmwrdcnt-1 +
m_q->m_db->getAbsCtxLen(); m_q->m_db->getAbsCtxLen();
for (unsigned int ii = sta; ii <= sto; ii++) { for (unsigned int ii = sta; ii <= sto; ii++) {
if (ii == (unsigned int)ipos) { if (ii == (unsigned int)ipos) {
sparseDoc[ii] = qterm; sparseDoc[ii] = qterm;
searchTermPositions.insert(ii); searchTermPositions.insert(ii);
if (ii > maxpos) if (ii > maxpos)
maxpos = ii; maxpos = ii;
} else if (ii > (unsigned int)ipos && } else if (ii > (unsigned int)ipos &&
ii < (unsigned int)ipos + qtrmwrdcnt) { ii < (unsigned int)ipos + qtrmwrdcnt) {
sparseDoc[ii] = occupiedmarker; sparseDoc[ii] = occupiedmarker;
} else if (!sparseDoc[ii].compare(cstr_ellipsis)) { } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
// For an empty slot, the test has a side // For an empty slot, the test has a side
// effect of inserting an empty string which // effect of inserting an empty string which
// is what we want. // is what we want.
sparseDoc[ii] = emptys; sparseDoc[ii] = emptys;
} }
} }
// Add ellipsis at the end. This may be replaced later by // Add ellipsis at the end. This may be replaced later by
// an overlapping extract. Take care not to replace an // an overlapping extract. Take care not to replace an
// empty string here, we really want an empty slot, // empty string here, we really want an empty slot,
// use find() // use find()
if (sparseDoc.find(sto+1) == sparseDoc.end()) { if (sparseDoc.find(sto+1) == sparseDoc.end()) {
sparseDoc[sto+1] = cstr_ellipsis; sparseDoc[sto+1] = cstr_ellipsis;
} }
// Group done ? // Group done ?
if (grpoccs >= maxgrpoccs) { if (grpoccs >= maxgrpoccs) {
ret |= ABSRES_TRUNC; ret |= ABSRES_TRUNC;
LOGABS("Db::makeAbstract: max group occs cutoff\n" ); LOGABS("Db::makeAbstract: max group occs cutoff\n");
break; break;
} }
// Global done ? // Global done ?
if (totaloccs >= maxtotaloccs) { if (totaloccs >= maxtotaloccs) {
ret |= ABSRES_TRUNC; ret |= ABSRES_TRUNC;
LOGABS("Db::makeAbstract: max occurrences cutoff\n" ); LOGABS("Db::makeAbstract: max occurrences cutoff\n");
break; break;
} }
} }
} catch (...) { } catch (...) {
// Term does not occur. No problem. // Term does not occur. No problem.
} }
if (totaloccs >= maxtotaloccs) { if (totaloccs >= maxtotaloccs) {
ret |= ABSRES_TRUNC; ret |= ABSRES_TRUNC;
LOGABS("Db::makeAbstract: max1 occurrences cutoff\n" ); LOGABS("Db::makeAbstract: max1 occurrences cutoff\n");
break; break;
} }
} }
} }
maxpos += ctxwords + 1; maxpos += ctxwords + 1;
LOGABS("makeAbstract:" << (chron.millis()) << ":chosen number of positions " << (totaloccs) << "\n" ); LOGABS("makeAbstract:" << chron.millis() <<
"mS:chosen number of positions " << totaloccs << "\n");
// This can happen if there are term occurences in the keywords // This can happen if there are term occurences in the keywords
// etc. but not elsewhere ? // etc. but not elsewhere ?
if (totaloccs == 0) { if (totaloccs == 0) {
LOGDEB("makeAbstract: no occurrences\n" ); LOGDEB("makeAbstract: no occurrences\n");
return ABSRES_OK; return ABSRES_OK;
} }
// Walk all document's terms position lists and populate slots // Walk all document's terms position lists and populate slots
@ -500,69 +507,74 @@ int Query::Native::makeAbstract(Xapian::docid docid,
// inconsistant (missing words, potentially altering meaning), // inconsistant (missing words, potentially altering meaning),
// which is bad. // which is bad.
{ {
Xapian::TermIterator term; Xapian::TermIterator term;
int cutoff = m_q->m_snipMaxPosWalk; int cutoff = m_q->m_snipMaxPosWalk;
for (term = xrdb.termlist_begin(docid); for (term = xrdb.termlist_begin(docid);
term != xrdb.termlist_end(docid); term++) { term != xrdb.termlist_end(docid); term++) {
// Ignore prefixed terms // Ignore prefixed terms
if (has_prefix(*term)) if (has_prefix(*term))
continue; continue;
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) { if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
ret |= ABSRES_TERMMISS; ret |= ABSRES_TERMMISS;
LOGDEB0("makeAbstract: max term count cutoff " << (m_q->m_snipMaxPosWalk) << "\n" ); LOGDEB0("makeAbstract: max term count cutoff " <<
break; m_q->m_snipMaxPosWalk << "\n");
} break;
}
map<unsigned int, string>::iterator vit; map<unsigned int, string>::iterator vit;
Xapian::PositionIterator pos; Xapian::PositionIterator pos;
for (pos = xrdb.positionlist_begin(docid, *term); for (pos = xrdb.positionlist_begin(docid, *term);
pos != xrdb.positionlist_end(docid, *term); pos++) { pos != xrdb.positionlist_end(docid, *term); pos++) {
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) { if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
ret |= ABSRES_TERMMISS; ret |= ABSRES_TERMMISS;
LOGDEB0("makeAbstract: max term count cutoff " << (m_q->m_snipMaxPosWalk) << "\n" ); LOGDEB0("makeAbstract: max term count cutoff " <<
break; m_q->m_snipMaxPosWalk << "\n");
} break;
// If we are beyond the max possible position, stop }
// for this term // If we are beyond the max possible position, stop
if (*pos > maxpos) { // for this term
break; if (*pos > maxpos) {
} break;
if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) { }
// Don't replace a term: the terms list is in if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
// alphabetic order, and we may have several terms // Don't replace a term: the terms list is in
// at the same position, we want to keep only the // alphabetic order, and we may have several terms
// first one (ie: dockes and dockes@wanadoo.fr) // at the same position, we want to keep only the
if (vit->second.empty()) { // first one (ie: dockes and dockes@wanadoo.fr)
LOGDEB2("makeAbstract: populating: [" << ((*term)) << "] at " << (*pos) << "\n" ); if (vit->second.empty()) {
sparseDoc[*pos] = *term; LOGDEB2("makeAbstract: populating: [" << *term <<
} "] at " << *pos << "\n");
} sparseDoc[*pos] = *term;
} }
} }
}
}
} }
LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n");
#if 0 #if 0
// Debug only: output the full term[position] vector // Debug only: output the full term[position] vector
bool epty = false; bool epty = false;
int ipos = 0; int ipos = 0;
for (map<unsigned int, string>::iterator it = sparseDoc.begin(); for (map<unsigned int, string>::iterator it = sparseDoc.begin();
it != sparseDoc.end(); it != sparseDoc.end();
it++, ipos++) { it++, ipos++) {
if (it->empty()) { if (it->empty()) {
if (!epty) if (!epty)
LOGDEB("makeAbstract:vec[" << (ipos) << "]: [" << (it) << "]\n" ); LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
epty=true; epty=true;
} else { } else {
epty = false; epty = false;
LOGDEB("makeAbstract:vec[" << (ipos) << "]: [" << (it) << "]\n" ); LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
} }
} }
#endif #endif
vector<int> vpbreaks; vector<int> vpbreaks;
ndb->getPagePositions(docid, vpbreaks); ndb->getPagePositions(docid, vpbreaks);
LOGABS("makeAbstract:" << (chron.millis()) << ": extracting. Got " << (vpbreaks.size()) << " pages\n" ); LOGABS("makeAbstract:" << chron.millis() << "mS: extracting. Got " <<
vpbreaks.size() << " pages\n");
// Finally build the abstract by walking the map (in order of position) // Finally build the abstract by walking the map (in order of position)
vabs.clear(); vabs.clear();
string chunk; string chunk;
@ -570,45 +582,43 @@ int Query::Native::makeAbstract(Xapian::docid docid,
int page = 0; int page = 0;
string term; string term;
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin(); for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
it != sparseDoc.end(); it++) { it != sparseDoc.end(); it++) {
LOGDEB2("Abtract:output " << (it->first) << " -> [" << (it->second) << "]\n" ); LOGDEB2("Abtract:output " << it->first << " -> [" << it->second <<
if (!occupiedmarker.compare(it->second)) { "]\n");
LOGDEB("Abstract: qtrm position not filled ??\n" ); if (!occupiedmarker.compare(it->second)) {
continue; LOGDEB("Abstract: qtrm position not filled ??\n");
} continue;
if (chunk.empty() && !vpbreaks.empty()) { }
page = ndb->getPageNumberForPosition(vpbreaks, it->first); if (chunk.empty() && !vpbreaks.empty()) {
if (page < 0) page = ndb->getPageNumberForPosition(vpbreaks, it->first);
page = 0; if (page < 0)
term.clear(); page = 0;
} term.clear();
Utf8Iter uit(it->second); }
bool newcjk = false; Utf8Iter uit(it->second);
if (TextSplit::isCJK(*uit)) bool newcjk = false;
newcjk = true; if (TextSplit::isCJK(*uit))
if (!incjk || (incjk && !newcjk)) newcjk = true;
chunk += " "; if (!incjk || (incjk && !newcjk))
incjk = newcjk; chunk += " ";
if (searchTermPositions.find(it->first) != searchTermPositions.end()) incjk = newcjk;
term = it->second; if (searchTermPositions.find(it->first) != searchTermPositions.end())
if (it->second == cstr_ellipsis) { term = it->second;
vabs.push_back(Snippet(page, chunk).setTerm(term)); if (it->second == cstr_ellipsis) {
chunk.clear(); vabs.push_back(Snippet(page, chunk).setTerm(term));
} else { chunk.clear();
if (it->second.compare(end_of_field_term) && } else {
it->second.compare(start_of_field_term)) if (it->second.compare(end_of_field_term) &&
chunk += it->second; it->second.compare(start_of_field_term))
} chunk += it->second;
}
} }
if (!chunk.empty()) if (!chunk.empty())
vabs.push_back(Snippet(page, chunk).setTerm(term)); vabs.push_back(Snippet(page, chunk).setTerm(term));
LOGDEB2("makeAbtract: done in " << (chron.millis()) << " mS\n" ); LOGABS("makeAbtract: done in " << chron.millis() << " mS\n");
return ret; return ret;
} }
} }