From be7aed565912e9d6d8e295c77cd48bd9e7c10c66 Mon Sep 17 00:00:00 2001 From: "\"Jean-Francois Dockes ext:(%22)" Date: Thu, 27 Sep 2012 12:34:30 +0200 Subject: [PATCH] snippets: assign slots by group of derived terms --- src/rcldb/rclabstract.cpp | 307 +++++++++++++++++++++++--------------- src/rcldb/rclquery.cpp | 2 + src/rcldb/rclquery.h | 1 + src/rcldb/rclquery_p.h | 5 +- 4 files changed, 195 insertions(+), 120 deletions(-) diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp index 0771fa42..547bb629 100644 --- a/src/rcldb/rclabstract.cpp +++ b/src/rcldb/rclabstract.cpp @@ -27,16 +27,19 @@ using namespace std; #include "rclquery.h" #include "rclquery_p.h" #include "textsplit.h" - +#include "searchdata.h" #include "utf8iter.h" +#include "hldata.h" namespace Rcl { // This is used as a marker inside the abstract frag lists, but // normally doesn't remain in final output (which is built with a // custom sep. by our caller). static const string cstr_ellipsis("..."); +// This is used to mark positions overlapped by a multi-word match term +static const string occupiedmarker("?"); -#undef DEBUGABSTRACT +#define DEBUGABSTRACT #ifdef DEBUGABSTRACT #define LOGABS LOGDEB static void listList(const string& what, const vector&l) @@ -96,16 +99,22 @@ void Query::Native::setDbWideQTermsFreqs() } } -// Compute query terms quality coefficients for a matched document by +// Compute matched terms quality coefficients for a matched document by // retrieving the Within Document Frequencies and multiplying by // overal term frequency, then using log-based thresholds. // 2012: it's not too clear to me why exactly we do the log thresholds thing. // Preferring terms wich are rare either or both in the db and the document // seems reasonable though +// To avoid setting a high quality for a low frequency expansion of a +// common stem, which seems wrong, we group the terms by +// root, compute a frequency for the group from the sum of member +// occurrences, and let the frequency for each group member be the +// aggregated frequency. double Query::Native::qualityTerms(Xapian::docid docid, const vector& terms, - multimap& byQ) + map >& byQ) { + LOGABS(("qualityTerms\n")); setDbWideQTermsFreqs(); map termQcoefs; @@ -115,42 +124,88 @@ double Query::Native::qualityTerms(Xapian::docid docid, double doclen = xrdb.get_doclength(docid); if (doclen == 0) doclen = 1; + HighlightData hld; + if (!m_q->m_sd.isNull()) { + m_q->m_sd->getTerms(hld); + } + map > byRoot; for (vector::const_iterator qit = terms.begin(); qit != terms.end(); qit++) { - Xapian::TermIterator term = xrdb.termlist_begin(docid); - term.skip_to(*qit); - if (term != xrdb.termlist_end(docid) && *term == *qit) { - double q = (term.get_wdf() / doclen) * termfreqs[*qit]; - q = -log10(q); - if (q < 3) { - q = 0.05; - } else if (q < 4) { - q = 0.3; - } else if (q < 5) { - q = 0.7; - } else if (q < 6) { - q = 0.8; - } else { - q = 1; + bool found = false; + for (unsigned int gidx = 0; gidx < hld.groups.size(); gidx++) { + if (hld.groups[gidx].size() == 1 && hld.groups[gidx][0] == *qit) { + string us = hld.ugroups[hld.grpsugidx[gidx]][0]; + LOGABS(("qualityTerms: [%s] found, comes from [%s]\n", + (*qit).c_str(), us.c_str())); + byRoot[us].push_back(*qit); + found = true; } - termQcoefs[*qit] = q; - totalweight += q; + } + if (!found) { + LOGABS(("qualityTerms: [%s] not found\n", (*qit).c_str())); + byRoot[*qit].push_back(*qit); } - } + } + + map grpwdfs; + map grptfreqs; + + for (map >::const_iterator git = byRoot.begin(); + git != byRoot.end(); git++) { + for (vector::const_iterator qit = git->second.begin(); + qit != git->second.end(); qit++) { + Xapian::TermIterator term = xrdb.termlist_begin(docid); + term.skip_to(*qit); + if (term != xrdb.termlist_end(docid) && *term == *qit) { + if (grpwdfs.find(git->first) != grpwdfs.end()) { + grpwdfs[git->first] = term.get_wdf() / doclen; + grptfreqs[git->first] = termfreqs[*qit]; + } else { + grpwdfs[git->first] += term.get_wdf() / doclen; + grptfreqs[git->first] += termfreqs[*qit]; + } + } + } + } + + for (map >::const_iterator git = byRoot.begin(); + git != byRoot.end(); git++) { + double q = (grpwdfs[git->first]) * grptfreqs[git->first]; + q = -log10(q); + if (q < 3) { + q = 0.05; + } else if (q < 4) { + q = 0.3; + } else if (q < 5) { + q = 0.7; + } else if (q < 6) { + q = 0.8; + } else { + q = 1; + } + totalweight += q; + for (vector::const_iterator qit = git->second.begin(); + qit != git->second.end(); qit++) { + termQcoefs[*qit] = q; + } + } // Build a sorted by quality term list. for (vector::const_iterator qit = terms.begin(); qit != terms.end(); qit++) { if (termQcoefs.find(*qit) != termQcoefs.end()) - byQ.insert(pair(termQcoefs[*qit], *qit)); + byQ[termQcoefs[*qit]].push_back(*qit); } #ifdef DEBUGABSTRACT - LOGDEB(("Db::qualityTerms:\n")); - for (multimap::reverse_iterator qit = byQ.rbegin(); - qit != byQ.rend(); qit++) { - LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str())); + for (map >::reverse_iterator mit = byQ.rbegin(); + mit != byQ.rend(); mit++) { + LOGABS(("qualityTerms: group\n")); + for (vector::const_iterator qit = mit->second.begin(); + qit != mit->second.end(); qit++) { + LOGABS(("%.1e->[%s]\n", mit->first, qit->c_str())); + } } #endif return totalweight; @@ -185,23 +240,26 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid) setDbWideQTermsFreqs(); // We try to use a page which matches the "best" term. Get a sorted list - multimap byQ; + map > byQ; double totalweight = qualityTerms(docid, terms, byQ); - for (multimap::reverse_iterator qit = byQ.rbegin(); - qit != byQ.rend(); qit++) { - string qterm = qit->second; - Xapian::PositionIterator pos; - string emptys; - try { - for (pos = xrdb.positionlist_begin(docid, qterm); - pos != xrdb.positionlist_end(docid, qterm); pos++) { - int pagenum = ndb->getPageNumberForPosition(pagepos, *pos); - if (pagenum > 0) - return pagenum; + for (map >::reverse_iterator mit = byQ.rbegin(); + mit != byQ.rend(); mit++) { + for (vector::const_iterator qit = mit->second.begin(); + qit != mit->second.end(); qit++) { + string qterm = *qit; + Xapian::PositionIterator pos; + string emptys; + try { + for (pos = xrdb.positionlist_begin(docid, qterm); + pos != xrdb.positionlist_end(docid, qterm); pos++) { + int pagenum = ndb->getPageNumberForPosition(pagepos, *pos); + if (pagenum > 0) + return pagenum; + } + } catch (...) { + // Term does not occur. No problem. } - } catch (...) { - // Term does not occur. No problem. } } return -1; @@ -245,7 +303,7 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid, // such a group prevents displaying matches for other terms (by // removing its meaning from the maximum occurrences per term test // used while walking the list below) - multimap byQ; + map > byQ; double totalweight = qualityTerms(docid, matchedTerms, byQ); LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms())); // This can't happen, but would crash us @@ -259,9 +317,8 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid, /////////////////// // For each of the query terms, ask xapian for its positions list - // in the document. For each position entry, remember it in - // qtermposs and insert it and its neighbours in the set of - // 'interesting' positions + // in the document. For each position entry, insert it and its + // neighbours in the set of 'interesting' positions // The terms 'array' that we partially populate with the document // terms, at their positions around the search terms positions: @@ -270,7 +327,7 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid, // Total number of occurences for all terms. We stop when we have too much unsigned int totaloccs = 0; - // Limit the total number of slots we populate. The 7 is taken as + // Total number of slots we populate. The 7 is taken as // average word size. It was a mistake to have the user max // abstract size parameter in characters, we basically only deal // with words. We used to limit the character size at the end, but @@ -281,91 +338,105 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid, LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n", chron.ms(), maxtotaloccs, ctxwords)); - // This is used to mark positions overlapped by a multi-word match term - const string occupiedmarker("?"); - abstract_result ret = ABSRES_OK; // Let's go populate - for (multimap::reverse_iterator qit = byQ.rbegin(); - qit != byQ.rend(); qit++) { - string qterm = qit->second; - unsigned int maxoccs; + for (map >::reverse_iterator mit = byQ.rbegin(); + mit != byQ.rend(); mit++) { + unsigned int maxgrpoccs; + float q; if (byQ.size() == 1) { - maxoccs = maxtotaloccs; + maxgrpoccs = maxtotaloccs; + q = 1.0; } else { - // We give more slots to the better terms - float q = qit->first / totalweight; - maxoccs = int(ceil(maxtotaloccs * q)); - LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n", - qterm.c_str(), maxoccs, q)); + // We give more slots to the better term groups + q = mit->first / totalweight; + maxgrpoccs = int(ceil(maxtotaloccs * q)); } + unsigned int grpoccs = 0; - // The match term may span several words - int qtrmwrdcnt = TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS); + for (vector::const_iterator qit = mit->second.begin(); + qit != mit->second.end(); qit++) { - Xapian::PositionIterator pos; - // There may be query terms not in this doc. This raises an - // exception when requesting the position list, we catch it ?? - // Not clear how this can happen because we are walking the - // match list returned by Xapian. Maybe something with the - // fields? - string emptys; - try { - unsigned int occurrences = 0; - for (pos = xrdb.positionlist_begin(docid, qterm); - pos != xrdb.positionlist_end(docid, qterm); pos++) { - int ipos = *pos; - if (ipos < int(baseTextPosition)) // Not in text body - continue; - LOGABS(("makeAbstract: [%s] at %d occurrences %d maxoccs %d\n", - qterm.c_str(), ipos, occurrences, maxoccs)); + // Group done ? + if (grpoccs >= maxgrpoccs) + break; - totaloccs++; + string qterm = *qit; - // Add adjacent slots to the set to populate at next - // step by inserting empty strings. Special provisions - // for adding ellipsis and for positions overlapped by - // the match term. - unsigned int sta = MAX(0, ipos - ctxwords); - unsigned int sto = ipos + qtrmwrdcnt-1 + - m_q->m_db->getAbsCtxLen(); - for (unsigned int ii = sta; ii <= sto; ii++) { - if (ii == (unsigned int)ipos) { - sparseDoc[ii] = qterm; - } else if (ii > (unsigned int)ipos && - ii < (unsigned int)ipos + qtrmwrdcnt) { - sparseDoc[ii] = occupiedmarker; - } else if (!sparseDoc[ii].compare(cstr_ellipsis)) { - // For an empty slot, the test has a side - // effect of inserting an empty string which - // is what we want - sparseDoc[ii] = emptys; + LOGABS(("makeAbstract: [%s] %d max grp occs (coef %.2f)\n", + qterm.c_str(), maxgrpoccs, q)); + + // The match term may span several words + int qtrmwrdcnt = + TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS); + + Xapian::PositionIterator pos; + // There may be query terms not in this doc. This raises an + // exception when requesting the position list, we catch it ?? + // Not clear how this can happen because we are walking the + // match list returned by Xapian. Maybe something with the + // fields? + string emptys; + try { + for (pos = xrdb.positionlist_begin(docid, qterm); + pos != xrdb.positionlist_end(docid, qterm); pos++) { + int ipos = *pos; + if (ipos < int(baseTextPosition)) // Not in text body + continue; + LOGABS(("makeAbstract: [%s] at pos %d grpoccs %d maxgrpoccs %d\n", + qterm.c_str(), ipos, grpoccs, maxgrpoccs)); + + totaloccs++; + grpoccs++; + + // Add adjacent slots to the set to populate at next + // step by inserting empty strings. Special provisions + // for adding ellipsis and for positions overlapped by + // the match term. + unsigned int sta = MAX(0, ipos - ctxwords); + unsigned int sto = ipos + qtrmwrdcnt-1 + + m_q->m_db->getAbsCtxLen(); + for (unsigned int ii = sta; ii <= sto; ii++) { + if (ii == (unsigned int)ipos) { + sparseDoc[ii] = qterm; + } else if (ii > (unsigned int)ipos && + ii < (unsigned int)ipos + qtrmwrdcnt) { + sparseDoc[ii] = occupiedmarker; + } else if (!sparseDoc[ii].compare(cstr_ellipsis)) { + // For an empty slot, the test has a side + // effect of inserting an empty string which + // is what we want + sparseDoc[ii] = emptys; + } + } + // Add ellipsis at the end. This may be replaced later by + // an overlapping extract. Take care not to replace an + // empty string here, we really want an empty slot, + // use find() + if (sparseDoc.find(sto+1) == sparseDoc.end()) { + sparseDoc[sto+1] = cstr_ellipsis; + } + + // Group done ? + if (grpoccs >= maxgrpoccs) + break; + // Global done ? + if (totaloccs >= maxtotaloccs) { + ret = ABSRES_TRUNC; + LOGABS(("Db::makeAbstract: max occurrences cutoff\n")); + break; } } - // Add ellipsis at the end. This may be replaced later by - // an overlapping extract. Take care not to replace an - // empty string here, we really want an empty slot, - // use find() - if (sparseDoc.find(sto+1) == sparseDoc.end()) { - sparseDoc[sto+1] = cstr_ellipsis; - } - - // Limit to allocated occurences and total size - if (++occurrences >= maxoccs || - totaloccs >= maxtotaloccs) { - ret = ABSRES_TRUNC; - LOGDEB(("Db::makeAbstract: max occurrences cutoff\n")); - break; - } + } catch (...) { + // Term does not occur. No problem. + } + + if (totaloccs >= maxtotaloccs) { + ret = ABSRES_TRUNC; + LOGABS(("Db::makeAbstract: max1 occurrences cutoff\n")); + break; } - } catch (...) { - // Term does not occur. No problem. - } - if (totaloccs >= maxtotaloccs) { - ret = ABSRES_TRUNC; - LOGDEB(("Db::makeAbstract: max1 occurrences cutoff\n")); - break; } } LOGABS(("makeAbstract:%d:chosen number of positions %d\n", diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index ebdf3775..56105424 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -191,6 +191,7 @@ bool Query::setQuery(RefCntr sdata) m_reason.erase(); m_nq->clear(); + m_sd = sdata; Xapian::Query xq; if (!sdata->toNativeQuery(*m_db, &xq)) { @@ -244,6 +245,7 @@ bool Query::setQuery(RefCntr sdata) d.erase(0, strlen("Xapian::Query")); sdata->setDescription(d); + m_sd = sdata; LOGDEB(("Query::SetQuery: Q: %s\n", sdata->getDescription().c_str())); return true; } diff --git a/src/rcldb/rclquery.h b/src/rcldb/rclquery.h index 1431fa2e..5eaebc54 100644 --- a/src/rcldb/rclquery.h +++ b/src/rcldb/rclquery.h @@ -112,6 +112,7 @@ private: bool m_sortAscending; bool m_collapseDuplicates; int m_resCnt; + RefCntr m_sd; /* Copyconst and assignement private and forbidden */ Query(const Query &) {} diff --git a/src/rcldb/rclquery_p.h b/src/rcldb/rclquery_p.h index 04ee1e92..a3331851 100644 --- a/src/rcldb/rclquery_p.h +++ b/src/rcldb/rclquery_p.h @@ -58,8 +58,9 @@ public: int maxoccs = -1, int ctxwords = -1); int getFirstMatchPage(Xapian::docid docid); void setDbWideQTermsFreqs(); - double qualityTerms(Xapian::docid docid, const vector& terms, - std::multimap& byQ); + double qualityTerms(Xapian::docid docid, + const std::vector& terms, + std::map >& byQ); }; }