Abstracts: improve the way we group terms for quality computation

This commit is contained in:
Jean-Francois Dockes 2012-10-03 11:17:16 +02:00
parent d3a26706b5
commit c589419267
5 changed files with 81 additions and 25 deletions

View File

@ -42,7 +42,7 @@ static const string cstr_ellipsis("...");
// This is used to mark positions overlapped by a multi-word match term
static const string occupiedmarker("?");
#define DEBUGABSTRACT
#undef DEBUGABSTRACT
#ifdef DEBUGABSTRACT
#define LOGABS LOGDEB
static void listList(const string& what, const vector<string>&l)
@ -60,16 +60,16 @@ static void listList(const string&, const vector<string>&)
}
#endif
// Keep only non-prefixed terms. We use to remove prefixes and keep
// the terms instead, but field terms are normally also indexed
// un-prefixed, so this is simpler and better.
// Unprefix terms.
static void noPrefixList(const vector<string>& in, vector<string>& out)
{
for (vector<string>::const_iterator qit = in.begin();
qit != in.end(); qit++) {
if (!has_prefix(*qit))
out.push_back(*qit);
out.push_back(strip_prefix(*qit));
}
sort(out.begin(), out.end());
vector<string>::iterator it = unique(out.begin(), out.end());
out.resize(it - out.begin());
}
// Retrieve db-wide frequencies for the query terms and store them in
@ -132,26 +132,44 @@ double Query::Native::qualityTerms(Xapian::docid docid,
m_q->m_sd->getTerms(hld);
}
#ifdef DEBUGABSTRACT
{
string deb;
hld.toString(deb);
LOGABS(("qualityTerms: hld: %s\n", deb.c_str()));
}
#endif
// Group the input terms by the user term they were possibly expanded from
map<string, vector<string> > byRoot;
for (vector<string>::const_iterator qit = terms.begin();
qit != terms.end(); qit++) {
bool found = false;
for (unsigned int gidx = 0; gidx < hld.groups.size(); gidx++) {
if (hld.groups[gidx].size() == 1 && hld.groups[gidx][0] == *qit) {
string us = hld.ugroups[hld.grpsugidx[gidx]][0];
LOGABS(("qualityTerms: [%s] found, comes from [%s]\n",
(*qit).c_str(), us.c_str()));
byRoot[us].push_back(*qit);
found = true;
}
}
if (!found) {
map<string, string>::const_iterator eit = hld.terms.find(*qit);
if (eit != hld.terms.end()) {
byRoot[eit->second].push_back(*qit);
} else {
LOGDEB0(("qualityTerms: [%s] not found in hld\n", (*qit).c_str()));
byRoot[*qit].push_back(*qit);
}
}
#ifdef DEBUGABSTRACT
{
string byRootstr;
for (map<string, vector<string> >::const_iterator debit =
byRoot.begin(); debit != byRoot.end(); debit++) {
byRootstr.append("[").append(debit->first).append("]->");
for (vector<string>::const_iterator it = debit->second.begin();
it != debit->second.end(); it++) {
byRootstr.append("[").append(*it).append("] ");
}
byRootstr.append("\n");
}
LOGABS(("\nqualityTerms: uterms to terms: %s\n", byRootstr.c_str()));
}
#endif
// Compute in-document and global frequencies for the groups.
map<string, double> grpwdfs;
map<string, double> grptfreqs;

View File

@ -142,6 +142,29 @@ inline bool has_prefix(const string& trm)
#endif
}
/**
 * Return a copy of an index term with its field prefix removed.
 *
 * Two prefix conventions are handled, selected at build time by
 * RCL_INDEX_STRIPCHARS or at run time by o_index_stripchars:
 *  - stripped index: the prefix is the leading run of ASCII capital
 *    letters, and the term starts at the first other character.
 *  - raw index: has_prefix() decides whether there is a prefix at all;
 *    if so the term starts after the colon which closes the prefix.
 *
 * @param trm the term, with or without a prefix.
 * @return the term without its prefix. Empty if trm is empty, or if, in
 *         stripped mode, trm is made of capital letters only.
 */
inline string strip_prefix(const string& trm)
{
    if (trm.empty())
        return trm;
    string::size_type st = 0;
#ifndef RCL_INDEX_STRIPCHARS
    if (o_index_stripchars) {
#endif
        // Full A-Z range. The set used to lack 'G' and 'H', so that a
        // prefix containing one of them was only partially removed.
        st = trm.find_first_not_of("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
        if (st == string::npos)
            return string();
#ifndef RCL_INDEX_STRIPCHARS
    } else {
        if (!has_prefix(trm))
            return trm;
        // NOTE(review): assumes raw-mode prefixes look like ":PFX:term"
        // (see wrap_prefix) -- confirm. The prefix ends at the first
        // colon after the opening one: searching for the *last* colon
        // would cut into a term which itself contains ':'. If no
        // closing colon exists, npos + 1 wraps to 0 and the term is
        // returned whole.
        st = trm.find_first_of(':', 1) + 1;
    }
#endif
    return trm.substr(st);
}
inline string wrap_prefix(const string& pfx)
{
#ifndef RCL_INDEX_STRIPCHARS

View File

@ -745,6 +745,7 @@ void StringToXapianQ::expandTerm(int mods,
// No expansion requested: the (possibly prefixed) input term is the
// only output.
if (noexpansion) {
    sterm = term;
    oexp.push_back(prefix + term);
    // Remember which user term this query term comes from. The values
    // of HighlightData::terms are strings: assigning
    // m_hld.uterms.size() - 1 went through string::operator=(char) and
    // stored a single meaningless character. Here the query term and
    // the user term are the same.
    m_hld.terms[term] = term;
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
    return;
}
@ -790,9 +791,9 @@ void StringToXapianQ::expandTerm(int mods,
// result:
if (diac_sensitive && case_sensitive) {
// No expansion whatsoever
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field);
goto termmatchtoresult;
// No expansion whatsoever.
lexp.push_back(term);
goto exptotermatch;
} else if (diac_sensitive) {
// Expand for accents and case, filtering for same accents,
SynTermTransUnac foldtrans(UNACOP_FOLD);
@ -842,13 +843,12 @@ void StringToXapianQ::expandTerm(int mods,
lexp.resize(uit - lexp.begin());
}
// Bogus wildcard expand to generate the result
// Bogus wildcard expand to generate the result (possibly add prefixes)
exptotermatch:
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it,
res, -1, m_field);
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, -1, m_field);
}
#endif
@ -864,6 +864,11 @@ termmatchtoresult:
if (oexp.empty())
oexp.push_back(prefix + term);
// Remember the uterm-to-expansion links
for (vector<string>::const_iterator it = oexp.begin();
it != oexp.end(); it++) {
m_hld.terms[strip_prefix(*it)] = term;
}
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
}

View File

@ -17,6 +17,12 @@ struct HighlightData {
*/
std::set<std::string> uterms;
/** The db query terms linked to the uterms entry they were expanded from.
* This is used for aggregating term stats when generating snippets (for
* choosing the best terms, allocating slots, etc. )
*/
std::map<std::string, std::string> terms;
/** The original user terms-or-groups. This is for display
* purposes: ie when creating a menu to look for a specific
* matched group inside a preview window. We want to show the

View File

@ -1050,7 +1050,12 @@ void HighlightData::toString(std::string& out)
it != uterms.end(); it++) {
out.append(" [").append(*it).append("]");
}
out.append("\nUser terms to Query terms:");
for (map<string, string>::const_iterator it = terms.begin();
it != terms.end(); it++) {
out.append("[").append(it->first).append("]->[");
out.append(it->second).append("] ");
}
out.append("\nGroups: ");
char cbuf[200];
sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d",
@ -1075,13 +1080,12 @@ void HighlightData::toString(std::string& out)
out.append("}").append(cbuf);
}
out.append("\n");
fprintf(stderr, "toString ok\n");
}
void HighlightData::append(const HighlightData& hl)
{
uterms.insert(hl.uterms.begin(), hl.uterms.end());
terms.insert(hl.terms.begin(), hl.terms.end());
size_t ugsz0 = ugroups.size();
ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());