Abstracts: improve the way we group terms for quality computation

This commit is contained in:
Jean-Francois Dockes 2012-10-03 11:17:16 +02:00
parent d3a26706b5
commit c589419267
5 changed files with 81 additions and 25 deletions

View File

@ -42,7 +42,7 @@ static const string cstr_ellipsis("...");
// This is used to mark positions overlapped by a multi-word match term // This is used to mark positions overlapped by a multi-word match term
static const string occupiedmarker("?"); static const string occupiedmarker("?");
#define DEBUGABSTRACT #undef DEBUGABSTRACT
#ifdef DEBUGABSTRACT #ifdef DEBUGABSTRACT
#define LOGABS LOGDEB #define LOGABS LOGDEB
static void listList(const string& what, const vector<string>&l) static void listList(const string& what, const vector<string>&l)
@ -60,16 +60,16 @@ static void listList(const string&, const vector<string>&)
} }
#endif #endif
// Keep only non-prefixed terms. We use to remove prefixes and keep // Unprefix terms.
// the terms instead, but field terms are normally also indexed
// un-prefixed, so this is simpler and better.
static void noPrefixList(const vector<string>& in, vector<string>& out) static void noPrefixList(const vector<string>& in, vector<string>& out)
{ {
for (vector<string>::const_iterator qit = in.begin(); for (vector<string>::const_iterator qit = in.begin();
qit != in.end(); qit++) { qit != in.end(); qit++) {
if (!has_prefix(*qit)) out.push_back(strip_prefix(*qit));
out.push_back(*qit);
} }
sort(out.begin(), out.end());
vector<string>::iterator it = unique(out.begin(), out.end());
out.resize(it - out.begin());
} }
// Retrieve db-wide frequencies for the query terms and store them in // Retrieve db-wide frequencies for the query terms and store them in
@ -132,26 +132,44 @@ double Query::Native::qualityTerms(Xapian::docid docid,
m_q->m_sd->getTerms(hld); m_q->m_sd->getTerms(hld);
} }
#ifdef DEBUGABSTRACT
{
string deb;
hld.toString(deb);
LOGABS(("qualityTerms: hld: %s\n", deb.c_str()));
}
#endif
// Group the input terms by the user term they were possibly expanded from // Group the input terms by the user term they were possibly expanded from
map<string, vector<string> > byRoot; map<string, vector<string> > byRoot;
for (vector<string>::const_iterator qit = terms.begin(); for (vector<string>::const_iterator qit = terms.begin();
qit != terms.end(); qit++) { qit != terms.end(); qit++) {
bool found = false; bool found = false;
for (unsigned int gidx = 0; gidx < hld.groups.size(); gidx++) { map<string, string>::const_iterator eit = hld.terms.find(*qit);
if (hld.groups[gidx].size() == 1 && hld.groups[gidx][0] == *qit) { if (eit != hld.terms.end()) {
string us = hld.ugroups[hld.grpsugidx[gidx]][0]; byRoot[eit->second].push_back(*qit);
LOGABS(("qualityTerms: [%s] found, comes from [%s]\n", } else {
(*qit).c_str(), us.c_str()));
byRoot[us].push_back(*qit);
found = true;
}
}
if (!found) {
LOGDEB0(("qualityTerms: [%s] not found in hld\n", (*qit).c_str())); LOGDEB0(("qualityTerms: [%s] not found in hld\n", (*qit).c_str()));
byRoot[*qit].push_back(*qit); byRoot[*qit].push_back(*qit);
} }
} }
#ifdef DEBUGABSTRACT
{
string byRootstr;
for (map<string, vector<string> >::const_iterator debit =
byRoot.begin(); debit != byRoot.end(); debit++) {
byRootstr.append("[").append(debit->first).append("]->");
for (vector<string>::const_iterator it = debit->second.begin();
it != debit->second.end(); it++) {
byRootstr.append("[").append(*it).append("] ");
}
byRootstr.append("\n");
}
LOGABS(("\nqualityTerms: uterms to terms: %s\n", byRootstr.c_str()));
}
#endif
// Compute in-document and global frequencies for the groups. // Compute in-document and global frequencies for the groups.
map<string, double> grpwdfs; map<string, double> grpwdfs;
map<string, double> grptfreqs; map<string, double> grptfreqs;

View File

@ -142,6 +142,29 @@ inline bool has_prefix(const string& trm)
#endif #endif
} }
// Return a copy of trm with its leading prefix removed.
//  - Empty input is returned unchanged.
//  - When RCL_INDEX_STRIPCHARS is defined, or o_index_stripchars is set
//    at run time, the prefix is the leading run of characters from the
//    capital-letter set below; a term made only of such characters
//    yields an empty string.
//  - Otherwise, a term for which has_prefix() is true is cut just after
//    its last ':'; any other term is returned unchanged.
inline string strip_prefix(const string& trm)
{
    if (trm.empty())
        return trm;

#ifndef RCL_INDEX_STRIPCHARS
    if (!o_index_stripchars) {
        if (!has_prefix(trm))
            return trm;
        // If no ':' is found, find_last_of() yields npos and the +1 wraps
        // to 0, so the whole term is returned.
        return trm.substr(trm.find_last_of(":") + 1);
    }
#endif

    // NOTE(review): the set has no 'G' or 'H' -- presumably deliberate,
    // confirm against has_prefix().
    const string::size_type start =
        trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ");
    if (start == string::npos)
        return string();
    return trm.substr(start);
}
inline string wrap_prefix(const string& pfx) inline string wrap_prefix(const string& pfx)
{ {
#ifndef RCL_INDEX_STRIPCHARS #ifndef RCL_INDEX_STRIPCHARS

View File

@ -745,6 +745,7 @@ void StringToXapianQ::expandTerm(int mods,
if (noexpansion) { if (noexpansion) {
sterm = term; sterm = term;
oexp.push_back(prefix + term); oexp.push_back(prefix + term);
m_hld.terms[term] = m_hld.uterms.size() - 1;
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
return; return;
} }
@ -790,9 +791,9 @@ void StringToXapianQ::expandTerm(int mods,
// result: // result:
if (diac_sensitive && case_sensitive) { if (diac_sensitive && case_sensitive) {
// No expansion whatsoever // No expansion whatsoever.
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field); lexp.push_back(term);
goto termmatchtoresult; goto exptotermatch;
} else if (diac_sensitive) { } else if (diac_sensitive) {
// Expand for accents and case, filtering for same accents, // Expand for accents and case, filtering for same accents,
SynTermTransUnac foldtrans(UNACOP_FOLD); SynTermTransUnac foldtrans(UNACOP_FOLD);
@ -842,13 +843,12 @@ void StringToXapianQ::expandTerm(int mods,
lexp.resize(uit - lexp.begin()); lexp.resize(uit - lexp.begin());
} }
// Bogus wildcard expand to generate the result // Bogus wildcard expand to generate the result (possibly add prefixes)
exptotermatch: exptotermatch:
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str())); LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
for (vector<string>::const_iterator it = lexp.begin(); for (vector<string>::const_iterator it = lexp.begin();
it != lexp.end(); it++) { it != lexp.end(); it++) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, -1, m_field);
res, -1, m_field);
} }
#endif #endif
@ -864,6 +864,11 @@ termmatchtoresult:
if (oexp.empty()) if (oexp.empty())
oexp.push_back(prefix + term); oexp.push_back(prefix + term);
// Remember the uterm-to-expansion links
for (vector<string>::const_iterator it = oexp.begin();
it != oexp.end(); it++) {
m_hld.terms[strip_prefix(*it)] = term;
}
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
} }

View File

@ -17,6 +17,12 @@ struct HighlightData {
*/ */
std::set<std::string> uterms; std::set<std::string> uterms;
/** The db query terms linked to the uterms entry they were expanded from.
* This is used for aggregating term stats when generating snippets (for
* choosing the best terms, allocating slots, etc.)
*/
std::map<std::string, std::string> terms;
/** The original user terms-or-groups. This is for display /** The original user terms-or-groups. This is for display
* purposes: ie when creating a menu to look for a specific * purposes: ie when creating a menu to look for a specific
* matched group inside a preview window. We want to show the * matched group inside a preview window. We want to show the

View File

@ -1050,7 +1050,12 @@ void HighlightData::toString(std::string& out)
it != uterms.end(); it++) { it != uterms.end(); it++) {
out.append(" [").append(*it).append("]"); out.append(" [").append(*it).append("]");
} }
out.append("\nUser terms to Query terms:");
for (map<string, string>::const_iterator it = terms.begin();
it != terms.end(); it++) {
out.append("[").append(it->first).append("]->[");
out.append(it->second).append("] ");
}
out.append("\nGroups: "); out.append("\nGroups: ");
char cbuf[200]; char cbuf[200];
sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d", sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d",
@ -1075,13 +1080,12 @@ void HighlightData::toString(std::string& out)
out.append("}").append(cbuf); out.append("}").append(cbuf);
} }
out.append("\n"); out.append("\n");
fprintf(stderr, "toString ok\n");
} }
void HighlightData::append(const HighlightData& hl) void HighlightData::append(const HighlightData& hl)
{ {
uterms.insert(hl.uterms.begin(), hl.uterms.end()); uterms.insert(hl.uterms.begin(), hl.uterms.end());
terms.insert(hl.terms.begin(), hl.terms.end());
size_t ugsz0 = ugroups.size(); size_t ugsz0 = ugroups.size();
ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end()); ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());