Abstracts: improve the way we group terms for quality computation
This commit is contained in:
parent
d3a26706b5
commit
c589419267
@ -42,7 +42,7 @@ static const string cstr_ellipsis("...");
|
||||
// This is used to mark positions overlapped by a multi-word match term
|
||||
static const string occupiedmarker("?");
|
||||
|
||||
#define DEBUGABSTRACT
|
||||
#undef DEBUGABSTRACT
|
||||
#ifdef DEBUGABSTRACT
|
||||
#define LOGABS LOGDEB
|
||||
static void listList(const string& what, const vector<string>&l)
|
||||
@ -60,16 +60,16 @@ static void listList(const string&, const vector<string>&)
|
||||
}
|
||||
#endif
|
||||
|
||||
// Keep only non-prefixed terms. We use to remove prefixes and keep
|
||||
// the terms instead, but field terms are normally also indexed
|
||||
// un-prefixed, so this is simpler and better.
|
||||
// Unprefix terms.
|
||||
static void noPrefixList(const vector<string>& in, vector<string>& out)
|
||||
{
|
||||
for (vector<string>::const_iterator qit = in.begin();
|
||||
qit != in.end(); qit++) {
|
||||
if (!has_prefix(*qit))
|
||||
out.push_back(*qit);
|
||||
out.push_back(strip_prefix(*qit));
|
||||
}
|
||||
sort(out.begin(), out.end());
|
||||
vector<string>::iterator it = unique(out.begin(), out.end());
|
||||
out.resize(it - out.begin());
|
||||
}
|
||||
|
||||
// Retrieve db-wide frequencies for the query terms and store them in
|
||||
@ -132,26 +132,44 @@ double Query::Native::qualityTerms(Xapian::docid docid,
|
||||
m_q->m_sd->getTerms(hld);
|
||||
}
|
||||
|
||||
#ifdef DEBUGABSTRACT
|
||||
{
|
||||
string deb;
|
||||
hld.toString(deb);
|
||||
LOGABS(("qualityTerms: hld: %s\n", deb.c_str()));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Group the input terms by the user term they were possibly expanded from
|
||||
map<string, vector<string> > byRoot;
|
||||
for (vector<string>::const_iterator qit = terms.begin();
|
||||
qit != terms.end(); qit++) {
|
||||
bool found = false;
|
||||
for (unsigned int gidx = 0; gidx < hld.groups.size(); gidx++) {
|
||||
if (hld.groups[gidx].size() == 1 && hld.groups[gidx][0] == *qit) {
|
||||
string us = hld.ugroups[hld.grpsugidx[gidx]][0];
|
||||
LOGABS(("qualityTerms: [%s] found, comes from [%s]\n",
|
||||
(*qit).c_str(), us.c_str()));
|
||||
byRoot[us].push_back(*qit);
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
map<string, string>::const_iterator eit = hld.terms.find(*qit);
|
||||
if (eit != hld.terms.end()) {
|
||||
byRoot[eit->second].push_back(*qit);
|
||||
} else {
|
||||
LOGDEB0(("qualityTerms: [%s] not found in hld\n", (*qit).c_str()));
|
||||
byRoot[*qit].push_back(*qit);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DEBUGABSTRACT
|
||||
{
|
||||
string byRootstr;
|
||||
for (map<string, vector<string> >::const_iterator debit =
|
||||
byRoot.begin(); debit != byRoot.end(); debit++) {
|
||||
byRootstr.append("[").append(debit->first).append("]->");
|
||||
for (vector<string>::const_iterator it = debit->second.begin();
|
||||
it != debit->second.end(); it++) {
|
||||
byRootstr.append("[").append(*it).append("] ");
|
||||
}
|
||||
byRootstr.append("\n");
|
||||
}
|
||||
LOGABS(("\nqualityTerms: uterms to terms: %s\n", byRootstr.c_str()));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Compute in-document and global frequencies for the groups.
|
||||
map<string, double> grpwdfs;
|
||||
map<string, double> grptfreqs;
|
||||
|
||||
@ -142,6 +142,29 @@ inline bool has_prefix(const string& trm)
|
||||
#endif
|
||||
}
|
||||
|
||||
// Return a copy of 'trm' with its leading index prefix removed.
//
// An empty term is returned as is. Two prefix conventions are handled:
//  - stripped-characters index (the only one when RCL_INDEX_STRIPCHARS
//    is defined, else selected at run time by o_index_stripchars): the
//    prefix is the leading run of characters from the capital-letter set
//    below. A term made only of such characters yields an empty string.
//  - otherwise: a term for which has_prefix() is true is cut just after
//    its last ':'; any other term is returned unchanged.
inline string strip_prefix(const string& trm)
{
    if (trm.empty())
        return trm;

#ifndef RCL_INDEX_STRIPCHARS
    if (!o_index_stripchars) {
        if (!has_prefix(trm))
            return trm;
        // NOTE(review): cutting at the *last* ':' also removes part of a
        // term which itself contains a colon - confirm this is intended.
        return trm.substr(trm.find_last_of(":") + 1);
    }
#endif

    const string::size_type body =
        trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ");
    return body == string::npos ? string() : trm.substr(body);
}
|
||||
|
||||
inline string wrap_prefix(const string& pfx)
|
||||
{
|
||||
#ifndef RCL_INDEX_STRIPCHARS
|
||||
|
||||
@ -745,6 +745,7 @@ void StringToXapianQ::expandTerm(int mods,
|
||||
if (noexpansion) {
    // No expansion requested: the (possibly prefixed) input term is the
    // only output term.
    sterm = term;
    oexp.push_back(prefix + term);
    // Remember the query-term to user-term link, as is done for expanded
    // terms at the end of this function. The mapped value must be the
    // user term string: assigning an integer here (e.g. a container
    // size) would silently go through string::operator=(char) and store
    // a one-character garbage value.
    m_hld.terms[term] = term;
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
    return;
}
|
||||
@ -790,9 +791,9 @@ void StringToXapianQ::expandTerm(int mods,
|
||||
// result:
|
||||
|
||||
if (diac_sensitive && case_sensitive) {
|
||||
// No expansion whatsoever
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field);
|
||||
goto termmatchtoresult;
|
||||
// No expansion whatsoever.
|
||||
lexp.push_back(term);
|
||||
goto exptotermatch;
|
||||
} else if (diac_sensitive) {
|
||||
// Expand for accents and case, filtering for same accents,
|
||||
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
||||
@ -842,13 +843,12 @@ void StringToXapianQ::expandTerm(int mods,
|
||||
lexp.resize(uit - lexp.begin());
|
||||
}
|
||||
|
||||
// Bogus wildcard expand to generate the result
|
||||
// Bogus wildcard expand to generate the result (possibly add prefixes)
|
||||
exptotermatch:
|
||||
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
||||
for (vector<string>::const_iterator it = lexp.begin();
|
||||
it != lexp.end(); it++) {
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it,
|
||||
res, -1, m_field);
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, -1, m_field);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -864,6 +864,11 @@ termmatchtoresult:
|
||||
if (oexp.empty())
|
||||
oexp.push_back(prefix + term);
|
||||
|
||||
// Remember the uterm-to-expansion links
|
||||
for (vector<string>::const_iterator it = oexp.begin();
|
||||
it != oexp.end(); it++) {
|
||||
m_hld.terms[strip_prefix(*it)] = term;
|
||||
}
|
||||
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
||||
}
|
||||
|
||||
|
||||
@ -17,6 +17,12 @@ struct HighlightData {
|
||||
*/
|
||||
std::set<std::string> uterms;
|
||||
|
||||
/** The db query terms linked to the uterms entry they were expanded from.
|
||||
* This is used for aggregating term stats when generating snippets (for
|
||||
* choosing the best terms, allocating slots, etc.)
|
||||
*/
|
||||
std::map<std::string, std::string> terms;
|
||||
|
||||
/** The original user terms-or-groups. This is for display
|
||||
* purposes: ie when creating a menu to look for a specific
|
||||
* matched group inside a preview window. We want to show the
|
||||
|
||||
@ -1050,7 +1050,12 @@ void HighlightData::toString(std::string& out)
|
||||
it != uterms.end(); it++) {
|
||||
out.append(" [").append(*it).append("]");
|
||||
}
|
||||
|
||||
out.append("\nUser terms to Query terms:");
|
||||
for (map<string, string>::const_iterator it = terms.begin();
|
||||
it != terms.end(); it++) {
|
||||
out.append("[").append(it->first).append("]->[");
|
||||
out.append(it->second).append("] ");
|
||||
}
|
||||
out.append("\nGroups: ");
|
||||
char cbuf[200];
|
||||
sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d",
|
||||
@ -1075,13 +1080,12 @@ void HighlightData::toString(std::string& out)
|
||||
out.append("}").append(cbuf);
|
||||
}
|
||||
out.append("\n");
|
||||
fprintf(stderr, "toString ok\n");
|
||||
}
|
||||
|
||||
void HighlightData::append(const HighlightData& hl)
|
||||
{
|
||||
uterms.insert(hl.uterms.begin(), hl.uterms.end());
|
||||
|
||||
terms.insert(hl.terms.begin(), hl.terms.end());
|
||||
size_t ugsz0 = ugroups.size();
|
||||
ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user