Abstracts: improve the way we group terms for quality computation

parent d3a26706b5
commit c589419267
@@ -42,7 +42,7 @@ static const string cstr_ellipsis("...");
 // This is used to mark positions overlapped by a multi-word match term
 static const string occupiedmarker("?");
 
-#define DEBUGABSTRACT
+#undef DEBUGABSTRACT
 #ifdef DEBUGABSTRACT
 #define LOGABS LOGDEB
 static void listList(const string& what, const vector<string>&l)
@@ -60,16 +60,16 @@ static void listList(const string&, const vector<string>&)
 }
 #endif
 
-// Keep only non-prefixed terms. We use to remove prefixes and keep
-// the terms instead, but field terms are normally also indexed
-// un-prefixed, so this is simpler and better.
+// Unprefix terms.
 static void noPrefixList(const vector<string>& in, vector<string>& out)
 {
     for (vector<string>::const_iterator qit = in.begin();
          qit != in.end(); qit++) {
-        if (!has_prefix(*qit))
-            out.push_back(*qit);
+        out.push_back(strip_prefix(*qit));
     }
+    sort(out.begin(), out.end());
+    vector<string>::iterator it = unique(out.begin(), out.end());
+    out.resize(it - out.begin());
 }
 
 // Retrieve db-wide frequencies for the query terms and store them in
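
Aside (not part of the commit): the rewritten noPrefixList() now unprefixes every input term instead of dropping the prefixed ones, which can produce duplicates when a term was indexed both bare and under a field prefix; the sort/unique/resize idiom at the end collapses them. A minimal self-contained sketch of that idiom, with made-up values:

    #include <algorithm>
    #include <cassert>
    #include <string>
    #include <vector>
    using namespace std;

    int main()
    {
        // Same term arriving twice, e.g. once unprefixed and once after
        // stripping a field prefix (values are illustrative).
        vector<string> out;
        out.push_back("firefox");
        out.push_back("browser");
        out.push_back("firefox");

        // Deduplicate exactly as the new noPrefixList() does.
        sort(out.begin(), out.end());
        vector<string>::iterator it = unique(out.begin(), out.end());
        out.resize(it - out.begin());

        assert(out.size() == 2);  // "browser", "firefox"
        return 0;
    }
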
@@ -132,26 +132,44 @@ double Query::Native::qualityTerms(Xapian::docid docid,
         m_q->m_sd->getTerms(hld);
     }
 
+#ifdef DEBUGABSTRACT
+    {
+        string deb;
+        hld.toString(deb);
+        LOGABS(("qualityTerms: hld: %s\n", deb.c_str()));
+    }
+#endif
+
     // Group the input terms by the user term they were possibly expanded from
     map<string, vector<string> > byRoot;
     for (vector<string>::const_iterator qit = terms.begin();
          qit != terms.end(); qit++) {
         bool found = false;
-        for (unsigned int gidx = 0; gidx < hld.groups.size(); gidx++) {
-            if (hld.groups[gidx].size() == 1 && hld.groups[gidx][0] == *qit) {
-                string us = hld.ugroups[hld.grpsugidx[gidx]][0];
-                LOGABS(("qualityTerms: [%s] found, comes from [%s]\n",
-                        (*qit).c_str(), us.c_str()));
-                byRoot[us].push_back(*qit);
-                found = true;
-            }
-        }
-        if (!found) {
+        map<string, string>::const_iterator eit = hld.terms.find(*qit);
+        if (eit != hld.terms.end()) {
+            byRoot[eit->second].push_back(*qit);
+        } else {
             LOGDEB0(("qualityTerms: [%s] not found in hld\n", (*qit).c_str()));
             byRoot[*qit].push_back(*qit);
         }
     }
 
+#ifdef DEBUGABSTRACT
+    {
+        string byRootstr;
+        for (map<string, vector<string> >::const_iterator debit =
+                 byRoot.begin(); debit != byRoot.end(); debit++) {
+            byRootstr.append("[").append(debit->first).append("]->");
+            for (vector<string>::const_iterator it = debit->second.begin();
+                 it != debit->second.end(); it++) {
+                byRootstr.append("[").append(*it).append("] ");
+            }
+            byRootstr.append("\n");
+        }
+        LOGABS(("\nqualityTerms: uterms to terms: %s\n", byRootstr.c_str()));
+    }
+#endif
+
     // Compute in-document and global frequencies for the groups.
     map<string, double> grpwdfs;
     map<string, double> grptfreqs;
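
Aside (a sketch, not code from the commit): with the new HighlightData::terms map (query term -> originating user term, filled during query expansion, see further down), grouping a document's match terms by their user-term root becomes a plain map lookup instead of a scan over single-term groups. Assuming such a map, the core of the new grouping looks like this (function and parameter names are illustrative):

    #include <map>
    #include <string>
    #include <vector>
    using namespace std;

    // Group expanded query terms under the user term they came from;
    // terms with no recorded root form a group of their own.
    map<string, vector<string> > groupByRoot(const vector<string>& qterms,
                                             const map<string, string>& terms)
    {
        map<string, vector<string> > byRoot;
        for (vector<string>::const_iterator qit = qterms.begin();
             qit != qterms.end(); qit++) {
            map<string, string>::const_iterator eit = terms.find(*qit);
            if (eit != terms.end())
                byRoot[eit->second].push_back(*qit);
            else
                byRoot[*qit].push_back(*qit);
        }
        return byRoot;
    }
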
@@ -142,6 +142,29 @@ inline bool has_prefix(const string& trm)
 #endif
 }
 
+inline string strip_prefix(const string& trm)
+{
+    if (trm.empty())
+        return trm;
+    string::size_type st = 0;
+#ifndef RCL_INDEX_STRIPCHARS
+    if (o_index_stripchars) {
+#endif
+        st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ");
+        if (st == string::npos)
+            return string();
+#ifndef RCL_INDEX_STRIPCHARS
+    } else {
+        if (has_prefix(trm)) {
+            st = trm.find_last_of(":") + 1;
+        } else {
+            return trm;
+        }
+    }
+#endif
+    return trm.substr(st);
+}
+
 inline string wrap_prefix(const string& pfx)
 {
 #ifndef RCL_INDEX_STRIPCHARS
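
Aside (illustration only): the new strip_prefix() covers two index configurations. On a stripped-characters index the prefix is the run of leading upper-case ASCII letters; on a raw index prefixes are wrapped and end with a colon, so everything up to the last ':' is dropped. The stand-alone sketch below mimics both branches; the sample terms, the ':'-wrapped prefix form and the 'stripchars' flag standing in for o_index_stripchars are assumptions made for the example:

    #include <cassert>
    #include <string>
    using std::string;

    // Simplified stand-in for strip_prefix().
    static string strip_prefix_sketch(const string& trm, bool stripchars)
    {
        if (trm.empty())
            return trm;
        if (stripchars) {
            // Prefix = leading upper-case ASCII letters.
            string::size_type st =
                trm.find_first_not_of("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
            return st == string::npos ? string() : trm.substr(st);
        }
        // Raw index: drop everything up to the last ':' if there is one.
        string::size_type col = trm.find_last_of(":");
        return col == string::npos ? trm : trm.substr(col + 1);
    }

    int main()
    {
        assert(strip_prefix_sketch("XYfirefox", true) == "firefox");
        assert(strip_prefix_sketch("firefox", true) == "firefox");
        assert(strip_prefix_sketch(":XY:firefox", false) == "firefox");
        return 0;
    }
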
@@ -745,6 +745,7 @@ void StringToXapianQ::expandTerm(int mods,
     if (noexpansion) {
         sterm = term;
         oexp.push_back(prefix + term);
+        m_hld.terms[term] = m_hld.uterms.size() - 1;
         LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
         return;
     }
@@ -790,9 +791,9 @@ void StringToXapianQ::expandTerm(int mods,
     // result:
 
     if (diac_sensitive && case_sensitive) {
-        // No expansion whatsoever
-        m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field);
-        goto termmatchtoresult;
+        // No expansion whatsoever.
+        lexp.push_back(term);
+        goto exptotermatch;
     } else if (diac_sensitive) {
         // Expand for accents and case, filtering for same accents,
         SynTermTransUnac foldtrans(UNACOP_FOLD);
@@ -842,13 +843,12 @@ void StringToXapianQ::expandTerm(int mods,
         lexp.resize(uit - lexp.begin());
     }
 
-    // Bogus wildcard expand to generate the result
+    // Bogus wildcard expand to generate the result (possibly add prefixes)
 exptotermatch:
     LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
     for (vector<string>::const_iterator it = lexp.begin();
          it != lexp.end(); it++) {
-        m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it,
-                       res, -1, m_field);
+        m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res, -1, m_field);
     }
 #endif
 
@@ -864,6 +864,11 @@ termmatchtoresult:
     if (oexp.empty())
         oexp.push_back(prefix + term);
 
+    // Remember the uterm-to-expansion links
+    for (vector<string>::const_iterator it = oexp.begin();
+         it != oexp.end(); it++) {
+        m_hld.terms[strip_prefix(*it)] = term;
+    }
     LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
 }
 
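
Aside (invented example, not taken from the commit): the loop above records, for each unprefixed expansion, the user term it was produced from. Expanding a hypothetical user term "floor" might leave the map in a state like this:

    #include <map>
    #include <string>
    using std::map; using std::string;

    // What the bookkeeping above might leave in m_hld.terms after expanding
    // the user term "floor" (the expansions are assumptions for illustration).
    static map<string, string> exampleUtermLinks()
    {
        map<string, string> terms;
        terms["floor"]    = "floor";
        terms["floors"]   = "floor";
        terms["flooring"] = "floor";
        return terms;
    }

qualityTerms() later inverts this per result document to aggregate frequencies by user term (the byRoot map shown earlier).
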
@@ -17,6 +17,12 @@ struct HighlightData {
      */
     std::set<std::string> uterms;
 
+    /** The db query terms linked to the uterms entry they were expanded from.
+     * This is used for aggregating term stats when generating snippets (for
+     * choosing the best terms, allocating slots, etc. )
+     */
+    std::map<std::string, std::string> terms;
+
     /** The original user terms-or-groups. This is for display
      * purposes: ie when creating a menu to look for a specific
      * matched group inside a preview window. We want to show the
@@ -1050,7 +1050,12 @@ void HighlightData::toString(std::string& out)
          it != uterms.end(); it++) {
         out.append(" [").append(*it).append("]");
     }
+    out.append("\nUser terms to Query terms:");
+    for (map<string, string>::const_iterator it = terms.begin();
+         it != terms.end(); it++) {
+        out.append("[").append(it->first).append("]->[");
+        out.append(it->second).append("] ");
+    }
     out.append("\nGroups: ");
     char cbuf[200];
     sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d",
@@ -1075,13 +1080,12 @@ void HighlightData::toString(std::string& out)
         out.append("}").append(cbuf);
     }
     out.append("\n");
-    fprintf(stderr, "toString ok\n");
 }
 
 void HighlightData::append(const HighlightData& hl)
 {
     uterms.insert(hl.uterms.begin(), hl.uterms.end());
+    terms.insert(hl.terms.begin(), hl.terms.end());
     size_t ugsz0 = ugroups.size();
     ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());
 