Abstracts: improve the way we group terms for quality computation

2012-10-03 11:17:16 +02:00 · 2012-10-03 11:17:16 +02:00 · c589419267
commit c589419267
parent d3a26706b5
5 changed files with 81 additions and 25 deletions
--- a/src/rcldb/rclabstract.cpp
+++ b/src/rcldb/rclabstract.cpp
@ -42,7 +42,7 @@ static const string cstr_ellipsis("...");
 // This is used to mark positions overlapped by a multi-word match term
 static const string occupiedmarker("?");

-#define DEBUGABSTRACT  
+#undef DEBUGABSTRACT  
 #ifdef DEBUGABSTRACT
 #define LOGABS LOGDEB
 static void listList(const string& what, const vector<string>&l)
@ -60,16 +60,16 @@ static void listList(const string&, const vector<string>&)
 }
 #endif

-// Keep only non-prefixed terms. We use to remove prefixes and keep
-// the terms instead, but field terms are normally also indexed
-// un-prefixed, so this is simpler and better.
+// Unprefix terms: strip each term's prefix, then sort and deduplicate out.
 static void noPrefixList(const vector<string>& in, vector<string>& out) 
 {
    for (vector<string>::const_iterator qit = in.begin(); 
 	 qit != in.end(); qit++) {
-	if (!has_prefix(*qit))
-	    out.push_back(*qit);
+	out.push_back(strip_prefix(*qit)); // every term is kept, minus its prefix
    }
+    sort(out.begin(), out.end()); // unique() only collapses adjacent duplicates
+    vector<string>::iterator it = unique(out.begin(), out.end());
+    out.resize(it - out.begin()); // drop the leftover tail after the unique range
 }

 // Retrieve db-wide frequencies for the query terms and store them in
@ -132,26 +132,44 @@ double Query::Native::qualityTerms(Xapian::docid docid,
 	m_q->m_sd->getTerms(hld);
    }

+#ifdef DEBUGABSTRACT
+    {
+	string deb;
+	hld.toString(deb);
+	LOGABS(("qualityTerms: hld: %s\n", deb.c_str()));
+    }
+#endif
+
    // Group the input terms by the user term they were possibly expanded from
    map<string, vector<string> > byRoot;
    for (vector<string>::const_iterator qit = terms.begin(); 
 	 qit != terms.end(); qit++) {
 	bool found = false;
-	for (unsigned int gidx = 0; gidx < hld.groups.size(); gidx++) {
-	    if (hld.groups[gidx].size() == 1 && hld.groups[gidx][0] == *qit) {
-		string us = hld.ugroups[hld.grpsugidx[gidx]][0];
-		LOGABS(("qualityTerms: [%s] found, comes from [%s]\n", 
-			(*qit).c_str(),	us.c_str()));
-		byRoot[us].push_back(*qit);
-		found = true;
-	    }
-	} 
-	if (!found) {
+	map<string, string>::const_iterator eit = hld.terms.find(*qit);
+	if (eit != hld.terms.end()) {
+	    byRoot[eit->second].push_back(*qit);
+	} else {
 	    LOGDEB0(("qualityTerms: [%s] not found in hld\n", (*qit).c_str()));
 	    byRoot[*qit].push_back(*qit);
 	}
    }

+#ifdef DEBUGABSTRACT
+    {
+	string byRootstr;
+	for (map<string, vector<string> >::const_iterator debit = 
+		 byRoot.begin();  debit != byRoot.end(); debit++) {
+	    byRootstr.append("[").append(debit->first).append("]->");
+	    for (vector<string>::const_iterator it = debit->second.begin();
+		 it != debit->second.end(); it++) {
+		byRootstr.append("[").append(*it).append("] ");
+	    }
+	    byRootstr.append("\n");
+	}
+	LOGABS(("\nqualityTerms: uterms to terms: %s\n", byRootstr.c_str()));
+    }
+#endif
+
    // Compute in-document and global frequencies for the groups.
    map<string, double> grpwdfs;
    map<string, double> grptfreqs;
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -142,6 +142,29 @@ inline bool has_prefix(const string& trm)
 #endif
 }

+inline string strip_prefix(const string& trm) // Return trm with its leading prefix removed
+{
+    if (trm.empty())
+	return trm;
+    string::size_type st = 0; // offset of the first character after the prefix
+#ifndef RCL_INDEX_STRIPCHARS
+    if (o_index_stripchars) {
+#endif
+	st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ"); // prefix: leading run of capitals from this set (no 'G' or 'H' - NOTE(review): confirm intended)
+	if (st == string::npos)
+	    return string(); // only prefix characters: nothing is left
+#ifndef RCL_INDEX_STRIPCHARS
+    } else {
+	if (has_prefix(trm)) {
+	    st = trm.find_last_of(":") + 1; // keep what follows the last ':' - NOTE(review): assumes the term part holds no ':', confirm
+	} else {
+	    return trm; // no prefix: returned unchanged
+	}
+    }
+#endif
+    return trm.substr(st);
+}
+
 inline string wrap_prefix(const string& pfx) 
 {
 #ifndef RCL_INDEX_STRIPCHARS
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -745,6 +745,7 @@ void StringToXapianQ::expandTerm(int mods,
    if (noexpansion) {
 	sterm = term;
 	oexp.push_back(prefix + term);
+	m_hld.terms[term] = term; // no expansion: the term is its own user term (value is a string, not an index)
 	LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
 	return;
    } 
@ -790,9 +791,9 @@ void StringToXapianQ::expandTerm(int mods,
    // result:

    if (diac_sensitive && case_sensitive) {
-	// No expansion whatsoever
-	m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field);
-	goto termmatchtoresult;
+	// No expansion whatsoever. 
+	lexp.push_back(term);
+	goto exptotermatch;
    } else if (diac_sensitive) {
 	// Expand for accents and case, filtering for same accents,
 	SynTermTransUnac foldtrans(UNACOP_FOLD);
@ -842,13 +843,12 @@ void StringToXapianQ::expandTerm(int mods,
 	lexp.resize(uit - lexp.begin());
    }

-    // Bogus wildcard expand to generate the result
+    // Bogus wildcard expand to generate the result (possibly add prefixes)
 exptotermatch:
    LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
    for (vector<string>::const_iterator it = lexp.begin();
 	 it != lexp.end(); it++) {
-	m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, 
-		       res, -1, m_field);
+	m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it,  res, -1, m_field);
    }
 #endif

@ -864,6 +864,11 @@ termmatchtoresult:
    if (oexp.empty())
 	oexp.push_back(prefix + term);

+    // Remember the uterm-to-expansion links
+    for (vector<string>::const_iterator it = oexp.begin(); 
+	 it != oexp.end(); it++) {
+	m_hld.terms[strip_prefix(*it)] = term;
+    }
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
 }

--- a/src/utils/hldata.h
+++ b/src/utils/hldata.h
@ -17,6 +17,12 @@ struct HighlightData {
     */
    std::set<std::string> uterms;

+    /** Maps each (unprefixed) db query term to the uterms entry it was
+     * expanded from. This is used for aggregating term stats when generating
+     * snippets (for choosing the best terms, allocating slots, etc.)
+     */
+    std::map<std::string, std::string> terms;
+
    /** The original user terms-or-groups. This is for display
     * purposes: ie when creating a menu to look for a specific
     * matched group inside a preview window. We want to show the
--- a/src/utils/smallut.cpp
+++ b/src/utils/smallut.cpp
@ -1050,7 +1050,12 @@ void HighlightData::toString(std::string& out)
 	 it != uterms.end(); it++) {
 	out.append(" [").append(*it).append("]");
    }
-
+    out.append("\nQuery terms to User terms:"); // terms is keyed by query term, value is the user term
+    for (map<string, string>::const_iterator it = terms.begin();
+	 it != terms.end(); it++) {
+	out.append("[").append(it->first).append("]->["); // key: query term
+	out.append(it->second).append("] "); // value: user term it was expanded from
+    }
    out.append("\nGroups: ");
    char cbuf[200];
    sprintf(cbuf, "Groups size %d grpsugidx size %d ugroups size %d",
@ -1075,13 +1080,12 @@ void HighlightData::toString(std::string& out)
 	out.append("}").append(cbuf);
    }
    out.append("\n");
-    fprintf(stderr, "toString ok\n");
 }

 void HighlightData::append(const HighlightData& hl)
 {
    uterms.insert(hl.uterms.begin(), hl.uterms.end());
-
+    terms.insert(hl.terms.begin(), hl.terms.end());
    size_t ugsz0 = ugroups.size();
    ugroups.insert(ugroups.end(), hl.ugroups.begin(), hl.ugroups.end());