From ebbcc115a89d0c9a445cec22a0a96bda68486d78 Mon Sep 17 00:00:00 2001
From: "\"Jean-Francois Dockes ext:(%22)" <jfd@recoll.org>
Date: Fri, 22 Jul 2011 16:43:39 +0200
Subject: [PATCH] Allow setting a weight increase for field terms

---
 src/common/rclconfig.cpp | 102 +++++++++++++++--------------------
 src/common/rclconfig.h   |  20 ++++---
 src/query/xadump.cpp     |   2 +-
 src/rcldb/rcldb.cpp      | 111 +++++++++++++++++++++++----------------
 src/rcldb/rcldb.h        |   3 +-
 src/rcldb/searchdata.cpp |   8 ++-
 src/sampleconf/fields    |  28 +++++++---
 7 files changed, 150 insertions(+), 124 deletions(-)
diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp
index ee975168..6d437917 100644
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@@ -561,6 +561,7 @@ bool RclConfig::valueSplitAttributes(const string& whole, string& value,
     /* There is currently no way to escape a semi-colon */
     string::size_type semicol0 = whole.find_first_of(";");
     value = whole.substr(0, semicol0);
+    trimstring(value);
     string attrstr;
     if (semicol0 != string::npos && semicol0 < whole.size() - 1) {
         attrstr = whole.substr(semicol0+1);
@@ -602,6 +603,7 @@ void RclConfig::storeMissingHelperDesc(const string &s)
 // things for speed (theses are used a lot during indexing)
 bool RclConfig::readFieldsConfig(const string& cnferrloc)
 {
+    LOGDEB2(("RclConfig::readFieldsConfig\n"));
     m_fields = new ConfStack<ConfSimple>("fields", m_cdirs, true);
     if (m_fields == 0 || !m_fields->ok()) {
 	m_reason = string("No/bad fields file in: ") + cnferrloc;
@@ -615,16 +617,34 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
     for (list<string>::const_iterator it = tps.begin(); it != tps.end();it++) {
 	string val;
 	m_fields->get(*it, val, "prefixes");
-	m_fldtopfx[stringtolower(*it)] = val;
+	ConfSimple attrs;
+	FieldTraits ft;
+	if (!valueSplitAttributes(val, ft.pfx, attrs)) {
+	    LOGERR(("readFieldsConfig: bad config line for [%s]: [%s]\n", 
+		    it->c_str(), val.c_str()));
+	    return 0;
+	}
+	string tval;
+	if (attrs.get("wdfinc", tval))
+	    ft.wdfinc = atoi(tval.c_str());
+	if (attrs.get("boost", tval))
+	    ft.boost = atof(tval.c_str());
+	m_fldtotraits[stringtolower(*it)] = ft;
+	LOGDEB2(("readFieldsConfig: [%s] -> [%s] %d %.1f\n", 
+		it->c_str(), ft.pfx.c_str(), ft.wdfinc, ft.boost));
     }
-    // Add prefixes for aliases (build alias-to-canonic map while we're at it)
+
+    // Add prefixes for aliases and build alias-to-canonic map while we're at it
+    // Having the aliases in the prefix map avoids an additional indirection
+    // at index time.
     tps = m_fields->getNames("aliases");
     for (list<string>::const_iterator it = tps.begin(); it != tps.end();it++) {
 	string canonic = stringtolower(*it); // canonic name
-	string pfx;
-	map<string,string>::const_iterator pit = m_fldtopfx.find(canonic);
-	if (pit != m_fldtopfx.end()) {
-	    pfx = pit->second;
+	FieldTraits ft;
+	map<string, FieldTraits>::const_iterator pit = 
+	    m_fldtotraits.find(canonic);
+	if (pit != m_fldtotraits.end()) {
+	    ft = pit->second;
 	}
 	string aliases;
 	m_fields->get(canonic, aliases, "aliases");
@@ -632,16 +652,18 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
 	stringToStrings(aliases, l);
 	for (list<string>::const_iterator ait = l.begin();
 	     ait != l.end(); ait++) {
-	    if (!pfx.empty())
-		m_fldtopfx[stringtolower(*ait)] = pfx;
+	    if (pit != m_fldtotraits.end())
+		m_fldtotraits[stringtolower(*ait)] = ft;
 	    m_aliastocanon[stringtolower(*ait)] = canonic;
 	}
     }
+
 #if 0
-    for (map<string,string>::const_iterator it = m_fldtopfx.begin();
-	 it != m_fldtopfx.end(); it++) {
-	LOGDEB(("RclConfig::readFieldsConfig: [%s] => [%s]\n",
-		it->first.c_str(), it->second.c_str()));
+    for (map<string, FieldTraits>::const_iterator it = m_fldtotraits.begin();
+	 it != m_fldtotraits.end(); it++) {
+	LOGDEB(("readFieldsConfig: [%s] -> [%s] %d %.1f\n", 
+		it->c_str(), it->second.pfx.c_str(), it->second.wdfinc, 
+		it->second.boost));
     }
 #endif
 
@@ -666,19 +688,20 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
     return true;
 }
 
-// Return term indexing prefix for field name (ie: "filename" -> "XSFN")
-bool RclConfig::getFieldPrefix(const string& _fld, string &pfx)
+// Set *ftpp to the traits (prefix, wdfinc, boost) for a field name or alias, 0 if unknown
+bool RclConfig::getFieldTraits(const string& _fld, const FieldTraits **ftpp)
 {
     string fld = fieldCanon(_fld);
-    map<string,string>::const_iterator pit = m_fldtopfx.find(fld);
-    if (pit != m_fldtopfx.end()) {
-	pfx = pit->second;
+    map<string, FieldTraits>::const_iterator pit = m_fldtotraits.find(fld);
+    if (pit != m_fldtotraits.end()) {
+	*ftpp = &pit->second;
 	LOGDEB1(("RclConfig::getFieldPrefix: [%s]->[%s]\n", 
-		 _fld.c_str(), pfx.c_str()));
+		 _fld.c_str(), pit->second.pfx.c_str()));
 	return true;
     } else {
 	LOGDEB1(("RclConfig::readFieldsConfig: no prefix for field [%s]\n",
 		 fld.c_str()));
+	*ftpp = 0;
 	return false;
     }
 }
@@ -694,47 +717,6 @@ set<string> RclConfig::getIndexedFields()
     return flds;
 }
 
-// Return specialisations of field name for search expansion 
-// (ie: author->[author, from])
-bool RclConfig::getFieldSpecialisations(const string& fld, 
-					list<string>& children, bool top)
-{
-    if (m_fields == 0)
-        return false;
-    string sclds;
-    children.push_back(fld);
-    if (m_fields->get(fld, sclds, "specialisations")) {
-	list<string> clds;
-	stringToStrings(sclds, clds);
-	for (list<string>::const_iterator it = clds.begin();
-	     it != clds.end(); it++) {
-	    getFieldSpecialisations(*it, children, false);
-	}
-    }
-    if (top) {
-	children.sort();
-	children.unique();
-    }
-    return true;
-}
-
-// 
-bool RclConfig::getFieldSpecialisationPrefixes(const string& fld, 
-					       list<string>& pfxes)
-{
-    list<string> clds;
-    getFieldSpecialisations(fld, clds);
-    for (list<string>::const_iterator it = clds.begin();
-	 it != clds.end(); it++) {
-	string pfx;
-	if (getFieldPrefix(*it, pfx))
-	    pfxes.push_back(pfx);
-    }
-    pfxes.sort();
-    pfxes.unique();
-    return true;
-}
-
 string RclConfig::fieldCanon(const string& f)
 {
     string fld = stringtolower(f);
@@ -1075,7 +1057,7 @@ void RclConfig::initFrom(const RclConfig& r)
 	mimeview = new ConfStack<ConfSimple>(*(r.mimeview));
     if (r.m_fields)
 	m_fields = new ConfStack<ConfSimple>(*(r.m_fields));
-    m_fldtopfx = r.m_fldtopfx;
+    m_fldtotraits = r.m_fldtotraits;
     m_aliastocanon = r.m_aliastocanon;
     m_storedFields = r.m_storedFields;
     m_xattrtofld = r.m_xattrtofld;
diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h
index 3a8f94d8..4bf0c59c 100644
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@@ -55,6 +55,16 @@ public:
     bool needrecompute();
 };
 
+// Data associated with an indexed field name:
+struct FieldTraits {
+    string pfx; // Indexing prefix, e.g. "XSFN" for filename (may be empty)
+    int    wdfinc; // Index time term frequency increment (default 1)
+    double boost; // Query time boost (default 1.0)
+    FieldTraits(int i, double f) {wdfinc = i; boost = f;}
+    FieldTraits() : wdfinc(1), boost(1.0) {}
+    FieldTraits(const string& s) : pfx(s), wdfinc(1), boost(1.0) {}
+};
+
 class RclConfig {
  public:
 
@@ -188,13 +198,7 @@ class RclConfig {
     bool getMimeCatTypes(const string& cat, list<string>&);
 
     /** fields: get field prefix from field name */
-    bool getFieldPrefix(const string& fldname, string &pfx);
-    /** Get implied meanings for field name (ie: author->[author, from]) */
-    bool getFieldSpecialisations(const string& fld, 
-				 list<string>& childrens, bool top = true);
-    /** Get prefixes for specialisations of field name */
-    bool getFieldSpecialisationPrefixes(const string& fld, 
-					list<string>& pfxes);
+    bool getFieldTraits(const string& fldname, const FieldTraits **);
     const set<string>& getStoredFields() {return m_storedFields;}
     set<string> getIndexedFields();
     /** Get canonic name for possible alias */
@@ -256,7 +260,7 @@ class RclConfig {
     ConfStack<ConfSimple> *mimeconf; // but their content may depend on it.
     ConfStack<ConfSimple> *mimeview; // 
     ConfStack<ConfSimple> *m_fields;
-    map<string, string>  m_fldtopfx;
+    map<string, FieldTraits>  m_fldtotraits; // Field to field params
     map<string, string>  m_aliastocanon;
     set<string>          m_storedFields;
     map<string, string>  m_xattrtofld;
diff --git a/src/query/xadump.cpp b/src/query/xadump.cpp
index 26a18c29..9db4be34 100644
--- a/src/query/xadump.cpp
+++ b/src/query/xadump.cpp
@@ -271,7 +271,7 @@ int main(int argc, char **argv)
 	    Xapian::PostingIterator doc;
 	    for (doc = db->postlist_begin(aterm);
 		 doc != db->postlist_end(aterm); doc++) {
-		cout << *doc << " : " ;
+		cout << *doc << "(" << doc.get_wdf() << ") : " ;
 		Xapian::PositionIterator pos;
 		for (pos = doc.positionlist_begin(); 
 		     pos != doc.positionlist_end(); pos++) {
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 87908353..8c4ebf5a 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -89,33 +89,43 @@ static const string rclSyntAbs("?!#@");
 // omega
 static const string keycap("caption");
 
-// Default table for field->prefix translation.  We prefer the data
-// from rclconfig if available. Note that this is logically const
-// after initialization.  Can't use a static object to init this as
-// the static std::string objects may not be ready
-static map<string, string> fldToPrefs;
-static void initFldToPrefs() 
+// Static/Default table for field->prefix/weight translation. 
+// This is logically const after initialization. Can't use a
+// static object to init this as the static std::string objects may
+// not be ready.
+//
+// This map is searched if a match is not found in the dynamic
+// "fields" configuration (cf: Db::fieldToTraits()), meaning that the
+// entries can be overridden in the configuration, but not
+// suppressed. 
+
+static map<string, FieldTraits> fldToTraits;
+static void initFldToTraits() 
 {
-    fldToPrefs[Doc::keyabs] = string();
-    fldToPrefs["ext"] = "XE";
-    fldToPrefs[Doc::keyfn] = "XSFN";
+    // Can't remember why "abstract" is indexed without a prefix
+    // (result: it's indexed twice actually). Maybe I'll dare change
+    // this one day
+    fldToTraits[Doc::keyabs] = FieldTraits();
 
-    fldToPrefs[keycap] = "S";
-    fldToPrefs[Doc::keytt] = "S";
-    fldToPrefs["subject"] = "S";
+    fldToTraits["ext"] = FieldTraits("XE");
+    fldToTraits[Doc::keyfn] = FieldTraits("XSFN");
 
-    fldToPrefs[Doc::keyau] = "A";
-    fldToPrefs["creator"] = "A";
-    fldToPrefs["from"] = "A";
+    fldToTraits[keycap] = FieldTraits("S");
+    fldToTraits[Doc::keytt] = FieldTraits("S");
+    fldToTraits["subject"] = FieldTraits("S");
 
-    fldToPrefs[Doc::keykw] = "K";
-    fldToPrefs["keyword"] = "K";
-    fldToPrefs["tag"] = "K";
-    fldToPrefs["tags"] = "K";
+    fldToTraits[Doc::keyau] = FieldTraits("A");
+    fldToTraits["creator"] = FieldTraits("A");
+    fldToTraits["from"] = FieldTraits("A");
 
-    fldToPrefs["xapyear"] = "Y";
-    fldToPrefs["xapyearmon"] = "M";
-    fldToPrefs["xapdate"] = "D";
+    fldToTraits[Doc::keykw] = FieldTraits("K");
+    fldToTraits["keyword"] = FieldTraits("K");
+    fldToTraits["tag"] = FieldTraits("K");
+    fldToTraits["tags"] = FieldTraits("K");
+
+    fldToTraits["xapyear"] = FieldTraits("Y");
+    fldToTraits["xapyearmon"] = FieldTraits("M");
+    fldToTraits["xapdate"] = FieldTraits("D");
 }
 
 // Compute the unique term used to link documents to their origin. 
@@ -539,8 +549,8 @@ Db::Db(RclConfig *cfp)
       m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
       m_maxFsOccupPc(0), m_mode(Db::DbRO)
 {
-    if (!fldToPrefs.size())
-	initFldToPrefs();
+    if (!fldToTraits.size())
+	initFldToTraits();
 
     m_ndb = new Native(this);
     if (m_config) {
@@ -791,17 +801,18 @@ bool Db::isopen()
 // reason (old config not updated ?). We use it only if the config
 // translation fails. Also we add in there fields which should be
 // indexed with no prefix (ie: abstract)
-bool Db::fieldToPrefix(const string& fld, string &pfx)
+bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
 {
-    if (m_config && m_config->getFieldPrefix(fld, pfx))
+    if (m_config && m_config->getFieldTraits(fld, ftpp))
 	return true;
 
     // No data in rclconfig? Check default values
-    map<string, string>::const_iterator it = fldToPrefs.find(fld);
-    if (it != fldToPrefs.end()) {
-	pfx = it->second;
+    map<string, FieldTraits>::const_iterator it = fldToTraits.find(fld);
+    if (it != fldToTraits.end()) {
+	*ftpp = &it->second;
 	return true;
     }
+    *ftpp = 0;
     return false;
 }
 
@@ -817,15 +828,18 @@ class TextSplitDb : public TextSplit {
     StopList &stops;
     TextSplitDb(Xapian::WritableDatabase idb, 
 		Xapian::Document &d, StopList &_stops) 
-	: db(idb), doc(d), basepos(1), curpos(0), stops(_stops)
+	: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
     {}
     bool takeword(const std::string &term, int pos, int, int);
     void setprefix(const string& pref) {prefix = pref;}
+    void setwdfinc(int i) {wdfinc = i;}
 
 private:
     // If prefix is set, we also add a posting for the prefixed terms
     // (ie: for titles, add postings for both "term" and "Sterm")
     string  prefix; 
+    // Some fields have more weight
+    int wdfinc;
 };
 
 // Get one term from the doc, remove accents and lowercase, then add posting
@@ -853,17 +867,16 @@ bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
     pos += basepos;
     string ermsg;
     try {
-	// Note: 1 is the within document frequency increment. It would 
-	// be possible to assign different weigths to doc parts (ie title)
-	// by using a higher value
-	doc.add_posting(term, pos, 1);
+	// Index without prefix, using the field-specific weighting
+	doc.add_posting(term, pos, wdfinc);
 #ifdef TESTING_XAPIAN_SPELL
 	if (Db::isSpellingCandidate(term)) {
 	    db.add_spelling(term);
 	}
 #endif
+	// Index the prefixed term.
 	if (!prefix.empty()) {
-	    doc.add_posting(prefix + term, pos, 1);
+	    doc.add_posting(prefix + term, pos, wdfinc);
 	}
 	return true;
     } XCATCHERROR(ermsg);
@@ -984,26 +997,30 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
     //
     // The order has no importance, and we set a position gap of 100
     // between fields to avoid false proximity matches.
-    map<string,string>::iterator meta_it;
-    string pfx;
+    map<string, string>::iterator meta_it;
     for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
 	if (!meta_it->second.empty()) {
-	    if (!fieldToPrefix(meta_it->first, pfx)) {
+	    const FieldTraits *ftp;
+	    // We don't test for an empty prefix here. Some fields are part
+	    // of the internal conf with an empty prefix (ie: abstract).
+	    if (!fieldToTraits(meta_it->first, &ftp)) {
 		LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
 			 meta_it->first.c_str()));
 		continue;
 	    }
-	    LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n", 
-		     meta_it->first.c_str(), pfx.c_str(), 
+	    LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", 
+		     meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
 		     meta_it->second.c_str()));
-	    splitter.setprefix(pfx); // Subject
+	    splitter.setprefix(ftp->pfx); // Subject
+	    splitter.setwdfinc(ftp->wdfinc);
 	    if (!splitter.text_to_words(meta_it->second))
                 LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
                         meta_it->first.c_str()));
-	    splitter.setprefix(string());
 	    splitter.basepos += splitter.curpos + 100;
 	}
     }
+    splitter.setprefix(string());
+    splitter.setwdfinc(1);
 
     if (splitter.curpos < baseTextPosition)
 	splitter.basepos = baseTextPosition;
@@ -1011,7 +1028,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 	splitter.basepos += splitter.curpos + 100;
 
     // Split and index body text
-    LOGDEB2(("Db::add: split body\n"));
+    LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
     if (!splitter.text_to_words(doc.text))
         LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
 
@@ -1560,11 +1577,13 @@ bool Db::termMatch(MatchType typ, const string &lang,
 
     string prefix;
     if (!field.empty()) {
-	(void)fieldToPrefix(field, prefix); 
-        if (prefix.empty()) {
+	const FieldTraits *ftp = 0;
+	if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
             LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n", 
                     field.c_str()));
-        }
+        } else {
+	    prefix = ftp->pfx;
+	}
         if (prefixp)
             *prefixp = prefix;
     }
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index 4586cd2d..046e5092 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -24,6 +24,7 @@
 #include "refcntr.h"
 #include "rcldoc.h"
 #include "stoplist.h"
+#include "rclconfig.h"
 
 #ifndef NO_NAMESPACES
 using std::string;
@@ -130,7 +131,7 @@ class Db {
     /* Return list of configured stop words */
     const StopList& getStopList() const {return m_stops;}
     /* Field name to prefix translation (ie: author -> 'A') */
-    bool fieldToPrefix(const string& fldname, string &pfx);
+    bool fieldToTraits(const string& fldname, const FieldTraits **ftpp);
 
     /* Update-related methods ******************************************/
 
diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
index e9aab438..efa1ae03 100644
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -219,6 +219,7 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
                 LOGERR(("Can't retrieve index min/max dates\n"));
                 //whatever, go on.
             }
+
             if (m_dates.y1 == 0) {
                 m_dates.y1 = minyear;
                 m_dates.m1 = 1;
@@ -572,8 +573,11 @@ void StringToXapianQ::expandTerm(bool nostemexp,
     if (nostemexp && !haswild) {
 	// Neither stemming nor wildcard expansion: just the word
         string pfx;
-        if (!m_field.empty())
-            m_db.fieldToPrefix(m_field, pfx);
+	const FieldTraits *ftp;
+        if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
+	    pfx = ftp->pfx;
+	}
+	    
 	sterm = term;
         m_uterms.push_back(sterm);
 	exp.push_front(pfx+term);
diff --git a/src/sampleconf/fields b/src/sampleconf/fields
index 89c6ab14..d86f39c0 100644
--- a/src/sampleconf/fields
+++ b/src/sampleconf/fields
@@ -1,5 +1,7 @@
-# @(#$Id: fields,v 1.5 2008-10-08 08:27:34 dockes Exp $  (C) 2007 J.F.Dockes
-# Field names configuration. This defines how one may search ie for 
+# (C) 2007-2011 J.F.Dockes
+# License: GPL V2
+#
+# Field names configuration. This defines how one may search ie for:
 #   author:Hemingway
 #
 # Important: 
@@ -14,19 +16,33 @@
 # The choice of field names is rather arbitrary. Use of any of the aliases
 # defined in the following section will yield exactly the same results,
 # (both for indexing and search).
+#
+# Fields can have two relevance boost factors defined, such as in:
+# caption = S ; wdfinc=10
+#  and/or
+# caption = S ; boost = 10
+# The first line would boost the xapian "within document frequency" of
+# caption terms by a factor of 10 at indexing time. The second one (not
+# currently implemented) would automatically boost the weight of a
+# caption-based field query (ie: caption:mytitle or title:mytitle) at query
+# time.
+
 [prefixes]
 
 # Native fields matching omega uses, which we index without an X first
 # letter. Don't change these. Caption is used for 'title' to keep a last
-# remnant of omega compatibility inside the data record. Also D,F,M,Q,T,Y
-caption = S
+# remnant of omega compatibility inside the data record. 
+# Also reserved/hardcoded: D(ate), M(onth), Y(ear), 
+#           F(parentid), Q(uniqueid), T(mime type)
+caption = S ; wdfinc = 10
 author = A
 keywords = K
 
 # Extension examples. These are actually used by default by Recoll, you can
 # add your own to search for fields produced by the filters and not handled
 # by default. 
-# Some values are reserved by recoll: XP (for path elements).
+# Some values are internally reserved by recoll: 
+#   XP (for path elements).
 ext = XE
 filename = XSFN
 recipient = XTO
@@ -65,7 +81,7 @@ filename=
 [aliases]
 abstract = summary dc:summary description xesam:description
 author = creator dc:creator xesam:author xesam:creator from
-caption = title title dc:title subject
+caption = title dc:title subject
 # catg = dc:type contentCategory
 dbytes = size xesam:size
 dmtime = date dc:date dc:datemodified datemodified contentmodified \