Allow setting a weight increase for field terms

2011-07-22 16:43:39 +02:00 · 2011-07-22 16:43:39 +02:00 · ebbcc115a8
commit ebbcc115a8
parent 48e86c99b5
7 changed files with 150 additions and 124 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -561,6 +561,7 @@ bool RclConfig::valueSplitAttributes(const string& whole, string& value,
    /* There is currently no way to escape a semi-colon */
    string::size_type semicol0 = whole.find_first_of(";");
    value = whole.substr(0, semicol0);
    trimstring(value);
    string attrstr;
    if (semicol0 != string::npos && semicol0 < whole.size() - 1) {
        attrstr = whole.substr(semicol0+1);
@ -602,6 +603,7 @@ void RclConfig::storeMissingHelperDesc(const string &s)
 // things for speed (these are used a lot during indexing)
 bool RclConfig::readFieldsConfig(const string& cnferrloc)
 {
    LOGDEB2(("RclConfig::readFieldsConfig\n"));
    m_fields = new ConfStack<ConfSimple>("fields", m_cdirs, true);
    if (m_fields == 0 || !m_fields->ok()) {
 	m_reason = string("No/bad fields file in: ") + cnferrloc;
@ -615,16 +617,34 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
    for (list<string>::const_iterator it = tps.begin(); it != tps.end();it++) {
 	string val;
 	m_fields->get(*it, val, "prefixes");
-	m_fldtopfx[stringtolower(*it)] = val;
+	ConfSimple attrs;
 	FieldTraits ft;
 	if (!valueSplitAttributes(val, ft.pfx, attrs)) {
 	    LOGERR(("readFieldsConfig: bad config line for [%s]: [%s]\n", 
 		    it->c_str(), val.c_str()));
 	    return 0;
 	}
 	string tval;
 	if (attrs.get("wdfinc", tval))
 	    ft.wdfinc = atoi(tval.c_str());
 	if (attrs.get("boost", tval))
 	    ft.boost = atof(tval.c_str());
 	m_fldtotraits[stringtolower(*it)] = ft;
 	LOGDEB2(("readFieldsConfig: [%s] -> [%s] %d %.1f\n", 
 		it->c_str(), ft.pfx.c_str(), ft.wdfinc, ft.boost));
    }
-    // Add prefixes for aliases (build alias-to-canonic map while we're at it)
+
    // Add prefixes for aliases and build alias-to-canonic map while we're at it
    // Having the aliases in the prefix map avoids an additional indirection
    // at index time.
    tps = m_fields->getNames("aliases");
    for (list<string>::const_iterator it = tps.begin(); it != tps.end();it++) {
 	string canonic = stringtolower(*it); // canonic name
-	string pfx;
+	FieldTraits ft;
-	map<string,string>::const_iterator pit = m_fldtopfx.find(canonic);
+	map<string, FieldTraits>::const_iterator pit = 
-	if (pit != m_fldtopfx.end()) {
+	    m_fldtotraits.find(canonic);
-	    pfx = pit->second;
+	if (pit != m_fldtotraits.end()) {
 	    ft = pit->second;
 	}
 	string aliases;
 	m_fields->get(canonic, aliases, "aliases");
@ -632,16 +652,18 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
 	stringToStrings(aliases, l);
 	for (list<string>::const_iterator ait = l.begin();
 	     ait != l.end(); ait++) {
-	    if (!pfx.empty())
+	    if (pit != m_fldtotraits.end())
-		m_fldtopfx[stringtolower(*ait)] = pfx;
+		m_fldtotraits[stringtolower(*ait)] = ft;
 	    m_aliastocanon[stringtolower(*ait)] = canonic;
 	}
    }
 #if 0
-    for (map<string,string>::const_iterator it = m_fldtopfx.begin();
+    for (map<string, FieldTraits>::const_iterator it = m_fldtotraits.begin();
-	 it != m_fldtopfx.end(); it++) {
+	 it != m_fldtotraits.end(); it++) {
-	LOGDEB(("RclConfig::readFieldsConfig: [%s] => [%s]\n",
+	LOGDEB(("readFieldsConfig: [%s] -> [%s] %d %.1f\n", 
-		it->first.c_str(), it->second.c_str()));
+		it->c_str(), it->second.pfx.c_str(), it->second.wdfinc, 
 		it->second.boost));
    }
 #endif
@ -666,19 +688,20 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
    return true;
 }
-// Return term indexing prefix for field name (ie: "filename" -> "XSFN")
+// Return specifics for field name:
-bool RclConfig::getFieldPrefix(const string& _fld, string &pfx)
+bool RclConfig::getFieldTraits(const string& _fld, const FieldTraits **ftpp)
 {
    string fld = fieldCanon(_fld);
-    map<string,string>::const_iterator pit = m_fldtopfx.find(fld);
+    map<string, FieldTraits>::const_iterator pit = m_fldtotraits.find(fld);
-    if (pit != m_fldtopfx.end()) {
+    if (pit != m_fldtotraits.end()) {
-	pfx = pit->second;
+	*ftpp = &pit->second;
 	LOGDEB1(("RclConfig::getFieldPrefix: [%s]->[%s]\n", 
-		 _fld.c_str(), pfx.c_str()));
+		 _fld.c_str(), ft.pfx.c_str()));
 	return true;
    } else {
 	LOGDEB1(("RclConfig::readFieldsConfig: no prefix for field [%s]\n",
 		 fld.c_str()));
 	*ftpp = 0;
 	return false;
    }
 }
@ -694,47 +717,6 @@ set<string> RclConfig::getIndexedFields()
    return flds;
 }
 // Return specialisations of field name for search expansion 
 // (ie: author->[author, from])
 bool RclConfig::getFieldSpecialisations(const string& fld, 
 					list<string>& children, bool top)
 {
    if (m_fields == 0)
        return false;
    string sclds;
    children.push_back(fld);
    if (m_fields->get(fld, sclds, "specialisations")) {
 	list<string> clds;
 	stringToStrings(sclds, clds);
 	for (list<string>::const_iterator it = clds.begin();
 	     it != clds.end(); it++) {
 	    getFieldSpecialisations(*it, children, false);
 	}
    }
    if (top) {
 	children.sort();
 	children.unique();
    }
    return true;
 }
 // 
 bool RclConfig::getFieldSpecialisationPrefixes(const string& fld, 
 					       list<string>& pfxes)
 {
    list<string> clds;
    getFieldSpecialisations(fld, clds);
    for (list<string>::const_iterator it = clds.begin();
 	 it != clds.end(); it++) {
 	string pfx;
 	if (getFieldPrefix(*it, pfx))
 	    pfxes.push_back(pfx);
    }
    pfxes.sort();
    pfxes.unique();
    return true;
 }
 string RclConfig::fieldCanon(const string& f)
 {
    string fld = stringtolower(f);
@ -1075,7 +1057,7 @@ void RclConfig::initFrom(const RclConfig& r)
 	mimeview = new ConfStack<ConfSimple>(*(r.mimeview));
    if (r.m_fields)
 	m_fields = new ConfStack<ConfSimple>(*(r.m_fields));
-    m_fldtopfx = r.m_fldtopfx;
+    m_fldtotraits = r.m_fldtotraits;
    m_aliastocanon = r.m_aliastocanon;
    m_storedFields = r.m_storedFields;
    m_xattrtofld = r.m_xattrtofld;
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@ -55,6 +55,16 @@ public:
    bool needrecompute();
 };
 // Parameters attached to an indexed field name.
 struct FieldTraits {
     // Indexing prefix for the field's terms (may be empty)
     string pfx;
     // Index time term frequency increment (default 1)
     int    wdfinc;
     // Query time boost (default 1.0)
     double boost;
     // Default traits: empty prefix, increment of 1, boost of 1.0
     FieldTraits() : pfx(), wdfinc(1), boost(1.0) {}
     // Traits for the given prefix, with default increment and boost
     FieldTraits(const string& prefix) : pfx(prefix), wdfinc(1), boost(1.0) {}
     // Traits with explicit increment and boost; the prefix stays empty
     FieldTraits(int inc, double bst) : pfx(), wdfinc(inc), boost(bst) {}
 };
 class RclConfig {
 public:
@ -188,13 +198,7 @@ class RclConfig {
    bool getMimeCatTypes(const string& cat, list<string>&);
    /** fields: get field traits (prefix, wdfinc, boost) from field name */
-    bool getFieldPrefix(const string& fldname, string &pfx);
+    bool getFieldTraits(const string& fldname, const FieldTraits **);
    /** Get implied meanings for field name (ie: author->[author, from]) */
    bool getFieldSpecialisations(const string& fld, 
 				 list<string>& childrens, bool top = true);
    /** Get prefixes for specialisations of field name */
    bool getFieldSpecialisationPrefixes(const string& fld, 
 					list<string>& pfxes);
    const set<string>& getStoredFields() {return m_storedFields;}
    set<string> getIndexedFields();
    /** Get canonic name for possible alias */
@ -256,7 +260,7 @@ class RclConfig {
    ConfStack<ConfSimple> *mimeconf; // but their content may depend on it.
    ConfStack<ConfSimple> *mimeview; // 
    ConfStack<ConfSimple> *m_fields;
-    map<string, string>  m_fldtopfx;
+    map<string, FieldTraits>  m_fldtotraits; // Field to field params
    map<string, string>  m_aliastocanon;
    set<string>          m_storedFields;
    map<string, string>  m_xattrtofld;
--- a/src/query/xadump.cpp
+++ b/src/query/xadump.cpp
@ -271,7 +271,7 @@ int main(int argc, char **argv)
 	    Xapian::PostingIterator doc;
 	    for (doc = db->postlist_begin(aterm);
 		 doc != db->postlist_end(aterm); doc++) {
-		cout << *doc << " : " ;
+		cout << *doc << "(" << doc.get_wdf() << ") : " ;
 		Xapian::PositionIterator pos;
 		for (pos = doc.positionlist_begin(); 
 		     pos != doc.positionlist_end(); pos++) {
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -89,33 +89,43 @@ static const string rclSyntAbs("?!#@");
 // omega
 static const string keycap("caption");
-// Default table for field->prefix translation.  We prefer the data
+// Static/Default table for field->prefix/weight translation. 
-// from rclconfig if available. Note that this is logically const
+// This is logically const after initialization. Can't use a
-// after initialization.  Can't use a static object to init this as
+// static object to init this as the static std::string objects may
-// the static std::string objects may not be ready
+// not be ready.
-static map<string, string> fldToPrefs;
+//
-static void initFldToPrefs() 
+// This map is searched if a match is not found in the dynamic
 // "fields" configuration (cf: Db::fieldToTraits()), meaning that the
 // entries can be overridden in the configuration, but not
 // suppressed. 
 static map<string, FieldTraits> fldToTraits;
 static void initFldToTraits() 
 {
-    fldToPrefs[Doc::keyabs] = string();
+    // Can't remember why "abstract" is indexed without a prefix
-    fldToPrefs["ext"] = "XE";
+    // (result: it's indexed twice actually). Maybe I'll dare change
-    fldToPrefs[Doc::keyfn] = "XSFN";
+    // this one day
    fldToTraits[Doc::keyabs] = FieldTraits();
-    fldToPrefs[keycap] = "S";
+    fldToTraits["ext"] = FieldTraits("XE");
-    fldToPrefs[Doc::keytt] = "S";
+    fldToTraits[Doc::keyfn] = FieldTraits("XSFN");
    fldToPrefs["subject"] = "S";
-    fldToPrefs[Doc::keyau] = "A";
+    fldToTraits[keycap] = FieldTraits("S");
-    fldToPrefs["creator"] = "A";
+    fldToTraits[Doc::keytt] = FieldTraits("S");
-    fldToPrefs["from"] = "A";
+    fldToTraits["subject"] = FieldTraits("S");
-    fldToPrefs[Doc::keykw] = "K";
+    fldToTraits[Doc::keyau] = FieldTraits("A");
-    fldToPrefs["keyword"] = "K";
+    fldToTraits["creator"] = FieldTraits("A");
-    fldToPrefs["tag"] = "K";
+    fldToTraits["from"] = FieldTraits("A");
    fldToPrefs["tags"] = "K";
-    fldToPrefs["xapyear"] = "Y";
+    fldToTraits[Doc::keykw] = FieldTraits("K");
-    fldToPrefs["xapyearmon"] = "M";
+    fldToTraits["keyword"] = FieldTraits("K");
-    fldToPrefs["xapdate"] = "D";
+    fldToTraits["tag"] = FieldTraits("K");
    fldToTraits["tags"] = FieldTraits("K");
    fldToTraits["xapyear"] = FieldTraits("Y");
    fldToTraits["xapyearmon"] = FieldTraits("M");
    fldToTraits["xapdate"] = FieldTraits("D");
 }
 // Compute the unique term used to link documents to their origin. 
@ -539,8 +549,8 @@ Db::Db(RclConfig *cfp)
      m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
      m_maxFsOccupPc(0), m_mode(Db::DbRO)
 {
-    if (!fldToPrefs.size())
+    if (!fldToTraits.size())
-	initFldToPrefs();
+	initFldToTraits();
    m_ndb = new Native(this);
    if (m_config) {
@ -791,17 +801,18 @@ bool Db::isopen()
 // reason (old config not updated ?). We use it only if the config
 // translation fails. Also we add in there fields which should be
 // indexed with no prefix (ie: abstract)
-bool Db::fieldToPrefix(const string& fld, string &pfx)
+bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
 {
-    if (m_config && m_config->getFieldPrefix(fld, pfx))
+    if (m_config && m_config->getFieldTraits(fld, ftpp))
 	return true;
    // No data in rclconfig? Check default values
-    map<string, string>::const_iterator it = fldToPrefs.find(fld);
+    map<string, FieldTraits>::const_iterator it = fldToTraits.find(fld);
-    if (it != fldToPrefs.end()) {
+    if (it != fldToTraits.end()) {
-	pfx = it->second;
+	*ftpp = &it->second;
 	return true;
    }
    *ftpp = 0;
    return false;
 }
@ -817,15 +828,18 @@ class TextSplitDb : public TextSplit {
    StopList &stops;
    TextSplitDb(Xapian::WritableDatabase idb, 
 		Xapian::Document &d, StopList &_stops) 
-	: db(idb), doc(d), basepos(1), curpos(0), stops(_stops)
+	: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
    {}
    bool takeword(const std::string &term, int pos, int, int);
    void setprefix(const string& pref) {prefix = pref;}
    void setwdfinc(int i) {wdfinc = i;}
 private:
    // If prefix is set, we also add a posting for the prefixed terms
    // (ie: for titles, add postings for both "term" and "Sterm")
    string  prefix; 
    // Some fields have more weight
    int wdfinc;
 };
 // Get one term from the doc, remove accents and lowercase, then add posting
@ -853,17 +867,16 @@ bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
    pos += basepos;
    string ermsg;
    try {
-	// Note: 1 is the within document frequency increment. It would 
+	// Index without prefix, using the field-specific weighting
-	// be possible to assign different weigths to doc parts (ie title)
+	doc.add_posting(term, pos, wdfinc);
 	// by using a higher value
 	doc.add_posting(term, pos, 1);
 #ifdef TESTING_XAPIAN_SPELL
 	if (Db::isSpellingCandidate(term)) {
 	    db.add_spelling(term);
 	}
 #endif
 	// Index the prefixed term.
 	if (!prefix.empty()) {
-	    doc.add_posting(prefix + term, pos, 1);
+	    doc.add_posting(prefix + term, pos, wdfinc);
 	}
 	return true;
    } XCATCHERROR(ermsg);
@ -984,26 +997,30 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    //
    // The order has no importance, and we set a position gap of 100
    // between fields to avoid false proximity matches.
-    map<string,string>::iterator meta_it;
+    map<string, string>::iterator meta_it;
    string pfx;
    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
 	if (!meta_it->second.empty()) {
-	    if (!fieldToPrefix(meta_it->first, pfx)) {
+	    const FieldTraits *ftp;
 	    // We don't test for an empty prefix here. Some fields are part
 	    // of the internal conf with an empty prefix (ie: abstract).
 	    if (!fieldToTraits(meta_it->first, &ftp)) {
 		LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
 			 meta_it->first.c_str()));
 		continue;
 	    }
-	    LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n", 
+	    LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", 
-		     meta_it->first.c_str(), pfx.c_str(), 
+		     meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
 		     meta_it->second.c_str()));
-	    splitter.setprefix(pfx); // Subject
+	    splitter.setprefix(ftp->pfx); // Subject
 	    splitter.setwdfinc(ftp->wdfinc);
 	    if (!splitter.text_to_words(meta_it->second))
                LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
                        meta_it->first.c_str()));
 	    splitter.setprefix(string());
 	    splitter.basepos += splitter.curpos + 100;
 	}
    }
    splitter.setprefix(string());
    splitter.setwdfinc(1);
    if (splitter.curpos < baseTextPosition)
 	splitter.basepos = baseTextPosition;
@ -1011,7 +1028,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 	splitter.basepos += splitter.curpos + 100;
    // Split and index body text
-    LOGDEB2(("Db::add: split body\n"));
+    LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
    if (!splitter.text_to_words(doc.text))
        LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
@ -1560,11 +1577,13 @@ bool Db::termMatch(MatchType typ, const string &lang,
    string prefix;
    if (!field.empty()) {
-	(void)fieldToPrefix(field, prefix); 
+	const FieldTraits *ftp = 0;
-        if (prefix.empty()) {
+	if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
            LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n", 
                    field.c_str()));
-        }
+        } else {
 	    prefix = ftp->pfx;
 	}
        if (prefixp)
            *prefixp = prefix;
    }
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -24,6 +24,7 @@
 #include "refcntr.h"
 #include "rcldoc.h"
 #include "stoplist.h"
 #include "rclconfig.h"
 #ifndef NO_NAMESPACES
 using std::string;
@ -130,7 +131,7 @@ class Db {
    /* Return list of configured stop words */
    const StopList& getStopList() const {return m_stops;}
    /* Field name to traits translation (ie: author -> prefix 'A', wdfinc, boost) */
-    bool fieldToPrefix(const string& fldname, string &pfx);
+    bool fieldToTraits(const string& fldname, const FieldTraits **ftpp);
    /* Update-related methods ******************************************/
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -219,6 +219,7 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
                LOGERR(("Can't retrieve index min/max dates\n"));
                //whatever, go on.
            }
            if (m_dates.y1 == 0) {
                m_dates.y1 = minyear;
                m_dates.m1 = 1;
@ -572,8 +573,11 @@ void StringToXapianQ::expandTerm(bool nostemexp,
    if (nostemexp && !haswild) {
 	// Neither stemming nor wildcard expansion: just the word
        string pfx;
-        if (!m_field.empty())
+	const FieldTraits *ftp;
-            m_db.fieldToPrefix(m_field, pfx);
+        if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
 	    pfx = ftp->pfx;
 	}
 	sterm = term;
        m_uterms.push_back(sterm);
 	exp.push_front(pfx+term);
--- a/src/sampleconf/fields
+++ b/src/sampleconf/fields
@ -1,5 +1,7 @@
-# @(#$Id: fields,v 1.5 2008-10-08 08:27:34 dockes Exp $  (C) 2007 J.F.Dockes
+# (C) 2007-2011 J.F.Dockes
-# Field names configuration. This defines how one may search ie for 
+# License: GPL V2
 #
 # Field names configuration. This defines how one may search ie for:
 #   author:Hemingway
 #
 # Important: 
@ -14,19 +16,33 @@
 # The choice of field names is rather arbitrary. Use of any of the aliases
 # defined in the following section will yield exactly the same results,
 # (both for indexing and search).
 #
 # Fields can have two relevance boost factors defined, such as in:
 # caption = S ; wdfinc=10
 #  and/or
 # caption = S ; boost = 10
 # The first line would boost the xapian "within document frequency" of
 # caption terms by a factor of 10 at indexing time. The second one (not
 # currently implemented) would automatically boost the weight of a
 # caption-based field query (ie: caption:mytitle or title:mytitle) at query
 # time.
 [prefixes]
 # Native fields matching omega uses, which we index without an X first
 # letter. Don't change these. Caption is used for 'title' to keep a last
-# remnant of omega compatibility inside the data record. Also D,F,M,Q,T,Y
+# remnant of omega compatibility inside the data record. 
-caption = S
+# Also reserved/hardcoded: D(ate), M(onth), Y(ear), 
 #           F(parentid), Q(uniqueid), T(mime type)
 caption = S ; wdfinc = 10
 author = A
 keywords = K
 # Extension examples. These are actually used by default by Recoll, you can
 # add your own to search for fields produced by the filters and not handled
 # by default. 
-# Some values are reserved by recoll: XP (for path elements).
+# Some values are internally reserved by recoll: 
 #   XP (for path elements).
 ext = XE
 filename = XSFN
 recipient = XTO
@ -65,7 +81,7 @@ filename=
 [aliases]
 abstract = summary dc:summary description xesam:description
 author = creator dc:creator xesam:author xesam:creator from
-caption = title title dc:title subject
+caption = title dc:title subject
 # catg = dc:type contentCategory
 dbytes = size xesam:size
 dmtime = date dc:date dc:datemodified datemodified contentmodified \