From ebbcc115a89d0c9a445cec22a0a96bda68486d78 Mon Sep 17 00:00:00 2001 From: "\"Jean-Francois Dockes ext:(%22)" Date: Fri, 22 Jul 2011 16:43:39 +0200 Subject: [PATCH] Allow setting a weight increase for field terms --- src/common/rclconfig.cpp | 102 +++++++++++++++-------------------- src/common/rclconfig.h | 20 ++++--- src/query/xadump.cpp | 2 +- src/rcldb/rcldb.cpp | 111 +++++++++++++++++++++++---------------- src/rcldb/rcldb.h | 3 +- src/rcldb/searchdata.cpp | 8 ++- src/sampleconf/fields | 28 +++++++--- 7 files changed, 150 insertions(+), 124 deletions(-) diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index ee975168..6d437917 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -561,6 +561,7 @@ bool RclConfig::valueSplitAttributes(const string& whole, string& value, /* There is currently no way to escape a semi-colon */ string::size_type semicol0 = whole.find_first_of(";"); value = whole.substr(0, semicol0); + trimstring(value); string attrstr; if (semicol0 != string::npos && semicol0 < whole.size() - 1) { attrstr = whole.substr(semicol0+1); @@ -602,6 +603,7 @@ void RclConfig::storeMissingHelperDesc(const string &s) // things for speed (theses are used a lot during indexing) bool RclConfig::readFieldsConfig(const string& cnferrloc) { + LOGDEB2(("RclConfig::readFieldsConfig\n")); m_fields = new ConfStack("fields", m_cdirs, true); if (m_fields == 0 || !m_fields->ok()) { m_reason = string("No/bad fields file in: ") + cnferrloc; @@ -615,16 +617,34 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc) for (list::const_iterator it = tps.begin(); it != tps.end();it++) { string val; m_fields->get(*it, val, "prefixes"); - m_fldtopfx[stringtolower(*it)] = val; + ConfSimple attrs; + FieldTraits ft; + if (!valueSplitAttributes(val, ft.pfx, attrs)) { + LOGERR(("readFieldsConfig: bad config line for [%s]: [%s]\n", + it->c_str(), val.c_str())); + return 0; + } + string tval; + if (attrs.get("wdfinc", tval)) + ft.wdfinc = 
atoi(tval.c_str()); + if (attrs.get("boost", tval)) + ft.boost = atof(tval.c_str()); + m_fldtotraits[stringtolower(*it)] = ft; + LOGDEB2(("readFieldsConfig: [%s] -> [%s] %d %.1f\n", + it->c_str(), ft.pfx.c_str(), ft.wdfinc, ft.boost)); } - // Add prefixes for aliases (build alias-to-canonic map while we're at it) + + // Add prefixes for aliases and build alias-to-canonic map while we're at it + // Having the aliases in the prefix map avoids an additional indirection + // at index time. tps = m_fields->getNames("aliases"); for (list::const_iterator it = tps.begin(); it != tps.end();it++) { string canonic = stringtolower(*it); // canonic name - string pfx; - map::const_iterator pit = m_fldtopfx.find(canonic); - if (pit != m_fldtopfx.end()) { - pfx = pit->second; + FieldTraits ft; + map::const_iterator pit = + m_fldtotraits.find(canonic); + if (pit != m_fldtotraits.end()) { + ft = pit->second; } string aliases; m_fields->get(canonic, aliases, "aliases"); @@ -632,16 +652,18 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc) stringToStrings(aliases, l); for (list::const_iterator ait = l.begin(); ait != l.end(); ait++) { - if (!pfx.empty()) - m_fldtopfx[stringtolower(*ait)] = pfx; + if (pit != m_fldtotraits.end()) + m_fldtotraits[stringtolower(*ait)] = ft; m_aliastocanon[stringtolower(*ait)] = canonic; } } + #if 0 - for (map::const_iterator it = m_fldtopfx.begin(); - it != m_fldtopfx.end(); it++) { - LOGDEB(("RclConfig::readFieldsConfig: [%s] => [%s]\n", - it->first.c_str(), it->second.c_str())); + for (map::const_iterator it = m_fldtotraits.begin(); + it != m_fldtotraits.end(); it++) { + LOGDEB(("readFieldsConfig: [%s] -> [%s] %d %.1f\n", + it->c_str(), it->second.pfx.c_str(), it->second.wdfinc, + it->second.boost)); } #endif @@ -666,19 +688,20 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc) return true; } -// Return term indexing prefix for field name (ie: "filename" -> "XSFN") -bool RclConfig::getFieldPrefix(const string& _fld, string &pfx) +// 
Return specifics for field name: +bool RclConfig::getFieldTraits(const string& _fld, const FieldTraits **ftpp) { string fld = fieldCanon(_fld); - map::const_iterator pit = m_fldtopfx.find(fld); - if (pit != m_fldtopfx.end()) { - pfx = pit->second; + map::const_iterator pit = m_fldtotraits.find(fld); + if (pit != m_fldtotraits.end()) { + *ftpp = &pit->second; LOGDEB1(("RclConfig::getFieldPrefix: [%s]->[%s]\n", - _fld.c_str(), pfx.c_str())); + _fld.c_str(), ft.pfx.c_str())); return true; } else { LOGDEB1(("RclConfig::readFieldsConfig: no prefix for field [%s]\n", fld.c_str())); + *ftpp = 0; return false; } } @@ -694,47 +717,6 @@ set RclConfig::getIndexedFields() return flds; } -// Return specialisations of field name for search expansion -// (ie: author->[author, from]) -bool RclConfig::getFieldSpecialisations(const string& fld, - list& children, bool top) -{ - if (m_fields == 0) - return false; - string sclds; - children.push_back(fld); - if (m_fields->get(fld, sclds, "specialisations")) { - list clds; - stringToStrings(sclds, clds); - for (list::const_iterator it = clds.begin(); - it != clds.end(); it++) { - getFieldSpecialisations(*it, children, false); - } - } - if (top) { - children.sort(); - children.unique(); - } - return true; -} - -// -bool RclConfig::getFieldSpecialisationPrefixes(const string& fld, - list& pfxes) -{ - list clds; - getFieldSpecialisations(fld, clds); - for (list::const_iterator it = clds.begin(); - it != clds.end(); it++) { - string pfx; - if (getFieldPrefix(*it, pfx)) - pfxes.push_back(pfx); - } - pfxes.sort(); - pfxes.unique(); - return true; -} - string RclConfig::fieldCanon(const string& f) { string fld = stringtolower(f); @@ -1075,7 +1057,7 @@ void RclConfig::initFrom(const RclConfig& r) mimeview = new ConfStack(*(r.mimeview)); if (r.m_fields) m_fields = new ConfStack(*(r.m_fields)); - m_fldtopfx = r.m_fldtopfx; + m_fldtotraits = r.m_fldtotraits; m_aliastocanon = r.m_aliastocanon; m_storedFields = r.m_storedFields; m_xattrtofld = 
r.m_xattrtofld; diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index 3a8f94d8..4bf0c59c 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -55,6 +55,16 @@ public: bool needrecompute(); }; +// Data associated with an indexed field name: +struct FieldTraits { + string pfx; // indexing prefix, + int wdfinc; // Index time term frequency increment (default 1) + double boost; // Query time boost (default 1.0) + FieldTraits(int i, double f) {wdfinc = i; boost = f;} + FieldTraits() : wdfinc(1), boost(1.0) {} + FieldTraits(const string& s) : pfx(s), wdfinc(1), boost(1.0) {} +}; + class RclConfig { public: @@ -188,13 +198,7 @@ class RclConfig { bool getMimeCatTypes(const string& cat, list&); /** fields: get field prefix from field name */ - bool getFieldPrefix(const string& fldname, string &pfx); - /** Get implied meanings for field name (ie: author->[author, from]) */ - bool getFieldSpecialisations(const string& fld, - list& childrens, bool top = true); - /** Get prefixes for specialisations of field name */ - bool getFieldSpecialisationPrefixes(const string& fld, - list& pfxes); + bool getFieldTraits(const string& fldname, const FieldTraits **); const set& getStoredFields() {return m_storedFields;} set getIndexedFields(); /** Get canonic name for possible alias */ @@ -256,7 +260,7 @@ class RclConfig { ConfStack *mimeconf; // but their content may depend on it. 
ConfStack *mimeview; // ConfStack *m_fields; - map m_fldtopfx; + map m_fldtotraits; // Field to field params map m_aliastocanon; set m_storedFields; map m_xattrtofld; diff --git a/src/query/xadump.cpp b/src/query/xadump.cpp index 26a18c29..9db4be34 100644 --- a/src/query/xadump.cpp +++ b/src/query/xadump.cpp @@ -271,7 +271,7 @@ int main(int argc, char **argv) Xapian::PostingIterator doc; for (doc = db->postlist_begin(aterm); doc != db->postlist_end(aterm); doc++) { - cout << *doc << " : " ; + cout << *doc << "(" << doc.get_wdf() << ") : " ; Xapian::PositionIterator pos; for (pos = doc.positionlist_begin(); pos != doc.positionlist_end(); pos++) { diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 87908353..8c4ebf5a 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -89,33 +89,43 @@ static const string rclSyntAbs("?!#@"); // omega static const string keycap("caption"); -// Default table for field->prefix translation. We prefer the data -// from rclconfig if available. Note that this is logically const -// after initialization. Can't use a static object to init this as -// the static std::string objects may not be ready -static map fldToPrefs; -static void initFldToPrefs() +// Static/Default table for field->prefix/weight translation. +// This is logically const after initialization. Can't use a +// static object to init this as the static std::string objects may +// not be ready. +// +// This map is searched if a match is not found in the dynamic +// "fields" configuration (cf: Db::fieldToTraits()), meaning that the +// entries can be overridden in the configuration, but not +// suppressed. + +static map fldToTraits; +static void initFldToTraits() { - fldToPrefs[Doc::keyabs] = string(); - fldToPrefs["ext"] = "XE"; - fldToPrefs[Doc::keyfn] = "XSFN"; + // Can't remember why "abstract" is indexed without a prefix + // (result: it's indexed twice actually). 
Maybe I'll dare change + // this one day + fldToTraits[Doc::keyabs] = FieldTraits(); - fldToPrefs[keycap] = "S"; - fldToPrefs[Doc::keytt] = "S"; - fldToPrefs["subject"] = "S"; + fldToTraits["ext"] = FieldTraits("XE"); + fldToTraits[Doc::keyfn] = FieldTraits("XSFN"); - fldToPrefs[Doc::keyau] = "A"; - fldToPrefs["creator"] = "A"; - fldToPrefs["from"] = "A"; + fldToTraits[keycap] = FieldTraits("S"); + fldToTraits[Doc::keytt] = FieldTraits("S"); + fldToTraits["subject"] = FieldTraits("S"); - fldToPrefs[Doc::keykw] = "K"; - fldToPrefs["keyword"] = "K"; - fldToPrefs["tag"] = "K"; - fldToPrefs["tags"] = "K"; + fldToTraits[Doc::keyau] = FieldTraits("A"); + fldToTraits["creator"] = FieldTraits("A"); + fldToTraits["from"] = FieldTraits("A"); - fldToPrefs["xapyear"] = "Y"; - fldToPrefs["xapyearmon"] = "M"; - fldToPrefs["xapdate"] = "D"; + fldToTraits[Doc::keykw] = FieldTraits("K"); + fldToTraits["keyword"] = FieldTraits("K"); + fldToTraits["tag"] = FieldTraits("K"); + fldToTraits["tags"] = FieldTraits("K"); + + fldToTraits["xapyear"] = FieldTraits("Y"); + fldToTraits["xapyearmon"] = FieldTraits("M"); + fldToTraits["xapdate"] = FieldTraits("D"); } // Compute the unique term used to link documents to their origin. @@ -539,8 +549,8 @@ Db::Db(RclConfig *cfp) m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1), m_maxFsOccupPc(0), m_mode(Db::DbRO) { - if (!fldToPrefs.size()) - initFldToPrefs(); + if (!fldToTraits.size()) + initFldToTraits(); m_ndb = new Native(this); if (m_config) { @@ -791,17 +801,18 @@ bool Db::isopen() // reason (old config not updated ?). We use it only if the config // translation fails. Also we add in there fields which should be // indexed with no prefix (ie: abstract) -bool Db::fieldToPrefix(const string& fld, string &pfx) +bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp) { - if (m_config && m_config->getFieldPrefix(fld, pfx)) + if (m_config && m_config->getFieldTraits(fld, ftpp)) return true; // No data in rclconfig? 
Check default values - map::const_iterator it = fldToPrefs.find(fld); - if (it != fldToPrefs.end()) { - pfx = it->second; + map::const_iterator it = fldToTraits.find(fld); + if (it != fldToTraits.end()) { + *ftpp = &it->second; return true; } + *ftpp = 0; return false; } @@ -817,15 +828,18 @@ class TextSplitDb : public TextSplit { StopList &stops; TextSplitDb(Xapian::WritableDatabase idb, Xapian::Document &d, StopList &_stops) - : db(idb), doc(d), basepos(1), curpos(0), stops(_stops) + : db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1) {} bool takeword(const std::string &term, int pos, int, int); void setprefix(const string& pref) {prefix = pref;} + void setwdfinc(int i) {wdfinc = i;} private: // If prefix is set, we also add a posting for the prefixed terms // (ie: for titles, add postings for both "term" and "Sterm") string prefix; + // Some fields have more weight + int wdfinc; }; // Get one term from the doc, remove accents and lowercase, then add posting @@ -853,17 +867,16 @@ bool TextSplitDb::takeword(const std::string &_term, int pos, int, int) pos += basepos; string ermsg; try { - // Note: 1 is the within document frequency increment. It would - // be possible to assign different weigths to doc parts (ie title) - // by using a higher value - doc.add_posting(term, pos, 1); + // Index without prefix, using the field-specific weighting + doc.add_posting(term, pos, wdfinc); #ifdef TESTING_XAPIAN_SPELL if (Db::isSpellingCandidate(term)) { db.add_spelling(term); } #endif + // Index the prefixed term. if (!prefix.empty()) { - doc.add_posting(prefix + term, pos, 1); + doc.add_posting(prefix + term, pos, wdfinc); } return true; } XCATCHERROR(ermsg); @@ -984,26 +997,30 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, // // The order has no importance, and we set a position gap of 100 // between fields to avoid false proximity matches. 
- map::iterator meta_it; - string pfx; + map::iterator meta_it; for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) { if (!meta_it->second.empty()) { - if (!fieldToPrefix(meta_it->first, pfx)) { + const FieldTraits *ftp; + // We don't test for an empty prefix here. Some fields are part + // of the internal conf with an empty prefix (ie: abstract). + if (!fieldToTraits(meta_it->first, &ftp)) { LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n", meta_it->first.c_str())); continue; } - LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n", - meta_it->first.c_str(), pfx.c_str(), + LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", + meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc, meta_it->second.c_str())); - splitter.setprefix(pfx); // Subject + splitter.setprefix(ftp->pfx); // Subject + splitter.setwdfinc(ftp->wdfinc); if (!splitter.text_to_words(meta_it->second)) LOGDEB(("Db::addOrUpdate: split failed for %s\n", meta_it->first.c_str())); - splitter.setprefix(string()); splitter.basepos += splitter.curpos + 100; } } + splitter.setprefix(string()); + splitter.setwdfinc(1); if (splitter.curpos < baseTextPosition) splitter.basepos = baseTextPosition; @@ -1011,7 +1028,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, splitter.basepos += splitter.curpos + 100; // Split and index body text - LOGDEB2(("Db::add: split body\n")); + LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str())); if (!splitter.text_to_words(doc.text)) LOGDEB(("Db::addOrUpdate: split failed for main text\n")); @@ -1560,11 +1577,13 @@ bool Db::termMatch(MatchType typ, const string &lang, string prefix; if (!field.empty()) { - (void)fieldToPrefix(field, prefix); - if (prefix.empty()) { + const FieldTraits *ftp = 0; + if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) { LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n", field.c_str())); - } + } else { + prefix = ftp->pfx; + } if (prefixp) *prefixp = prefix; } diff --git 
a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 4586cd2d..046e5092 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -24,6 +24,7 @@ #include "refcntr.h" #include "rcldoc.h" #include "stoplist.h" +#include "rclconfig.h" #ifndef NO_NAMESPACES using std::string; @@ -130,7 +131,7 @@ class Db { /* Return list of configured stop words */ const StopList& getStopList() const {return m_stops;} /* Field name to prefix translation (ie: author -> 'A') */ - bool fieldToPrefix(const string& fldname, string &pfx); + bool fieldToTraits(const string& fldname, const FieldTraits **ftpp); /* Update-related methods ******************************************/ diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index e9aab438..efa1ae03 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -219,6 +219,7 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d) LOGERR(("Can't retrieve index min/max dates\n")); //whatever, go on. } + if (m_dates.y1 == 0) { m_dates.y1 = minyear; m_dates.m1 = 1; @@ -572,8 +573,11 @@ void StringToXapianQ::expandTerm(bool nostemexp, if (nostemexp && !haswild) { // Neither stemming nor wildcard expansion: just the word string pfx; - if (!m_field.empty()) - m_db.fieldToPrefix(m_field, pfx); + const FieldTraits *ftp; + if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) { + pfx = ftp->pfx; + } + sterm = term; m_uterms.push_back(sterm); exp.push_front(pfx+term); diff --git a/src/sampleconf/fields b/src/sampleconf/fields index 89c6ab14..d86f39c0 100644 --- a/src/sampleconf/fields +++ b/src/sampleconf/fields @@ -1,5 +1,7 @@ -# @(#$Id: fields,v 1.5 2008-10-08 08:27:34 dockes Exp $ (C) 2007 J.F.Dockes -# Field names configuration. This defines how one may search ie for +# (C) 2007-2011 J.F.Dockes +# License: GPL V2 +# +# Field names configuration. This defines how one may search ie for: # author:Hemingway # # Important: @@ -14,19 +16,33 @@ # The choice of field names is rather arbitrary. 
Use of any of the aliases # defined in the following section will yield exactly the same results, # (both for indexing and search). +# +# Fields can have two relevance boost factors defined, such as in: +# caption = S ; wdfinc=10 +# and/or +# caption = S ; boost = 10 +# The first line would boost the xapian "within document frequency" of +# caption terms by a factor of 10 at indexing time. The second one (not +# currently implemented) would automatically boost the weight of a +# caption-based field query (ie: caption:mytitle or title:mytitle) at query +# time. + [prefixes] # Native fields matching omega uses, which we index without an X first # letter. Don't change these. Caption is used for 'title' to keep a last -# remnant of omega compatibility inside the data record. Also D,F,M,Q,T,Y -caption = S +# remnant of omega compatibility inside the data record. +# Also reserved/hardcoded: D(ate), M(onth), Y(ear), +# F(parentid), Q(uniqueid), T(mime type) +caption = S ; wdfinc = 10 author = A keywords = K # Extension examples. These are actually used by default by Recoll, you can # add your own to search for fields produced by the filters and not handled # by default. -# Some values are reserved by recoll: XP (for path elements). +# Some values are internally reserved by recoll: +# XP (for path elements). ext = XE filename = XSFN recipient = XTO @@ -65,7 +81,7 @@ filename= [aliases] abstract = summary dc:summary description xesam:description author = creator dc:creator xesam:author xesam:creator from -caption = title title dc:title subject +caption = title dc:title subject # catg = dc:type contentCategory dbytes = size xesam:size dmtime = date dc:date dc:datemodified datemodified contentmodified \