Allow setting a weight increase for field terms

This commit is contained in:
"Jean-Francois Dockes ext:(%22) 2011-07-22 16:43:39 +02:00
parent 48e86c99b5
commit ebbcc115a8
7 changed files with 150 additions and 124 deletions

View File

@ -561,6 +561,7 @@ bool RclConfig::valueSplitAttributes(const string& whole, string& value,
/* There is currently no way to escape a semi-colon */
string::size_type semicol0 = whole.find_first_of(";");
value = whole.substr(0, semicol0);
trimstring(value);
string attrstr;
if (semicol0 != string::npos && semicol0 < whole.size() - 1) {
attrstr = whole.substr(semicol0+1);
@ -602,6 +603,7 @@ void RclConfig::storeMissingHelperDesc(const string &s)
// things for speed (these are used a lot during indexing)
bool RclConfig::readFieldsConfig(const string& cnferrloc)
{
LOGDEB2(("RclConfig::readFieldsConfig\n"));
m_fields = new ConfStack<ConfSimple>("fields", m_cdirs, true);
if (m_fields == 0 || !m_fields->ok()) {
m_reason = string("No/bad fields file in: ") + cnferrloc;
@ -615,16 +617,34 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
for (list<string>::const_iterator it = tps.begin(); it != tps.end();it++) {
string val;
m_fields->get(*it, val, "prefixes");
m_fldtopfx[stringtolower(*it)] = val;
ConfSimple attrs;
FieldTraits ft;
if (!valueSplitAttributes(val, ft.pfx, attrs)) {
LOGERR(("readFieldsConfig: bad config line for [%s]: [%s]\n",
it->c_str(), val.c_str()));
return 0;
}
string tval;
if (attrs.get("wdfinc", tval))
ft.wdfinc = atoi(tval.c_str());
if (attrs.get("boost", tval))
ft.boost = atof(tval.c_str());
m_fldtotraits[stringtolower(*it)] = ft;
LOGDEB2(("readFieldsConfig: [%s] -> [%s] %d %.1f\n",
it->c_str(), ft.pfx.c_str(), ft.wdfinc, ft.boost));
}
// Add prefixes for aliases (build alias-to-canonic map while we're at it)
// Add prefixes for aliases and build the alias-to-canonic map while we're at it.
// Having the aliases in the prefix map avoids an additional indirection
// at index time.
tps = m_fields->getNames("aliases");
for (list<string>::const_iterator it = tps.begin(); it != tps.end();it++) {
string canonic = stringtolower(*it); // canonic name
string pfx;
map<string,string>::const_iterator pit = m_fldtopfx.find(canonic);
if (pit != m_fldtopfx.end()) {
pfx = pit->second;
FieldTraits ft;
map<string, FieldTraits>::const_iterator pit =
m_fldtotraits.find(canonic);
if (pit != m_fldtotraits.end()) {
ft = pit->second;
}
string aliases;
m_fields->get(canonic, aliases, "aliases");
@ -632,16 +652,18 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
stringToStrings(aliases, l);
for (list<string>::const_iterator ait = l.begin();
ait != l.end(); ait++) {
if (!pfx.empty())
m_fldtopfx[stringtolower(*ait)] = pfx;
if (pit != m_fldtotraits.end())
m_fldtotraits[stringtolower(*ait)] = ft;
m_aliastocanon[stringtolower(*ait)] = canonic;
}
}
#if 0
for (map<string,string>::const_iterator it = m_fldtopfx.begin();
it != m_fldtopfx.end(); it++) {
LOGDEB(("RclConfig::readFieldsConfig: [%s] => [%s]\n",
it->first.c_str(), it->second.c_str()));
for (map<string, FieldTraits>::const_iterator it = m_fldtotraits.begin();
it != m_fldtotraits.end(); it++) {
LOGDEB(("readFieldsConfig: [%s] -> [%s] %d %.1f\n",
it->c_str(), it->second.pfx.c_str(), it->second.wdfinc,
it->second.boost));
}
#endif
@ -666,19 +688,20 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
return true;
}
// Return term indexing prefix for field name (ie: "filename" -> "XSFN")
bool RclConfig::getFieldPrefix(const string& _fld, string &pfx)
// Return specifics for field name:
bool RclConfig::getFieldTraits(const string& _fld, const FieldTraits **ftpp)
{
string fld = fieldCanon(_fld);
map<string,string>::const_iterator pit = m_fldtopfx.find(fld);
if (pit != m_fldtopfx.end()) {
pfx = pit->second;
map<string, FieldTraits>::const_iterator pit = m_fldtotraits.find(fld);
if (pit != m_fldtotraits.end()) {
*ftpp = &pit->second;
LOGDEB1(("RclConfig::getFieldPrefix: [%s]->[%s]\n",
_fld.c_str(), pfx.c_str()));
_fld.c_str(), ft.pfx.c_str()));
return true;
} else {
LOGDEB1(("RclConfig::readFieldsConfig: no prefix for field [%s]\n",
fld.c_str()));
*ftpp = 0;
return false;
}
}
@ -694,47 +717,6 @@ set<string> RclConfig::getIndexedFields()
return flds;
}
// Return specialisations of field name for search expansion
// (ie: author->[author, from])
//
// Appends fld itself, then recursively every name listed for it under
// "specialisations" in the fields configuration, to children.
// @param fld      field name to expand
// @param children output list; entries are appended, it is not cleared first
// @param top      true for the outermost call only: triggers the final
//                 sort/dedup of children
// @return false (children untouched) if no fields configuration is loaded,
//         true otherwise
// NOTE(review): nothing visible here guards against cyclic specialisation
// definitions in the configuration, which would recurse without end.
bool RclConfig::getFieldSpecialisations(const string& fld,
list<string>& children, bool top)
{
if (m_fields == 0)
return false;
string sclds;
// The field is always part of its own expansion
children.push_back(fld);
if (m_fields->get(fld, sclds, "specialisations")) {
list<string> clds;
// Split the configured value into individual child field names
stringToStrings(sclds, clds);
for (list<string>::const_iterator it = clds.begin();
it != clds.end(); it++) {
// Nested calls pass top=false so that sorting happens only once
getFieldSpecialisations(*it, children, false);
}
}
if (top) {
// unique() only removes adjacent duplicates, hence the sort first
children.sort();
children.unique();
}
return true;
}
// Collect the indexing prefixes of a field and of all its specialisations.
// @param fld   field name to expand
// @param pfxes output list; found prefixes are appended, then the whole
//              list (including any entries already present) is sorted and
//              deduplicated
// @return always true; fields for which getFieldPrefix() fails are skipped
bool RclConfig::getFieldSpecialisationPrefixes(const string& fld,
list<string>& pfxes)
{
list<string> clds;
// Return value ignored: on failure clds stays empty and nothing is added
getFieldSpecialisations(fld, clds);
for (list<string>::const_iterator it = clds.begin();
it != clds.end(); it++) {
string pfx;
if (getFieldPrefix(*it, pfx))
pfxes.push_back(pfx);
}
// unique() only removes adjacent duplicates, hence the sort first
pfxes.sort();
pfxes.unique();
return true;
}
string RclConfig::fieldCanon(const string& f)
{
string fld = stringtolower(f);
@ -1075,7 +1057,7 @@ void RclConfig::initFrom(const RclConfig& r)
mimeview = new ConfStack<ConfSimple>(*(r.mimeview));
if (r.m_fields)
m_fields = new ConfStack<ConfSimple>(*(r.m_fields));
m_fldtopfx = r.m_fldtopfx;
m_fldtotraits = r.m_fldtotraits;
m_aliastocanon = r.m_aliastocanon;
m_storedFields = r.m_storedFields;
m_xattrtofld = r.m_xattrtofld;

View File

@ -55,6 +55,16 @@ public:
bool needrecompute();
};
// Data associated with an indexed field name.
struct FieldTraits {
    string pfx;    // Indexing prefix (empty unless given to the constructor)
    int wdfinc;    // Index time term frequency increment (default 1)
    double boost;  // Query time boost (default 1.0)

    // Defaults: empty prefix, wdfinc 1, boost 1.0.
    FieldTraits()
        : wdfinc(1), boost(1.0)
    {
    }
    // Explicit increment and boost; the prefix stays empty.
    FieldTraits(int i, double f)
        : wdfinc(i), boost(f)
    {
    }
    // Prefix only; increment and boost take their default values.
    FieldTraits(const string& s)
        : pfx(s), wdfinc(1), boost(1.0)
    {
    }
};
class RclConfig {
public:
@ -188,13 +198,7 @@ class RclConfig {
bool getMimeCatTypes(const string& cat, list<string>&);
/** fields: get field prefix from field name */
bool getFieldPrefix(const string& fldname, string &pfx);
/** Get implied meanings for field name (ie: author->[author, from]) */
bool getFieldSpecialisations(const string& fld,
list<string>& childrens, bool top = true);
/** Get prefixes for specialisations of field name */
bool getFieldSpecialisationPrefixes(const string& fld,
list<string>& pfxes);
bool getFieldTraits(const string& fldname, const FieldTraits **);
const set<string>& getStoredFields() {return m_storedFields;}
set<string> getIndexedFields();
/** Get canonic name for possible alias */
@ -256,7 +260,7 @@ class RclConfig {
ConfStack<ConfSimple> *mimeconf; // but their content may depend on it.
ConfStack<ConfSimple> *mimeview; //
ConfStack<ConfSimple> *m_fields;
map<string, string> m_fldtopfx;
map<string, FieldTraits> m_fldtotraits; // Field to field params
map<string, string> m_aliastocanon;
set<string> m_storedFields;
map<string, string> m_xattrtofld;

View File

@ -271,7 +271,7 @@ int main(int argc, char **argv)
Xapian::PostingIterator doc;
for (doc = db->postlist_begin(aterm);
doc != db->postlist_end(aterm); doc++) {
cout << *doc << " : " ;
cout << *doc << "(" << doc.get_wdf() << ") : " ;
Xapian::PositionIterator pos;
for (pos = doc.positionlist_begin();
pos != doc.positionlist_end(); pos++) {

View File

@ -89,33 +89,43 @@ static const string rclSyntAbs("?!#@");
// omega
static const string keycap("caption");
// Default table for field->prefix translation. We prefer the data
// from rclconfig if available. Note that this is logically const
// after initialization. Can't use a static object to init this as
// the static std::string objects may not be ready
static map<string, string> fldToPrefs;
static void initFldToPrefs()
// Static/Default table for field->prefix/weight translation.
// This is logically const after initialization. Can't use a
// static object to init this as the static std::string objects may
// not be ready.
//
// This map is searched if a match is not found in the dynamic
// "fields" configuration (cf: Db::fieldToTraits()), meaning that the
// entries can be overridden in the configuration, but not
// suppressed.
static map<string, FieldTraits> fldToTraits;
static void initFldToTraits()
{
fldToPrefs[Doc::keyabs] = string();
fldToPrefs["ext"] = "XE";
fldToPrefs[Doc::keyfn] = "XSFN";
// Can't remember why "abstract" is indexed without a prefix
// (result: it's indexed twice actually). Maybe I'll dare change
// this one day
fldToTraits[Doc::keyabs] = FieldTraits();
fldToPrefs[keycap] = "S";
fldToPrefs[Doc::keytt] = "S";
fldToPrefs["subject"] = "S";
fldToTraits["ext"] = FieldTraits("XE");
fldToTraits[Doc::keyfn] = FieldTraits("XSFN");
fldToPrefs[Doc::keyau] = "A";
fldToPrefs["creator"] = "A";
fldToPrefs["from"] = "A";
fldToTraits[keycap] = FieldTraits("S");
fldToTraits[Doc::keytt] = FieldTraits("S");
fldToTraits["subject"] = FieldTraits("S");
fldToPrefs[Doc::keykw] = "K";
fldToPrefs["keyword"] = "K";
fldToPrefs["tag"] = "K";
fldToPrefs["tags"] = "K";
fldToTraits[Doc::keyau] = FieldTraits("A");
fldToTraits["creator"] = FieldTraits("A");
fldToTraits["from"] = FieldTraits("A");
fldToPrefs["xapyear"] = "Y";
fldToPrefs["xapyearmon"] = "M";
fldToPrefs["xapdate"] = "D";
fldToTraits[Doc::keykw] = FieldTraits("K");
fldToTraits["keyword"] = FieldTraits("K");
fldToTraits["tag"] = FieldTraits("K");
fldToTraits["tags"] = FieldTraits("K");
fldToTraits["xapyear"] = FieldTraits("Y");
fldToTraits["xapyearmon"] = FieldTraits("M");
fldToTraits["xapdate"] = FieldTraits("D");
}
// Compute the unique term used to link documents to their origin.
@ -539,8 +549,8 @@ Db::Db(RclConfig *cfp)
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
m_maxFsOccupPc(0), m_mode(Db::DbRO)
{
if (!fldToPrefs.size())
initFldToPrefs();
if (!fldToTraits.size())
initFldToTraits();
m_ndb = new Native(this);
if (m_config) {
@ -791,17 +801,18 @@ bool Db::isopen()
// reason (old config not updated ?). We use it only if the config
// translation fails. Also we add in there fields which should be
// indexed with no prefix (ie: abstract)
bool Db::fieldToPrefix(const string& fld, string &pfx)
bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
{
if (m_config && m_config->getFieldPrefix(fld, pfx))
if (m_config && m_config->getFieldTraits(fld, ftpp))
return true;
// No data in rclconfig? Check default values
map<string, string>::const_iterator it = fldToPrefs.find(fld);
if (it != fldToPrefs.end()) {
pfx = it->second;
map<string, FieldTraits>::const_iterator it = fldToTraits.find(fld);
if (it != fldToTraits.end()) {
*ftpp = &it->second;
return true;
}
*ftpp = 0;
return false;
}
@ -817,15 +828,18 @@ class TextSplitDb : public TextSplit {
StopList &stops;
TextSplitDb(Xapian::WritableDatabase idb,
Xapian::Document &d, StopList &_stops)
: db(idb), doc(d), basepos(1), curpos(0), stops(_stops)
: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
{}
bool takeword(const std::string &term, int pos, int, int);
void setprefix(const string& pref) {prefix = pref;}
void setwdfinc(int i) {wdfinc = i;}
private:
// If prefix is set, we also add a posting for the prefixed terms
// (ie: for titles, add postings for both "term" and "Sterm")
string prefix;
// Some fields have more weight
int wdfinc;
};
// Get one term from the doc, remove accents and lowercase, then add posting
@ -853,17 +867,16 @@ bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
pos += basepos;
string ermsg;
try {
// Note: 1 is the within document frequency increment. It would
// be possible to assign different weights to doc parts (ie title)
// by using a higher value
doc.add_posting(term, pos, 1);
// Index without prefix, using the field-specific weighting
doc.add_posting(term, pos, wdfinc);
#ifdef TESTING_XAPIAN_SPELL
if (Db::isSpellingCandidate(term)) {
db.add_spelling(term);
}
#endif
// Index the prefixed term.
if (!prefix.empty()) {
doc.add_posting(prefix + term, pos, 1);
doc.add_posting(prefix + term, pos, wdfinc);
}
return true;
} XCATCHERROR(ermsg);
@ -984,26 +997,30 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
//
// The order has no importance, and we set a position gap of 100
// between fields to avoid false proximity matches.
map<string,string>::iterator meta_it;
string pfx;
map<string, string>::iterator meta_it;
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
if (!meta_it->second.empty()) {
if (!fieldToPrefix(meta_it->first, pfx)) {
const FieldTraits *ftp;
// We don't test for an empty prefix here. Some fields are part
// of the internal conf with an empty prefix (ie: abstract).
if (!fieldToTraits(meta_it->first, &ftp)) {
LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
meta_it->first.c_str()));
continue;
}
LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
meta_it->first.c_str(), pfx.c_str(),
LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n",
meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
meta_it->second.c_str()));
splitter.setprefix(pfx); // Subject
splitter.setprefix(ftp->pfx); // Subject
splitter.setwdfinc(ftp->wdfinc);
if (!splitter.text_to_words(meta_it->second))
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
meta_it->first.c_str()));
splitter.setprefix(string());
splitter.basepos += splitter.curpos + 100;
}
}
splitter.setprefix(string());
splitter.setwdfinc(1);
if (splitter.curpos < baseTextPosition)
splitter.basepos = baseTextPosition;
@ -1011,7 +1028,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
splitter.basepos += splitter.curpos + 100;
// Split and index body text
LOGDEB2(("Db::add: split body\n"));
LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
if (!splitter.text_to_words(doc.text))
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
@ -1560,11 +1577,13 @@ bool Db::termMatch(MatchType typ, const string &lang,
string prefix;
if (!field.empty()) {
(void)fieldToPrefix(field, prefix);
if (prefix.empty()) {
const FieldTraits *ftp = 0;
if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
field.c_str()));
}
} else {
prefix = ftp->pfx;
}
if (prefixp)
*prefixp = prefix;
}

View File

@ -24,6 +24,7 @@
#include "refcntr.h"
#include "rcldoc.h"
#include "stoplist.h"
#include "rclconfig.h"
#ifndef NO_NAMESPACES
using std::string;
@ -130,7 +131,7 @@ class Db {
/* Return list of configured stop words */
const StopList& getStopList() const {return m_stops;}
/* Field name to prefix translation (ie: author -> 'A') */
bool fieldToPrefix(const string& fldname, string &pfx);
bool fieldToTraits(const string& fldname, const FieldTraits **ftpp);
/* Update-related methods ******************************************/

View File

@ -219,6 +219,7 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
LOGERR(("Can't retrieve index min/max dates\n"));
//whatever, go on.
}
if (m_dates.y1 == 0) {
m_dates.y1 = minyear;
m_dates.m1 = 1;
@ -572,8 +573,11 @@ void StringToXapianQ::expandTerm(bool nostemexp,
if (nostemexp && !haswild) {
// Neither stemming nor wildcard expansion: just the word
string pfx;
if (!m_field.empty())
m_db.fieldToPrefix(m_field, pfx);
const FieldTraits *ftp;
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
pfx = ftp->pfx;
}
sterm = term;
m_uterms.push_back(sterm);
exp.push_front(pfx+term);

View File

@ -1,5 +1,7 @@
# @(#$Id: fields,v 1.5 2008-10-08 08:27:34 dockes Exp $ (C) 2007 J.F.Dockes
# Field names configuration. This defines how one may search ie for
# (C) 2007-2011 J.F.Dockes
# License: GPL V2
#
# Field names configuration. This defines how one may search, e.g. for:
# author:Hemingway
#
# Important:
@ -14,19 +16,33 @@
# The choice of field names is rather arbitrary. Use of any of the aliases
# defined in the following section will yield exactly the same results,
# (both for indexing and search).
#
# Fields can have two relevance boost factors defined, such as in:
# caption = S ; wdfinc=10
# and/or
# caption = S ; boost = 10
# The first line would boost the xapian "within document frequency" of
# caption terms by a factor of 10 at indexing time. The second one (not
# currently implemented) would automatically boost the weight of a
# caption-based field query (ie: caption:mytitle or title:mytitle) at query
# time.
[prefixes]
# Native fields matching omega uses, which we index without an X first
# letter. Don't change these. Caption is used for 'title' to keep a last
# remnant of omega compatibility inside the data record. Also D,F,M,Q,T,Y
caption = S
# remnant of omega compatibility inside the data record.
# Also reserved/hardcoded: D(ate), M(onth), Y(ear),
# F(parentid), Q(uniqueid), T(mime type)
caption = S ; wdfinc = 10
author = A
keywords = K
# Extension examples. These are actually used by default by Recoll, you can
# add your own to search for fields produced by the filters and not handled
# by default.
# Some values are reserved by recoll: XP (for path elements).
# Some values are internally reserved by recoll:
# XP (for path elements).
ext = XE
filename = XSFN
recipient = XTO
@ -65,7 +81,7 @@ filename=
[aliases]
abstract = summary dc:summary description xesam:description
author = creator dc:creator xesam:author xesam:creator from
caption = title title dc:title subject
caption = title dc:title subject
# catg = dc:type contentCategory
dbytes = size xesam:size
dmtime = date dc:date dc:datemodified datemodified contentmodified \