foundation work for configurable stored/indexed fields

2008-09-08 16:49:10 +00:00 · 2008-09-08 16:49:10 +00:00 · 89c3dfdf98
commit 89c3dfdf98
parent c8114446cf
17 changed files with 370 additions and 110 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.56 2007-12-13 06:58:21 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.57 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -109,25 +109,27 @@ RclConfig::RclConfig(const string *argcnf)
    m_cdirs.push_back(path_cat(m_datadir, "examples"));
    string cnferrloc = m_confdir + " or " + path_cat(m_datadir, "examples");

+    // Read and process "recoll.conf"
    if (!updateMainConfig())
 	return;
-
+    // Other files
    mimemap = new ConfStack<ConfTree>("mimemap", m_cdirs, true);
    if (mimemap == 0 || !mimemap->ok()) {
 	m_reason = string("No or bad mimemap file in: ") + cnferrloc;
 	return;
    }
-
-    mimeconf = new ConfStack<ConfTree>("mimeconf", m_cdirs, true);
+    mimeconf = new ConfStack<ConfSimple>("mimeconf", m_cdirs, true);
    if (mimeconf == 0 || !mimeconf->ok()) {
 	m_reason = string("No/bad mimeconf in: ") + cnferrloc;
 	return;
    }
-    mimeview = new ConfStack<ConfTree>("mimeview", m_cdirs, true);
-    if (mimeconf == 0 || !mimeconf->ok()) {
+    mimeview = new ConfStack<ConfSimple>("mimeview", m_cdirs, true);
+    if (mimeview == 0 || !mimeview->ok()) {
 	m_reason = string("No/bad mimeview in: ") + cnferrloc;
 	return;
    }
+    if (!readFieldsConfig(cnferrloc))
+	return;

    m_ok = true;
    setKeyDir("");
@ -453,15 +455,129 @@ string RclConfig::getMimeHandlerDef(const std::string &mtype, bool filtertypes)
    return hs;
 }

+// Read definitions for field prefixes, aliases, and hierarchy and arrange 
+// things for speed (these are used a lot during indexing)
+bool RclConfig::readFieldsConfig(const string& cnferrloc)
+{
+    m_fields = new ConfStack<ConfSimple>("fields", m_cdirs, true);
+    if (m_fields == 0 || !m_fields->ok()) {
+	m_reason = string("No/bad fields file in: ") + cnferrloc;
+	return false;
+    }
+
+    // Build a direct map avoiding all indirections for field to
+    // prefix translation
+    // Add direct prefixes
+    list<string>tps = m_fields->getNames("prefixes");
+    for (list<string>::const_iterator it = tps.begin(); it != tps.end();it++) {
+	string val;
+	m_fields->get(*it, val, "prefixes");
+	m_fldtopref[*it] = val;
+    }
+    // Add prefixes for aliases:
+    tps = m_fields->getNames("aliases");
+    for (list<string>::const_iterator it = tps.begin(); it != tps.end();it++) {
+	string canonic = *it; // canonic name
+	string pfx;
+	map<string,string>::const_iterator pit = m_fldtopref.find(canonic);
+	if (pit != m_fldtopref.end()) {
+	    pfx = pit->second;
+	} else {
+	    // Note: it's perfectly normal to have no prefix for the canonic
+	    // name, this could be a stored, not indexed field
+	    LOGDEB2(("RclConfig::readFieldsConfig: no pfx for canonic [%s]\n",
+		    canonic.c_str()));
+	    continue;
+	}
+	string aliases;
+	m_fields->get(canonic, aliases, "aliases");
+	list<string> l;
+	stringToStrings(aliases, l);
+	for (list<string>::const_iterator ait = l.begin();
+	     ait != l.end(); ait++) {
+	    m_fldtopref[*ait] = pfx;
+	}
+    }
+#if 0
+    for (map<string,string>::const_iterator it = m_fldtopref.begin();
+	 it != m_fldtopref.end(); it++) {
+	LOGDEB(("RclConfig::readFieldsConfig: [%s] => [%s]\n",
+		it->first.c_str(), it->second.c_str()));
+    }
+#endif
+
+    string ss;
+    if (m_fields->get("stored", ss, "stored")) {
+	list<string> sl;
+	stringToStrings(ss, sl);
+	for (list<string>::const_iterator it = sl.begin(); 
+	     it != sl.end(); it++) {
+	    LOGDEB(("Inserting [%s] in stored list\n", (*it).c_str()));
+	    m_storedFields.insert(*it);
+	}
+    }
+
+    return true;
+}
+
+// Return term indexing prefix for field name (ie: "filename" -> "XSFN"), false if none
 bool RclConfig::getFieldPrefix(const string& fld, string &pfx)
 {
-    if (!mimeconf->get(fld, pfx, "prefixes")) {
-      LOGDEB(("getFieldPrefix: no prefix defined for '%s'\n", fld.c_str()));
-      return false;
+    map<string,string>::const_iterator pit = m_fldtopref.find(fld);
+    if (pit != m_fldtopref.end()) {
+	pfx = pit->second;
+	return true;
+    } else {
+	LOGDEB1(("RclConfig::getFieldPrefix: no prefix for field [%s]\n",
+		 fld.c_str()));
+	return false;
+    }
+}
+
+// Return specialisations of field name for search expansion 
+// (ie: author->[author, from])
+bool RclConfig::getFieldSpecialisations(const string& fld, 
+					list<string>& children, bool top)
+{
+    string sclds;
+    children.push_back(fld);
+    if (m_fields->get(fld, sclds, "specialisations")) {
+	list<string> clds;
+	stringToStrings(sclds, clds);
+	for (list<string>::const_iterator it = clds.begin();
+	     it != clds.end(); it++) {
+	    getFieldSpecialisations(*it, children, false);
+	}
+    }
+    if (top) {
+	children.sort();
+	children.unique();
    }
    return true;
 }

+// Return the indexing prefixes for a field and all of its specialisations
+bool RclConfig::getFieldSpecialisationPrefixes(const string& fld, 
+					       list<string>& pfxes)
+{
+    list<string> clds;
+    getFieldSpecialisations(fld, clds);
+    for (list<string>::const_iterator it = clds.begin();
+	 it != clds.end(); it++) {
+	string pfx;
+	if (getFieldPrefix(*it, pfx))
+	    pfxes.push_back(pfx);
+    }
+    pfxes.sort();
+    pfxes.unique();
+    return true;
+}
+bool RclConfig::fieldIsStored(const string& fld)
+{
+    set<string>::const_iterator it = m_storedFields.find(fld);
+    return it != m_storedFields.end();
+}
+
 string RclConfig::getMimeViewerDef(const string &mtype)
 {
    string hs;
@ -497,7 +613,7 @@ bool RclConfig::setMimeViewerDef(const string& mt, const string& def)
    cdirs.push_back(path_cat(m_datadir, "examples"));

    delete mimeview;
-    mimeview = new ConfStack<ConfTree>("mimeview", cdirs, true);
+    mimeview = new ConfStack<ConfSimple>("mimeview", cdirs, true);
    if (mimeview == 0 || !mimeview->ok()) {
 	m_reason = string("No/bad mimeview in: ") + m_confdir;
 	return false;
@ -727,6 +843,7 @@ void RclConfig::freeAll()
    delete mimemap;
    delete mimeconf; 
    delete mimeview; 
+    delete m_fields;
    delete STOPSUFFIXES;
    // just in case
    zeroMe();
@ -747,9 +864,9 @@ void RclConfig::initFrom(const RclConfig& r)
    if (r.mimemap)
 	mimemap = new ConfStack<ConfTree>(*(r.mimemap));
    if (r.mimeconf)
-	mimeconf = new ConfStack<ConfTree>(*(r.mimeconf));
+	mimeconf = new ConfStack<ConfSimple>(*(r.mimeconf));
    if (r.mimeview)
-	mimeview = new ConfStack<ConfTree>(*(r.mimeview));
+	mimeview = new ConfStack<ConfSimple>(*(r.mimeview));
    if (r.m_stopsuffixes)
 	m_stopsuffixes = new SuffixStore(*((SuffixStore*)r.m_stopsuffixes));
    m_maxsufflen = r.m_maxsufflen;
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@ -16,19 +16,23 @@
 */
 #ifndef _RCLCONFIG_H_INCLUDED_
 #define _RCLCONFIG_H_INCLUDED_
-/* @(#$Id: rclconfig.h,v 1.39 2007-11-16 14:28:52 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rclconfig.h,v 1.40 2008-09-08 16:49:10 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <list>
 #include <string>
 #include <vector>
 #include <set>
 #include <utility>
+#include <map>
+#include <set>
 #ifndef NO_NAMESPACES
 using std::list;
 using std::string;
 using std::vector;
 using std::pair;
 using std::set;
+using std::map;
+using std::set;
 #endif

 #include "conftree.h"
@ -142,6 +146,14 @@ class RclConfig {

    /** mimeconf: get field prefix from field name */
    bool getFieldPrefix(const string& fldname, string &pfx);
+    /** Get implied meanings for field name (ie: author->[author, from]) */
+    bool getFieldSpecialisations(const string& fld, 
+				 list<string>& childrens, bool top = true);
+    /** Get prefixes for specialisations of field name */
+    bool getFieldSpecialisationPrefixes(const string& fld, 
+					list<string>& pfxes);
+    bool fieldIsStored(const string& fld);
+    const set<string>& getStoredFields() {return m_storedFields;}

    /** mimeview: get/set external viewer exec string(s) for mimetype(s) */
    string getMimeViewerDef(const string &mimetype);
@ -181,8 +193,11 @@ class RclConfig {

    ConfStack<ConfTree> *m_conf;   // Parsed configuration files
    ConfStack<ConfTree> *mimemap;  // The files don't change with keydir, 
-    ConfStack<ConfTree> *mimeconf; // but their content may depend on it.
-    ConfStack<ConfTree> *mimeview; // 
+    ConfStack<ConfSimple> *mimeconf; // but their content may depend on it.
+    ConfStack<ConfSimple> *mimeview; // 
+    ConfStack<ConfSimple> *m_fields;
+    map<string, string>  m_fldtopref;
+    set<string>          m_storedFields;

    void        *m_stopsuffixes;
    unsigned int m_maxsufflen;
@ -205,11 +220,13 @@ class RclConfig {
 	mimemap = 0; 
 	mimeconf = 0; 
 	mimeview = 0; 
+	m_fields = 0;
 	m_stopsuffixes = 0;
 	m_maxsufflen = 0;
    }
    /** Free data then zero pointers */
    void freeAll();
+    bool readFieldsConfig(const string& errloc);
 };


--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: internfile.cpp,v 1.40 2008-09-05 10:36:06 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: internfile.cpp,v 1.41 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -286,13 +286,14 @@ static inline bool getKeyValue(const map<string, string>& docdata,
    return false;
 }

-static const string keyab("abstract");
+// These defs are for the Dijon meta array. Rcl::Doc predefined field
+// names are used where appropriate. In some cases, Rcl::Doc names are
+// used inside the Dijon metadata (ex: origcharset)
 static const string keyau("author");
 static const string keycs("charset");
 static const string keyct("content");
 static const string keyds("description");
 static const string keyfn("filename");
-static const string keykw("keywords");
 static const string keymd("modificationdate");
 static const string keymt("mimetype");
 static const string keyoc("origcharset");
@ -317,8 +318,8 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
 	    doc.meta[it->first] = it->second;
 	}
    }
-    if (doc.meta[keyab].empty() && !doc.meta[keyds].empty()) {
-	doc.meta[keyab] = doc.meta[keyds];
+    if (doc.meta[Rcl::Doc::keyabs].empty() && !doc.meta[keyds].empty()) {
+	doc.meta[Rcl::Doc::keyabs] = doc.meta[keyds];
 	doc.meta.erase(keyds);
    }
    return true;
@ -353,7 +354,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc, string& ipath) const
 	} else {
 	    ipath += isep;
 	}
-	getKeyValue(docdata, keyau, doc.meta["author"]);
+	getKeyValue(docdata, keyau, doc.meta[Rcl::Doc::keyau]);
 	getKeyValue(docdata, keymd, doc.dmtime);
    }

--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@ -8,8 +8,8 @@ LIBS = librcl.a

 all: $(LIBS)

-OBJS =  rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o recollq.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
-DEPS =  rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp recollq.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
+OBJS =  rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o recollq.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
+DEPS =  rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp recollq.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp

 librcl.a : $(DEPS) $(OBJS) unac.o
 	ar ru librcl.a $(OBJS) unac.o
@ -71,6 +71,8 @@ pathhash.o : ../rcldb/pathhash.cpp
 	$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/pathhash.cpp
 rcldb.o : ../rcldb/rcldb.cpp
 	$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp
+rcldoc.o : ../rcldb/rcldoc.cpp
+	$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldoc.cpp
 rclquery.o : ../rcldb/rclquery.cpp
 	$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rclquery.cpp
 searchdata.o : ../rcldb/searchdata.cpp
@ -198,6 +200,9 @@ pathhash.dep.stamp : ../rcldb/pathhash.cpp
 rcldb.dep.stamp : ../rcldb/rcldb.cpp
 	$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldb.cpp > rcldb.dep
 	touch rcldb.dep.stamp
+rcldoc.dep.stamp : ../rcldb/rcldoc.cpp
+	$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldoc.cpp > rcldoc.dep
+	touch rcldoc.dep.stamp
 rclquery.dep.stamp : ../rcldb/rclquery.cpp
 	$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rclquery.cpp > rclquery.dep
 	touch rclquery.dep.stamp
@ -285,6 +290,7 @@ include wasastringtoquery.dep
 include wasatorcl.dep
 include pathhash.dep
 include rcldb.dep
+include rcldoc.dep
 include rclquery.dep
 include searchdata.dep
 include stemdb.dep
--- a/src/lib/mkMake
+++ b/src/lib/mkMake
@ -31,6 +31,7 @@ ${depth}/query/wasastringtoquery.cpp \
 ${depth}/query/wasatorcl.cpp \
 ${depth}/rcldb/pathhash.cpp \
 ${depth}/rcldb/rcldb.cpp \
+${depth}/rcldb/rcldoc.cpp \
 ${depth}/rcldb/rclquery.cpp \
 ${depth}/rcldb/searchdata.cpp \
 ${depth}/rcldb/stemdb.cpp \
--- a/src/python/recoll/pyrecoll.cpp
+++ b/src/python/recoll/pyrecoll.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.10 2008-08-28 15:44:37 dockes Exp $ (C) 2007 J.F.Dockes";
+static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.11 2008-09-08 16:49:10 dockes Exp $ (C) 2007 J.F.Dockes";
 #endif


@ -394,8 +394,8 @@ static PyGetSetDef Doc_getseters[] = {
     "fbytes", (void *)"fbytes"},
    {"dbytes", (getter)Doc_getmeta, (setter)Doc_setmeta, 
     "dbytes", (void *)"dbytes"},
-    {"relevance", (getter)Doc_getmeta, (setter)Doc_setmeta, 
-     "relevance", (void *)"relevance"},
+    {"relevancyrating", (getter)Doc_getmeta, (setter)Doc_setmeta, 
+     "relevance", (void *)"relevancyrating"},
    {"title", (getter)Doc_getmeta, (setter)Doc_setmeta, 
     "title", (void *)"title"},
    {"keywords", (getter)Doc_getmeta, (setter)Doc_setmeta, 
--- a/src/python/samples/recollq.py
+++ b/src/python/samples/recollq.py
@ -3,7 +3,7 @@
 import sys
 import recoll
 allmeta = ("title", "keywords", "abstract", "url", "mimetype", "mtime",
-           "ipath", "fbytes", "dbytes", "relevance")
+           "ipath", "fbytes", "dbytes", "relevancyrating")


 def dotest(db, q):
--- a/src/qtgui/preview_w.cpp
+++ b/src/qtgui/preview_w.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.35 2008-07-01 08:27:58 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.36 2008-09-08 16:49:10 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -753,8 +753,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,

    Rcl::Doc doc = idoc;

-    if (doc.meta["title"].empty()) 
-	doc.meta["title"] = path_getsimple(doc.url);
+    if (doc.meta[Rcl::Doc::keytt].empty()) 
+	doc.meta[Rcl::Doc::keytt] = path_getsimple(doc.url);

    setCurTabProps(fn, doc, docnum);

--- a/src/qtgui/reslist.cpp
+++ b/src/qtgui/reslist.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: reslist.cpp,v 1.41 2008-08-26 07:33:05 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: reslist.cpp,v 1.42 2008-09-08 16:49:10 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif

 #include <time.h>
@ -418,7 +418,7 @@ void ResList::resultPageNext()
 	if (percent == -1) {
 	    percent = 0;
 	    // Document not available, maybe other further, will go on.
-	    doc.meta["abstract"] = string(tr("Unavailable document").utf8());
+	    doc.meta[Rcl::Doc::keyabs] = string(tr("Unavailable document").utf8());
 	}

 	// Determine icon to display if any
@ -442,8 +442,8 @@ void ResList::resultPageNext()
 	printableUrl(rclconfig->getDefCharset(), doc.url, url);

 	// Make title out of file name if none yet
-	if (doc.meta["title"].empty()) {
-	    doc.meta["title"] = path_getsimple(url);
+	if (doc.meta[Rcl::Doc::keytt].empty()) {
+	    doc.meta[Rcl::Doc::keytt] = path_getsimple(url);
 	}

 	// Result number
@ -485,7 +485,7 @@ void ResList::resultPageNext()
 	    (doc.syntabs || prefs.queryReplaceAbstract)) {
 	    abstract = m_docSource->getAbstract(doc);
 	} else {
-	    abstract = doc.meta["abstract"];
+	    abstract = doc.meta[Rcl::Doc::keyabs];
 	}
 	// No need to call escapeHtml(), plaintorich handles it
 	list<string> lr;
@ -520,14 +520,14 @@ void ResList::resultPageNext()
 	subs['A'] = !richabst.empty() ? richabst + "<br>" : "";
 	subs['D'] = datebuf;
 	subs['I'] = img_name;
-	subs['K'] = !doc.meta["keywords"].empty() ? 
-	    escapeHtml(doc.meta["keywords"]) + "<br>" : "";
+	subs['K'] = !doc.meta[Rcl::Doc::keykw].empty() ? 
+	    escapeHtml(doc.meta[Rcl::Doc::keykw]) + "<br>" : "";
 	subs['L'] = linksbuf;
 	subs['N'] = numbuf;
 	subs['M'] = doc.mimetype;
 	subs['R'] = perbuf;
 	subs['S'] = sizebuf;
-	subs['T'] = escapeHtml(doc.meta["title"]);
+	subs['T'] = escapeHtml(doc.meta[Rcl::Doc::keytt]);
 	subs['U'] = url;

 	string formatted;
--- a/src/query/docseq.h
+++ b/src/query/docseq.h
@ -16,7 +16,7 @@
 */
 #ifndef _DOCSEQ_H_INCLUDED_
 #define _DOCSEQ_H_INCLUDED_
-/* @(#$Id: docseq.h,v 1.13 2008-06-13 18:22:46 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: docseq.h,v 1.14 2008-09-08 16:49:10 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include <list>
 #include <vector>
@ -70,7 +70,7 @@ class DocSequence {
     *  The default is to return the input doc's abstract fields, but some 
     *  sequences can compute a better value (ie: docseqdb) */
    virtual string getAbstract(Rcl::Doc& doc) {
-	return doc.meta["abstract"];
+	return doc.meta[Rcl::Doc::keyabs];
    }

    /** Get estimated total count in results */
--- a/src/query/docseqdb.cpp
+++ b/src/query/docseqdb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.4 2008-06-13 18:22:46 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.5 2008-09-08 16:49:10 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -62,10 +62,10 @@ int DocSequenceDb::getResCnt()
 string DocSequenceDb::getAbstract(Rcl::Doc &doc)
 {
    if (!m_q->whatDb())
-	return doc.meta["abstract"];
+	return doc.meta[Rcl::Doc::keyabs];
    string abstract;
    m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), abstract);
-    return abstract.empty() ? doc.meta["abstract"] : abstract;
+    return abstract.empty() ? doc.meta[Rcl::Doc::keyabs] : abstract;
 }

 list<string> DocSequenceDb::expand(Rcl::Doc &doc)
--- a/src/query/recollq.cpp
+++ b/src/query/recollq.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: recollq.cpp,v 1.13 2008-06-13 18:22:46 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: recollq.cpp,v 1.14 2008-09-08 16:49:10 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -59,6 +59,7 @@ static char usage [] =
 "    -d also dump file contents\n"
 "    -n <cnt> limit the maximum number of results (0->no limit, default 2000)\n"
 "    -b : basic. Just output urls, no mime types or titles\n"
+"    -m : dump the whole document meta[] array\n"
 ;
 static void
 Usage(void)
@ -80,6 +81,7 @@ static int     op_flags;
 #define OPT_l     0x100
 #define OPT_q     0x200
 #define OPT_t     0x400
+#define OPT_m     0x800

 int recollq(RclConfig **cfp, int argc, char **argv)
 {
@ -103,6 +105,7 @@ int recollq(RclConfig **cfp, int argc, char **argv)
            case 'd':   op_flags |= OPT_d; break;
            case 'f':   op_flags |= OPT_f; break;
            case 'l':   op_flags |= OPT_l; break;
+            case 'm':   op_flags |= OPT_m; break;
 	    case 'n':	op_flags |= OPT_n; if (argc < 2)  Usage();
 		limit = atoi(*(++argv));
 		if (limit <= 0) limit = INT_MAX;
@ -192,9 +195,16 @@ int recollq(RclConfig **cfp, int argc, char **argv)
 	    cout 
 		<< doc.mimetype.c_str() << "\t"
 		<< "[" << doc.url.c_str() << "]" << "\t" 
-		<< "[" << doc.meta["title"].c_str() << "]" << "\t"
+		<< "[" << doc.meta[Rcl::Doc::keytt].c_str() << "]" << "\t"
 		<< doc.fbytes.c_str()   << "\tbytes" << "\t"
 		<<  endl;
+	    if (op_flags & OPT_m) {
+		for (map<string,string>::const_iterator it = doc.meta.begin();
+		     it != doc.meta.end(); it++) {
+		    cout << it->first << " = " << it->second << endl;
+		}
+	    }
+	    cout << endl;
 	}
 	if (op_flags & OPT_d) {
 	    string fn = doc.url.substr(7);
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.142 2008-09-05 10:34:17 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.143 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -129,6 +129,13 @@ bool Db::Native::subDocs(const string &udi, vector<Xapian::docid>& docids)
    return false;
 }

+static const string keycap("caption");
+static const string keymtp("mtype");
+static const string keyfmt("fmtime");
+static const string keydmt("dmtime");
+static const string keyoc("origcharset");
+static const string keyurl("url");
+
 // Turn data record from db into document fields
 bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, 
 				Doc &doc, int percent)
@ -137,30 +144,37 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
    ConfSimple parms(&data);
    if (!parms.ok())
 	return false;
-    parms.get(string("url"), doc.url);
-    parms.get(string("mtype"), doc.mimetype);
-    parms.get(string("fmtime"), doc.fmtime);
-    parms.get(string("dmtime"), doc.dmtime);
-    parms.get(string("origcharset"), doc.origcharset);
-    parms.get(string("caption"), doc.meta["title"]);
-    parms.get(string("keywords"), doc.meta["keywords"]);
-    parms.get(string("abstract"), doc.meta["abstract"]);
-    parms.get(string("author"), doc.meta["author"]);
+    parms.get(keyurl, doc.url);
+    parms.get(keymtp, doc.mimetype);
+    parms.get(keyfmt, doc.fmtime);
+    parms.get(keydmt, doc.dmtime);
+    parms.get(keyoc, doc.origcharset);
+    parms.get(keycap, doc.meta[Doc::keytt]);
+    parms.get(Doc::keykw, doc.meta[Doc::keykw]);
+    parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
    // Possibly remove synthetic abstract indicator (if it's there, we
    // used to index the beginning of the text as abstract).
    doc.syntabs = false;
-    if (doc.meta["abstract"].find(rclSyntAbs) == 0) {
-	doc.meta["abstract"] = doc.meta["abstract"].substr(rclSyntAbs.length());
+    if (doc.meta[Doc::keyabs].find(rclSyntAbs) == 0) {
+	doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(rclSyntAbs.length());
 	doc.syntabs = true;
    }
    char buf[20];
    sprintf(buf,"%.2f", float(percent) / 100.0);
-    doc.meta["relevancyrating"] = buf;
+    doc.meta[Doc::keyrr] = buf;
    parms.get(string("ipath"), doc.ipath);
    parms.get(string("fbytes"), doc.fbytes);
    parms.get(string("dbytes"), doc.dbytes);
    parms.get(string("sig"), doc.sig);
    doc.xdocid = docid;
+
+    // Other, not predefined meta fields:
+    list<string> keys = parms.getNames(string());
+    for (list<string>::const_iterator it = keys.begin(); 
+	 it != keys.end(); it++) {
+	if (doc.meta.find(*it) == doc.meta.end()) 
+	    parms.get(*it, doc.meta[*it]);
+    }
    return true;
 }

@ -680,21 +694,21 @@ bool Db::fieldToPrefix(const string& fldname, string &pfx)
    // This is the default table
    static map<string, string> fldToPrefs;
    if (fldToPrefs.empty()) {
-	fldToPrefs["abstract"] = string();
+	fldToPrefs[Doc::keyabs] = string();
 	fldToPrefs["ext"] = "XE";
 	fldToPrefs["filename"] = "XSFN";

 	fldToPrefs["title"] = "S";
-	fldToPrefs["caption"] = "S";
+	fldToPrefs[keycap] = "S";
 	fldToPrefs["subject"] = "S";

-	fldToPrefs["author"] = "A";
+	fldToPrefs[Doc::keyau] = "A";
 	fldToPrefs["creator"] = "A";
 	fldToPrefs["from"] = "A";

 	fldToPrefs["keyword"] = "K";
 	fldToPrefs["tag"] = "K";
-	fldToPrefs["keywords"] = "K";
+	fldToPrefs[Doc::keykw] = "K";
 	fldToPrefs["tags"] = "K";
    }

@ -803,6 +817,7 @@ void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
 }

 static const int MB = 1024 * 1024;
+static const string nc("\n\r\x0c");

 // Add document in internal form to the database: index the terms in
 // the title abstract and body and add special terms for file name,
@ -831,35 +846,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,

    Doc doc = idoc;

-    // The title, author, abstract and keywords fields are special, they
-    // get stored in the document data record.
-    // Truncate abstract, title and keywords to reasonable lengths. If
-    // abstract is currently empty, we make up one with the beginning
-    // of the document. This is then not indexed, but part of the doc
-    // data so that we can return it to a query without having to
-    // decode the original file.
-    bool syntabs = false;
-    // Note that the map accesses by operator[] create empty entries if they
-    // don't exist yet.
-    if (doc.meta["abstract"].empty()) {
-	syntabs = true;
-	doc.meta["abstract"] = rclSyntAbs + 
-	    neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), "\n\r");
-    } else {
-	doc.meta["abstract"] = 
-	    neutchars(truncate_to_word(doc.meta["abstract"], m_idxAbsTruncLen),
-		      "\n\r");
-    }
-    if (doc.meta["title"].empty())
-	doc.meta["title"] = doc.utf8fn;
-    doc.meta["title"] = 
-	neutchars(truncate_to_word(doc.meta["title"], 150), "\n\r");
-    doc.meta["author"] = 
-	neutchars(truncate_to_word(doc.meta["author"], 150), "\n\r");
-    doc.meta["keywords"] = 
-	neutchars(truncate_to_word(doc.meta["keywords"], 300),"\n\r");
-
-
    Xapian::Document newdocument;
    mySplitterCB splitData(newdocument, m_stops);
    TextSplit splitter(&splitData);
@ -882,11 +868,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    string pfx;
    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
 	if (!meta_it->second.empty()) {
-	    if (meta_it->first == "abstract" && syntabs)
-		continue;
 	    if (!fieldToPrefix(meta_it->first, pfx)) {
 		LOGDEB(("Db::add: no prefix for field [%s], no indexing\n",
-			meta_it->first.c_str()));
+			 meta_it->first.c_str()));
 		continue;
 	    }
 	    LOGDEB1(("Db::add: field [%s] pfx [%s]: [%s]\n", 
@ -908,7 +892,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    else
 	splitData.basepos += splitData.curpos + 100;

-    // Finally: split and index body text
+    // Split and index body text
    LOGDEB2(("Db::add: split body\n"));
    if (!dumb_string(doc.text, noacc)) {
 	LOGERR(("Db::add: dumb_string failed\n"));
@ -958,11 +942,22 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    buf[4] = '\0';
    newdocument.add_term("Y" + string(buf)); // Year (YYYY)

+
+    //////////////////////////////////////////////////////////////////
    // Document data record. omindex has the following nl separated fields:
    // - url
    // - sample
    // - caption (title limited to 100 chars)
    // - mime type 
+    //
+    // The title, abstract and keywords fields are special, they
+    // always get stored in the document data record. Other fields
+    // (ie: author) are stored if listed in the "stored" configuration.
+    //
+    // We truncate stored fields abstract, title and keywords to
+    // reasonable lengths and suppress newlines (so that the data
+    // record can keep a simple syntax)
+
    string record = "url=" + doc.url;
    record += "\nmtype=" + doc.mimetype;
    record += "\nfmtime=" + doc.fmtime;
@ -982,20 +977,55 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
    record += string("\ndbytes=") + sizebuf;

-    if (!doc.ipath.empty()) {
+    if (!doc.ipath.empty())
 	record += "\nipath=" + doc.ipath;
+
+    if (doc.meta[Doc::keytt].empty())
+	doc.meta[Doc::keytt] = doc.utf8fn;
+    doc.meta[Doc::keytt] = 
+	neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), nc);
+    if (!doc.meta[Doc::keytt].empty())
+	record += "\n" + keycap + "=" + doc.meta[Doc::keytt];
+
+    doc.meta[Doc::keykw] = 
+	neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), nc);
+    if (!doc.meta[Doc::keykw].empty())
+	record += "\n" + Doc::keykw + "=" + doc.meta[Doc::keykw];
+
+    // If abstract is empty, we make up one with the beginning of the
+    // document. This is then not indexed, but part of the doc data so
+    // that we can return it to a query without having to decode the
+    // original file.
+    bool syntabs = false;
+    // Note that the map accesses by operator[] create empty entries if they
+    // don't exist yet.
+    if (doc.meta[Doc::keyabs].empty()) {
+	syntabs = true;
+	if (!doc.text.empty())
+	    doc.meta[Doc::keyabs] = rclSyntAbs + 
+		neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), nc);
+    } else {
+	doc.meta[Doc::keyabs] = 
+	    neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
+		      nc);
    }
-    if (!doc.meta["title"].empty())
-	record += "\ncaption=" + doc.meta["title"];
-    if (!doc.meta["keywords"].empty())
-	record += "\nkeywords=" + doc.meta["keywords"];
-    if (!doc.meta["abstract"].empty())
-	record += "\nabstract=" + doc.meta["abstract"];
-    if (!doc.meta["author"].empty()) {
-	record += "\nauthor=" + doc.meta["author"];
+    if (!doc.meta[Doc::keyabs].empty())
+	record += "\n" + Doc::keyabs + "=" + doc.meta[Doc::keyabs];
+
+    RclConfig *config = RclConfig::getMainConfig();
+    if (config) {
+	const set<string>& stored = config->getStoredFields();
+	for (set<string>::const_iterator it = stored.begin();
+	     it != stored.end(); it++) {
+	    if (!doc.meta[*it].empty()) {
+		string value = 
+		    neutchars(truncate_to_word(doc.meta[*it], 150), nc);
+		record += "\n" + *it + "=" + value;
+	    }
+	}
    }
    record += "\n";
-    LOGDEB1(("Newdocument data: %s\n", record.c_str()));
+    LOGDEB(("Rcl::Db::add: new doc record:\n %s\n", record.c_str()));
    newdocument.set_data(record);

    const char *fnc = udi.c_str();
--- a/src/rcldb/rcldoc.cpp
+++ b/src/rcldb/rcldoc.cpp
@ -0,0 +1,14 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: rcldoc.cpp,v 1.1 2008-09-08 16:49:10 dockes Exp $ (C) 2007 J.F.Dockes";
+#endif
+
+
+#include "rcldoc.h"
+namespace Rcl { // Definitions of the static field name constants declared in class Doc
+const string Doc::keyabs("abstract"); // used as a doc.meta key for the document abstract
+const string Doc::keyau("author"); // presumably the doc.meta key for the author field - confirm
+const string Doc::keyfn("filename"); // presumably the doc.meta key for the file name - confirm
+const string Doc::keykw("keywords"); // used as a doc.meta key for the document keywords
+const string Doc::keyrr("relevancyrating"); // NOTE(review): looks query-side (relevancy rating) - confirm
+const string Doc::keytt("title"); // presumably the doc.meta key for the title - confirm
+} // namespace Rcl
--- a/src/rcldb/rcldoc.h
+++ b/src/rcldb/rcldoc.h
@ -16,7 +16,7 @@
 */
 #ifndef _RCLDOC_H_INCLUDED_
 #define _RCLDOC_H_INCLUDED_
-/* @(#$Id: rcldoc.h,v 1.8 2008-08-26 07:33:31 dockes Exp $  (C) 2006 J.F.Dockes */
+/* @(#$Id: rcldoc.h,v 1.9 2008-09-08 16:49:10 dockes Exp $  (C) 2006 J.F.Dockes */

 #include <string>
 #include <map>
@ -51,7 +51,7 @@ class Doc {

    // Transcoded version of the simple file name for SFN-prefixed
    // specific file name indexation
-    // Indexx: set by DbIndexer::processone    
+    // Index: set by DbIndexer::processone    
    string utf8fn; 

    // Internal path for multi-doc files. Ascii
@ -78,11 +78,13 @@ class Doc {
    // handler. If a fieldname-to-prefix translation exists, the
    // terms in the value will be indexed with a prefix.
    // Only some predefined fields are stored in the data record:
-    // "title", "keywords", "abstract", "author"
+    // "title", "keywords", "abstract". "author" is no longer one of them:
+    // a field is also stored if it is in the "stored" configuration list.
    map<string, string> meta; 

    // Attribute for the "abstract" entry. true if it is just the top
-    // of doc, not a native document attribute.
+    // of doc, not a native document attribute. Not stored directly, but
+    // as an indicative prefix at the beginning of the abstract (ugly hack)
    bool   syntabs;      
    
    // File size. Index: Set by caller prior to Db::Add. Query: set by
@ -110,7 +112,7 @@ class Doc {
    // and indexed
    string text; 

-    int pc; // used by sortseq, convenience
+    int pc; // relevancy percentage, used by sortseq, convenience
    unsigned long xdocid; // Opaque: rcldb doc identifier.

    ///////////////////////////////////////////////////////////////////
@ -132,6 +134,12 @@ class Doc {
 	pc = 0;
 	xdocid = 0;
    }
+    static const string keyfn;
+    static const string keyrr;
+    static const string keyabs;
+    static const string keyau;
+    static const string keytt;
+    static const string keykw;
 };


--- a/src/recollinstall.in
+++ b/src/recollinstall.in
@ -97,6 +97,7 @@ ${INSTALL} -m 0444 \
 	   sampleconf/mimeview \
 	   sampleconf/recoll.conf \
           sampleconf/mimemap \
+           sampleconf/fields \
 	   ${datadir}/recoll/examples/ || exit 1
 ${INSTALL} -m 0755 index/rclmon.sh ${datadir}/recoll/examples/ || exit 1

--- a/src/sampleconf/fields
+++ b/src/sampleconf/fields
@ -0,0 +1,55 @@
+# @(#$Id: fields,v 1.1 2008-09-08 16:49:10 dockes Exp $  (C) 2007 J.F.Dockes
+# Field names configuration. This defines how one may search, e.g. for
+# author:Hemingway
+# Important: 
+#   - the field names MUST be all lowercase here. They can use any case
+#     in the documents.
+
+#####################################################
+# This section defines what prefix the terms inside named fields will be
+# indexed with (in addition to prefix-less indexing for general search)
+# ALL prefixes MUST be all UPPERCASE. Extension prefixes begin with X 
+# 
+# The choice of field names is rather arbitrary. Use of any of the aliases
+# defined in the [aliases] section below will yield exactly the same results
+# (both for indexing and search).
+[prefixes]
+
+# Native fields matching omega uses, which we index without an X first
+# letter. Don't change these
+title = S
+author = A
+keyword = K
+
+# Extension examples. These are actually used by default by Recoll:
+ext = XE
+filename = XSFN
+
+############################
+# Some fields are stored in the document data record inside the index and
+# can be returned in result lists. Stored fields do not need to be indexed
+# (have a prefix). Example: url, although this one is stored by default and
+# does not need to be listed here.
+#
+# Some fields are stored by default; don't add them here, or they will be
+# stored twice: title, keywords, abstract, filename, mimetype, url.
+# "author" used to be stored by default; it is now set here as optional.
+[stored]
+stored = author
+
+##########################
+# This section defines field name aliases or synonyms. Any right-hand side
+# value will be turned into the lhs canonical name before further processing
+[aliases]
+title = caption subject
+author = creator
+keyword = keywords tag tags
+dmtime = date contentmodified datemodified
+mtype = type mimetype contenttype
+ext = fileextension
+
+#########################
+# This section defines a hierarchy for field names. Searching for a lhs
+# ancestor will be expanded to a search for itself and all rhs descendants
+[specialisations]
+author = from