added open-ended field name handling

2007-06-19 08:36:24 +00:00 · 2007-06-19 08:36:24 +00:00 · 0c74bd6e36
commit 0c74bd6e36
parent c4b099e8d3
15 changed files with 176 additions and 176 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.46 2007-06-18 13:04:14 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.47 2007-06-19 08:36:23 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@@ -373,13 +373,13 @@ string RclConfig::getMimeHandlerDef(const std::string &mtype)
    return hs;
 }

-string RclConfig::getFieldPrefix(const string& fld)
+bool RclConfig::getFieldPrefix(const string& fld, string &pfx)
 {
-    string hs;
-    if (!mimeconf->get(fld, hs, "prefixes")) {
+    if (!mimeconf->get(fld, pfx, "prefixes")) {
      LOGDEB(("getFieldPrefix: no prefix defined for '%s'\n", fld.c_str()));
+      return false;
    }
-    return hs;
+    return true;
 }

 string RclConfig::getMimeViewerDef(const string &mtype)
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@@ -16,7 +16,7 @@
 */
 #ifndef _RCLCONFIG_H_INCLUDED_
 #define _RCLCONFIG_H_INCLUDED_
-/* @(#$Id: rclconfig.h,v 1.34 2007-06-18 13:04:15 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rclconfig.h,v 1.35 2007-06-19 08:36:24 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <list>
 #include <string>
@@ -138,7 +138,7 @@ class RclConfig {
    bool getMimeCatTypes(const string& cat, list<string>&);

    /** mimeconf: get field prefix from field name */
-    string getFieldPrefix(const string& fldname);
+    bool getFieldPrefix(const string& fldname, string &pfx);

    /** mimeview: get/set external viewer exec string(s) for mimetype(s) */
    string getMimeViewerDef(const string &mimetype);
--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: internfile.cpp,v 1.30 2007-05-23 08:29:04 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: internfile.cpp,v 1.31 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@@ -270,12 +270,12 @@ static const string keyab("abstract");
 static const string keyau("author");
 static const string keycs("charset");
 static const string keyct("content");
+static const string keyds("description");
 static const string keyfn("filename");
 static const string keykw("keywords");
 static const string keymd("modificationdate");
 static const string keymt("mimetype");
 static const string keyoc("origcharset");
-static const string keysm("sample");
 static const string keytt("title");

 bool FileInterner::dijontorcl(Rcl::Doc& doc)
@@ -283,15 +283,24 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
    Dijon::Filter *df = m_handlers.back();
    const std::map<std::string, std::string>& docdata = df->get_meta_data();

-    getKeyValue(docdata, keyau, doc.author);
-    getKeyValue(docdata, keyoc, doc.origcharset);
-    getKeyValue(docdata, keyct, doc.text);    
-    getKeyValue(docdata, keytt, doc.title);
-    getKeyValue(docdata, keykw, doc.keywords);
-    getKeyValue(docdata, keymd, doc.dmtime);
-    if (!getKeyValue(docdata, keyab, doc.abstract))
-	getKeyValue(docdata, keysm, doc.abstract);
-    LOGDEB1(("FILENAME: %s\n", doc.utf8fn.c_str()));
+    for (map<string,string>::const_iterator it = docdata.begin(); 
+	 it != docdata.end(); it++) {
+	if (it->first == keyct) {
+	    doc.text = it->second;
+	} else if (it->first == keymd) {
+	    doc.dmtime = it->second;
+	} else if (it->first == keyoc) {
+	    doc.origcharset = it->second;
+	} else if (it->first == keymt || it->first == keycs) {
+	    // don't need these.
+	} else {
+	    doc.meta[it->first] = it->second;
+	}
+    }
+    if (doc.meta[keyab].empty() && !doc.meta[keyds].empty()) {
+	doc.meta[keyab] = doc.meta[keyds];
+	doc.meta.erase(keyds);
+    }
    return true;
 }

@@ -324,7 +333,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc, string& ipath) const
 	} else {
 	    ipath += isep;
 	}
-	getKeyValue(docdata, keyau, doc.author);
+	getKeyValue(docdata, keyau, doc.meta["author"]);
 	getKeyValue(docdata, keymd, doc.dmtime);
    }

@@ -672,7 +681,7 @@ int main(int argc, char **argv)
 	"]]]]\n-----------------------------------------------------\n" <<
 	"doc.keywords [[[[" << doc.keywords <<
 	"]]]]\n-----------------------------------------------------\n" <<
-	"doc.abstract [[[[" << doc.abstract <<
+	"doc.meta[\"abstract\"] [[[[" << doc.meta["abstract"] << // inner quotes escaped: label is one literal
 	"]]]]\n-----------------------------------------------------\n" <<
 	"doc.text [[[[" << doc.text << "]]]]\n";
 }
--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@@ -136,15 +136,16 @@ bool MimeHandlerHtml::next_document()
    m_metaData["origcharset"] = m_defcharset;
    m_metaData["content"] = result.dump;
    m_metaData["charset"] = "utf-8";
-    m_metaData["title"] = result.title;
-    m_metaData["keywords"] = result.keywords;
    // Avoid setting empty values which would crush ones possibly inherited
    // from parent (if we're an attachment)
-    if (!result.author.empty())
-	m_metaData["author"] = result.author;
    if (!result.dmtime.empty())
 	m_metaData["modificationdate"] = result.dmtime;
-    m_metaData["sample"] = result.sample;
    m_metaData["mimetype"] = "text/plain";
+
+    for (map<string,string>::const_iterator it = result.meta.begin(); 
+	 it != result.meta.end(); it++) {
+	if (!it->second.empty())
+	    m_metaData[it->first] = it->second;
+    }
    return true;
 }
--- a/src/internfile/myhtmlparse.cpp
+++ b/src/internfile/myhtmlparse.cpp
@@ -144,22 +144,7 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 		    if ((j = p.find("name")) != p.end()) {
 			string name = j->second;
 			lowercase_term(name);
-			if (name == "description") {
-			    if (sample.empty()) {
-				sample = i->second;
-				decode_entities(sample);
-			    }
-			} else if (name == "keywords") {
-			    if (!keywords.empty()) keywords += ' ';
-			    string tmp = i->second;
-			    decode_entities(tmp);
-			    keywords += tmp;
-			} else if (name == "author") {
-			    if (!author.empty()) author += ' ';
-			    string tmp = i->second;
-			    decode_entities(tmp);
-			    author += tmp;
-			} else if (name == "date") {
+			if (name == "date") {
 			    // Yes this doesnt exist. It's output by filters
 			    // And the format isn't even standard http/html
 			    // FIXME
@@ -172,7 +157,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 				sprintf(ascuxtime, "%ld", (long)mktime(&tm));
 				dmtime = ascuxtime;
 			    }
-			} 
+			} else if (name == "robots") {
+			} else {
+			    if (!meta[name].empty())
+				meta[name] += ' ';
+			    string tmp = i->second;
+			    decode_entities(tmp);
+			    meta[name] += tmp;
+			}
 		    } else if ((j = p.find("http-equiv")) != p.end()) {
 			string hequiv = j->second;
 			lowercase_term(hequiv);
@@ -309,8 +301,8 @@ MyHtmlParser::closing_tag(const string &tag)
 	    break;
 	case 't':
 	    if (tag == "title") {
-		if (title.empty()) {
-		    title = dump;
+		if (meta["title"].empty()) {
+		    meta["title"] = dump;
 		    dump = "";
 		}
 		break;
--- a/src/internfile/myhtmlparse.h
+++ b/src/internfile/myhtmlparse.h
@@ -22,6 +22,8 @@
 * USA
 * -----END-LICENCE-----
 */
+#include <map>
+using std::map;

 #include "htmlparse.h"

@@ -37,7 +39,8 @@ class MyHtmlParser : public HtmlParser {
    bool in_body_tag; 
    bool in_pre_tag;
    bool pending_space;
-    string title, sample, keywords, dump, dmtime, author;
+    map<string,string> meta;
+    string dump, dmtime;
    string ocharset; // This is the charset our user thinks the doc was
    string charset; // This is the charset it was supposedly converted to
    string doccharset; // Set this to value of charset parameter in header
--- a/src/qtgui/preview_w.cpp
+++ b/src/qtgui/preview_w.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.20 2007-06-12 13:31:38 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.21 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@@ -400,8 +400,12 @@ QTextEdit *Preview::addEditorTab()
 void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc,
 			     int docnum)
 {
-    QString title = QString::fromUtf8(doc.title.c_str(), 
-				      doc.title.length());
+    QString title;
+    map<string,string>::const_iterator meta_it;
+    if ((meta_it = doc.meta.find("title")) != doc.meta.end()) {
+	    title = QString::fromUtf8(meta_it->second.c_str(), 
+				      meta_it->second.length());
+    }
    if (title.length() > 20) {
 	title = title.left(10) + "..." + title.right(10);
    }
@@ -421,8 +425,8 @@ void Preview::setCurTabProps(const string &fn, const Rcl::Doc &doc,
    printableUrl(doc.url, url);
    string tiptxt = url + string("\n");
    tiptxt += doc.mimetype + " " + string(datebuf) + "\n";
-    if (!doc.title.empty())
-	tiptxt += doc.title + "\n";
+    if (meta_it != doc.meta.end() && !meta_it->second.empty())
+	tiptxt += meta_it->second + "\n";
    pvTab->setTabToolTip(w,QString::fromUtf8(tiptxt.c_str(), tiptxt.length()));

    for (list<TabData>::iterator it = tabData.begin(); 
@@ -607,8 +611,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc,
    Rcl::Doc doc = idoc;
    bool cancel = false;

-    if (doc.title.empty()) 
-	doc.title = path_getsimple(doc.url);
+    if (doc.meta["title"].empty()) 
+	doc.meta["title"] = path_getsimple(doc.url);

    setCurTabProps(fn, doc, docnum);

--- a/src/qtgui/reslist.cpp
+++ b/src/qtgui/reslist.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: reslist.cpp,v 1.26 2007-06-13 17:03:23 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: reslist.cpp,v 1.27 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif

 #include <time.h>
@@ -399,7 +399,7 @@ void ResList::resultPageNext()
 	if (percent == -1) {
 	    percent = 0;
 	    // Document not available, maybe other further, will go on.
-	    doc.abstract = string(tr("Unavailable document").utf8());
+	    doc.meta["abstract"] = string(tr("Unavailable document").utf8());
 	}

 	// Determine icon to display if any
@@ -426,8 +426,8 @@ void ResList::resultPageNext()
 	printableUrl(doc.url, url);

 	// Make title out of file name if none yet
-	if (doc.title.empty()) {
-	    doc.title = path_getsimple(url);
+	if (doc.meta["title"].empty()) {
+	    doc.meta["title"] = path_getsimple(url);
 	}

 	// Result number
@@ -469,7 +469,7 @@ void ResList::resultPageNext()
 	    (doc.syntabs || prefs.queryReplaceAbstract)) {
 	    abstract = m_docSource->getAbstract(doc);
 	} else {
-	    abstract = doc.abstract;
+	    abstract = doc.meta["abstract"];
 	}
 	// No need to call escapeHtml(), plaintorich handles it
 	string richabst;
@@ -505,14 +505,14 @@ void ResList::resultPageNext()
 	map<char,string> subs;
 	subs['A'] = !richabst.empty() ? richabst + "<br>" : "";
 	subs['D'] = datebuf;
-	subs['K'] = !doc.keywords.empty() ? escapeHtml(doc.keywords) + "<br>" 
-	    : "";
+	subs['K'] = !doc.meta["keywords"].empty() ? 
+	    escapeHtml(doc.meta["keywords"]) + "<br>" : "";
 	subs['L'] = linksbuf;
 	subs['N'] = numbuf;
 	subs['M'] = doc.mimetype;
 	subs['R'] = perbuf;
 	subs['S'] = sizebuf;
-	subs['T'] = escapeHtml(doc.title);
+	subs['T'] = escapeHtml(doc.meta["title"]);
 	subs['U'] = url;

 	string formatted;
--- a/src/query/docseq.h
+++ b/src/query/docseq.h
@@ -16,7 +16,7 @@
 */
 #ifndef _DOCSEQ_H_INCLUDED_
 #define _DOCSEQ_H_INCLUDED_
-/* @(#$Id: docseq.h,v 1.11 2007-01-19 15:22:50 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: docseq.h,v 1.12 2007-06-19 08:36:24 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include <list>
 #include <vector>
@@ -70,7 +70,7 @@ class DocSequence {
     *  The default is to return the input doc's abstract fields, but some 
     *  sequences can compute a better value (ie: docseqdb) */
    virtual string getAbstract(Rcl::Doc& doc) {
-	return doc.abstract;
+	return doc.meta["abstract"];
    }

    /** Get estimated total count in results */
--- a/src/query/docseqdb.cpp
+++ b/src/query/docseqdb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.2 2007-01-19 15:22:50 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@@ -42,9 +42,9 @@ int DocSequenceDb::getResCnt()
 string DocSequenceDb::getAbstract(Rcl::Doc &doc)
 {
    if (!m_db)
-	return doc.abstract;
+	return doc.meta["abstract"];
    string abstract;
    m_db->makeDocAbstract(doc, abstract);
-    return abstract.empty() ? doc.abstract : abstract;
+    return abstract.empty() ? doc.meta["abstract"] : abstract;
 }

--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.114 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.115 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@@ -200,14 +200,14 @@ bool Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc)
    parms.get(string("fmtime"), doc.fmtime);
    parms.get(string("dmtime"), doc.dmtime);
    parms.get(string("origcharset"), doc.origcharset);
-    parms.get(string("caption"), doc.title);
-    parms.get(string("keywords"), doc.keywords);
-    parms.get(string("abstract"), doc.abstract);
+    parms.get(string("caption"), doc.meta["title"]);
+    parms.get(string("keywords"), doc.meta["keywords"]);
+    parms.get(string("abstract"), doc.meta["abstract"]);
    // Possibly remove synthetic abstract indicator (if it's there, we
    // used to index the beginning of the text as abstract).
    doc.syntabs = false;
-    if (doc.abstract.find(rclSyntAbs) == 0) {
-	doc.abstract = doc.abstract.substr(rclSyntAbs.length());
+    if (doc.meta["abstract"].find(rclSyntAbs) == 0) {
+	doc.meta["abstract"] = doc.meta["abstract"].substr(rclSyntAbs.length());
 	doc.syntabs = true;
    }
    parms.get(string("ipath"), doc.ipath);
@@ -743,12 +743,15 @@ bool Db::isopen()
 // Try to translate field specification into field prefix.  We have a
 // default table used if translations are not in the config for some
 // reason (old config not updated ?). We use it only if the config
-// translation fails
-string Db::fieldToPrefix(const string& fldname)
+// translation fails. Also we add in there fields which should be
+// indexed with no prefix (ie: abstract)
+bool Db::fieldToPrefix(const string& fldname, string &pfx)
 {
    // This is the default table
    static map<string, string> fldToPrefs;
    if (fldToPrefs.empty()) {
+	fldToPrefs["abstract"] = "";
+
 	fldToPrefs["title"] = "S";
 	fldToPrefs["caption"] = "S";
 	fldToPrefs["subject"] = "S";
@@ -763,17 +766,19 @@ string Db::fieldToPrefix(const string& fldname)
 	fldToPrefs["tags"] = "K";
    }

-    string fld(fldname), pfx;
+    string fld(fldname);
    stringtolower(fld);
+
    RclConfig *config = RclConfig::getMainConfig();
-    if (config)
-	pfx = config->getFieldPrefix(fld);
-    if (pfx.empty()) {
-	map<string, string>::const_iterator it = fldToPrefs.find(fld);
-	if (it != fldToPrefs.end())
-	    fld = it->second;
+    if (config && config->getFieldPrefix(fld, pfx))
+	return true;
+
+    map<string, string>::const_iterator it = fldToPrefs.find(fld);
+    if (it != fldToPrefs.end()) {
+	pfx = it->second;
+	return true;
    }
-    return pfx;
+    return false;
 }


@@ -880,11 +885,12 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
    LOGDEB1(("Db::add: fn %s\n", fn.c_str()));
    if (m_ndb == 0)
 	return false;
-
+    static int first = 1;
    // Check file system full every mbyte of indexed text.
-    if (m_maxFsOccupPc > 0 && (m_curtxtsz - m_occtxtsz) / MB >= 1) {
+    if (m_maxFsOccupPc > 0 && (first || (m_curtxtsz - m_occtxtsz) / MB >= 1)) {
 	LOGDEB(("Db::add: checking file system usage\n"));
 	int pc;
+	first = 0;
 	if (fsocc(m_basedir, &pc) && pc >= m_maxFsOccupPc) {
 	    LOGERR(("Db::add: stop indexing: file system "
 		     "%d%% full > max %d%%\n", pc, m_maxFsOccupPc));
@@ -895,37 +901,38 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)

    Doc doc = idoc;

+    // The title, author, abstract and keywords fields are special, they
+    // get stored in the document data record.
    // Truncate abstract, title and keywords to reasonable lengths. If
    // abstract is currently empty, we make up one with the beginning
    // of the document. This is then not indexed, but part of the doc
    // data so that we can return it to a query without having to
    // decode the original file.
    bool syntabs = false;
-    if (doc.abstract.empty()) {
+    // Note that the map accesses by operator[] create empty entries if they
+    // don't exist yet.
+    if (doc.meta["abstract"].empty()) {
 	syntabs = true;
-	doc.abstract = rclSyntAbs + 
-	    truncate_to_word(doc.text, m_idxAbsTruncLen);
+	doc.meta["abstract"] = rclSyntAbs + 
+	    neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), "\n\r");
    } else {
-	doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
+	doc.meta["abstract"] = 
+	    neutchars(truncate_to_word(doc.meta["abstract"], m_idxAbsTruncLen),
+		      "\n\r");
    }
-    doc.abstract = neutchars(doc.abstract, "\n\r");
-    doc.title = neutchars(truncate_to_word(doc.title, 150), "\n\r");
-    doc.author = neutchars(truncate_to_word(doc.author, 150), "\n\r");
-    doc.keywords = neutchars(truncate_to_word(doc.keywords, 300), "\n\r");
+    if (doc.meta["title"].empty()) // no title set: use the utf8fn value instead
+	doc.meta["title"] = doc.utf8fn;
+    doc.meta["title"] = 
+	neutchars(truncate_to_word(doc.meta["title"], 150), "\n\r");
+    doc.meta["author"] = 
+	neutchars(truncate_to_word(doc.meta["author"], 150), "\n\r");
+    doc.meta["keywords"] = 
+	neutchars(truncate_to_word(doc.meta["keywords"], 300),"\n\r");
+

    Xapian::Document newdocument;
-
    mySplitterCB splitData(newdocument, m_stops);
-
    TextSplit splitter(&splitData);
-
-    // Index the title, document text, keywords and other textual
-    // metadata.  These are all indexed as text with positions, as we
-    // may want to do phrase searches with them (this makes no sense
-    // for keywords by the way, but wtf).
-    //
-    // The order has no importance, and we set a position gap of 100
-    // between fields to avoid false proximity matches.
    string noacc;

    // Split and index file name as document term(s)
@@ -935,35 +942,39 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
 	splitData.basepos += splitData.curpos + 100;
    }

-    // Split and index title. If title is empty here, use file name
-    if (doc.title.empty())
-	doc.title = doc.utf8fn;
-    if (!doc.title.empty()) {
-	LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str()));
-	if (!dumb_string(doc.title, noacc)) {
-	    LOGERR(("Db::add: dumb_string failed\n"));
-	    return false;
+    // Index textual metadata.  These are all indexed as text with
+    // positions, as we may want to do phrase searches with them (this
+    // makes no sense for keywords by the way).
+    //
+    // The order has no importance, and we set a position gap of 100
+    // between fields to avoid false proximity matches.
+    map<string,string>::iterator meta_it;
+    string pfx;
+    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
+	if (!meta_it->second.empty()) {
+	    if (meta_it->first == "abstract" && syntabs)
+		continue;
+	    if (!fieldToPrefix(meta_it->first, pfx)) {
+		LOGDEB(("Db::add: no prefix for field [%s], no indexing\n",
+			meta_it->first.c_str()));
+		continue;
+	    }
+	    LOGDEB(("Db::add: field [%s] pfx [%s]: [%s]\n", 
+		    meta_it->first.c_str(), pfx.c_str(), 
+		    meta_it->second.c_str()));
+	    if (!dumb_string(meta_it->second, noacc)) {
+		LOGERR(("Db::add: dumb_string failed\n"));
+		return false;
+	    }
+	    splitData.setprefix(pfx); // Prefix looked up for this field
+	    splitter.text_to_words(noacc);
+	    splitData.setprefix(emptystring);
+	    splitData.basepos += splitData.curpos + 100;
 	}
-	splitData.setprefix("S"); // Subject
-	splitter.text_to_words(noacc);
-	splitData.setprefix(emptystring);
-	splitData.basepos += splitData.curpos + 100;
    }

-    // Split and index author
-    if (!doc.author.empty()) {
-	LOGDEB2(("Db::add: split author [%s]\n", doc.author.c_str()));
-	if (!dumb_string(doc.author, noacc)) {
-	    LOGERR(("Db::add: dumb_string failed\n"));
-	    return false;
-	}
-	splitData.setprefix("A"); 
-	splitter.text_to_words(noacc);
-	splitData.setprefix(emptystring);
-	splitData.basepos += splitData.curpos + 100;
-    }

-    // Split and index body
+    // Split and index body text
    LOGDEB2(("Db::add: split body\n"));
    if (!dumb_string(doc.text, noacc)) {
 	LOGERR(("Db::add: dumb_string failed\n"));
@@ -972,36 +983,8 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
    splitter.text_to_words(noacc);
    splitData.basepos += splitData.curpos + 100;

-    // Split and index keywords
-    if (!doc.keywords.empty()) {
-	LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str()));
-	if (!dumb_string(doc.keywords, noacc)) {
-	    LOGERR(("Db::add: dumb_string failed\n"));
-	    return false;
-	}
-	splitData.setprefix("K");
-	splitter.text_to_words(noacc);
-	splitData.setprefix(emptystring);
-	splitData.basepos += splitData.curpos + 100;
-    }

-    // Split and index abstract. We don't do this if it is synthetic
-    // any more (this used to give a relevance boost to the beginning
-    // of text, why ?)
-    LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str()));
-    if (!syntabs) {
-	// syntabs indicator test kept here in case we want to go back
-	// to indexing synthetic abstracts one day
-	if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) : 
-			 doc.abstract, noacc)) {
-	    LOGERR(("Db::add: dumb_string failed\n"));
-	    return false;
-	}
-	splitter.text_to_words(noacc);
-    }
-    splitData.basepos += splitData.curpos + 100;
-
-    ////// Special terms for metadata
+    ////// Special terms for other metadata. No positions for these.
    // Mime type
    newdocument.add_term("T" + doc.mimetype);

@@ -1075,11 +1058,14 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
    if (!doc.ipath.empty()) {
 	record += "\nipath=" + doc.ipath;
    }
-    record += "\ncaption=" + doc.title;
-    record += "\nkeywords=" + doc.keywords;
-    record += "\nabstract=" + doc.abstract;
-    if (!doc.author.empty()) {
-	record += "\nauthor=" + doc.author;
+    if (!doc.meta["title"].empty())
+	record += "\ncaption=" + doc.meta["title"];
+    if (!doc.meta["keywords"].empty())
+	record += "\nkeywords=" + doc.meta["keywords"];
+    if (!doc.meta["abstract"].empty())
+	record += "\nabstract=" + doc.meta["abstract"];
+    if (!doc.meta["author"].empty()) {
+	record += "\nauthor=" + doc.meta["author"];
    }
    record += "\n";
    LOGDEB1(("Newdocument data: %s\n", record.c_str()));
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -16,7 +16,7 @@
 */
 #ifndef _DB_H_INCLUDED_
 #define _DB_H_INCLUDED_
-/* @(#$Id: rcldb.h,v 1.51 2007-06-18 13:04:15 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rcldb.h,v 1.52 2007-06-19 08:36:24 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
 #include <list>
@@ -95,7 +95,7 @@ class Db {
    const StopList& getStopList() const {return m_stops;}

    /** Field name to prefix translation (ie: author -> 'A') */
-    string fieldToPrefix(const string& fldname);
+    bool fieldToPrefix(const string& fldname, string &pfx);

    /* Update-related methods ******************************************/

--- a/src/rcldb/rcldoc.h
+++ b/src/rcldb/rcldoc.h
@@ -16,12 +16,14 @@
 */
 #ifndef _RCLDOC_H_INCLUDED_
 #define _RCLDOC_H_INCLUDED_
-/* @(#$Id: rcldoc.h,v 1.2 2007-01-17 13:53:41 dockes Exp $  (C) 2006 J.F.Dockes */
+/* @(#$Id: rcldoc.h,v 1.3 2007-06-19 08:36:24 dockes Exp $  (C) 2006 J.F.Dockes */

 #include <string>
+#include <map>

 #ifndef NO_NAMESPACES
 using std::string;
+using std::map;
 namespace Rcl {
 #endif

@@ -47,12 +49,16 @@ class Doc {
                         // Possibly set by handler
    string origcharset;  // Charset we transcoded from (in case we want back)
                         // Possibly set by handler
-    string title;        // Possibly set by handler
-    string author;       // Possibly set by handler
-    string keywords;     // Possibly set by handler
-    string abstract;     // Possibly set by handler
-    bool   syntabs;      // true if abstract is just the top of doc, not an 
-                         // explicit document attribute
+
+    // A map for textual metadata such as author, keywords, abstract, title.
+    // Entries are possibly set by the handler. If a field-name to prefix
+    // translation exists, the terms will be indexed with a prefix.
+    map<string, string> meta; 
+
+    // Attribute for the "abstract" entry. true if it is just the top
+    // of doc, not a native document attribute
+    bool   syntabs;      
+
    string fbytes;       // File size. Set by Db::Add
    string dbytes;       // Doc size. Set by Db::Add from text length

@@ -72,9 +78,7 @@ class Doc {
 	fmtime.erase();
 	dmtime.erase();
 	origcharset.erase();
-	title.erase();
-	keywords.erase();
-	abstract.erase();
+	meta.clear();
 	syntabs = false;
 	fbytes.erase();
 	dbytes.erase();
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.15 2007-06-18 13:04:15 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.16 2007-06-19 08:36:24 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@@ -487,7 +487,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
    }
    string prefix;
    if (!m_field.empty())
-	prefix = db.fieldToPrefix(m_field);
+	db.fieldToPrefix(m_field, prefix);
    list<Xapian::Query> pqueries;

    // We normally boost the original term in the stem expansion list. Don't
@@ -541,7 +541,7 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,

    string prefix;
    if (!m_field.empty())
-	prefix = db.fieldToPrefix(m_field);
+	db.fieldToPrefix(m_field, prefix);

    // We normally boost the original term in the stem expansion list. Don't
    // do it if there are wildcards anywhere, this would skew the results.
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@@ -1,4 +1,4 @@
-# @(#$Id: mimeconf,v 1.29 2007-06-18 13:04:15 dockes Exp $  (C) 2004 J.F.Dockes
+# @(#$Id: mimeconf,v 1.30 2007-06-19 08:36:24 dockes Exp $  (C) 2004 J.F.Dockes

 # Recoll : associations of mime types to processing filters.
 # There are different sections for decompression, 'interning' for indexing
@@ -144,3 +144,4 @@ keyword = K
 tag = K
 keywords = K
 tags = K
+