From 89c3dfdf984dd980371185fddce0bec45f910d82 Mon Sep 17 00:00:00 2001 From: dockes Date: Mon, 8 Sep 2008 16:49:10 +0000 Subject: [PATCH] foundation work for configurable stored/indexed fields --- src/common/rclconfig.cpp | 141 ++++++++++++++++++++++++++++--- src/common/rclconfig.h | 23 ++++- src/internfile/internfile.cpp | 13 +-- src/lib/Makefile | 10 ++- src/lib/mkMake | 1 + src/python/recoll/pyrecoll.cpp | 6 +- src/python/samples/recollq.py | 2 +- src/qtgui/preview_w.cpp | 6 +- src/qtgui/reslist.cpp | 16 ++-- src/query/docseq.h | 4 +- src/query/docseqdb.cpp | 6 +- src/query/recollq.cpp | 14 ++- src/rcldb/rcldb.cpp | 150 ++++++++++++++++++++------------- src/rcldb/rcldoc.cpp | 14 +++ src/rcldb/rcldoc.h | 18 ++-- src/recollinstall.in | 1 + src/sampleconf/fields | 55 ++++++++++++ 17 files changed, 370 insertions(+), 110 deletions(-) create mode 100644 src/rcldb/rcldoc.cpp create mode 100644 src/sampleconf/fields diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index 6e97401d..6a438728 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.56 2007-12-13 06:58:21 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.57 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -109,25 +109,27 @@ RclConfig::RclConfig(const string *argcnf) m_cdirs.push_back(path_cat(m_datadir, "examples")); string cnferrloc = m_confdir + " or " + path_cat(m_datadir, "examples"); + // Read and process "recoll.conf" if (!updateMainConfig()) return; - + // Other files mimemap = new ConfStack("mimemap", m_cdirs, true); if (mimemap == 0 || !mimemap->ok()) { m_reason = string("No or bad mimemap file in: ") + cnferrloc; return; } - - mimeconf = new ConfStack("mimeconf", m_cdirs, true); + mimeconf = new ConfStack("mimeconf", m_cdirs, true); if (mimeconf == 0 || !mimeconf->ok()) 
{ m_reason = string("No/bad mimeconf in: ") + cnferrloc; return; } - mimeview = new ConfStack("mimeview", m_cdirs, true); - if (mimeconf == 0 || !mimeconf->ok()) { + mimeview = new ConfStack("mimeview", m_cdirs, true); + if (mimeview == 0 || !mimeview->ok()) { m_reason = string("No/bad mimeview in: ") + cnferrloc; return; } + if (!readFieldsConfig(cnferrloc)) + return; m_ok = true; setKeyDir(""); @@ -453,15 +455,129 @@ string RclConfig::getMimeHandlerDef(const std::string &mtype, bool filtertypes) return hs; } +// Read definitions for field prefixes, aliases, and hierarchy and arrange +// things for speed (theses are used a lot during indexing) +bool RclConfig::readFieldsConfig(const string& cnferrloc) +{ + m_fields = new ConfStack("fields", m_cdirs, true); + if (m_fields == 0 || !m_fields->ok()) { + m_reason = string("No/bad fields file in: ") + cnferrloc; + return false; + } + + // Build a direct map avoiding all indirections for field to + // prefix translation + // Add direct prefixes + listtps = m_fields->getNames("prefixes"); + for (list::const_iterator it = tps.begin(); it != tps.end();it++) { + string val; + m_fields->get(*it, val, "prefixes"); + m_fldtopref[*it] = val; + } + // Add prefixes for aliases: + tps = m_fields->getNames("aliases"); + for (list::const_iterator it = tps.begin(); it != tps.end();it++) { + string canonic = *it; // canonic name + string pfx; + map::const_iterator pit = m_fldtopref.find(canonic); + if (pit != m_fldtopref.end()) { + pfx = pit->second; + } else { + // Note: it's perfectly normal to have no prefix for the canonic + // name, this could be a stored, not indexed field + LOGDEB2(("RclConfig::readFieldsConfig: no pfx for canonic [%s]\n", + canonic.c_str())); + continue; + } + string aliases; + m_fields->get(canonic, aliases, "aliases"); + list l; + stringToStrings(aliases, l); + for (list::const_iterator ait = l.begin(); + ait != l.end(); ait++) { + m_fldtopref[*ait] = pfx; + } + } +#if 0 + for (map::const_iterator it = 
m_fldtopref.begin(); + it != m_fldtopref.end(); it++) { + LOGDEB(("RclConfig::readFieldsConfig: [%s] => [%s]\n", + it->first.c_str(), it->second.c_str())); + } +#endif + + string ss; + if (m_fields->get("stored", ss, "stored")) { + list sl; + stringToStrings(ss, sl); + for (list::const_iterator it = sl.begin(); + it != sl.end(); it++) { + LOGDEB(("Inserting [%s] in stored list\n", (*it).c_str())); + m_storedFields.insert(*it); + } + } + + return true; +} + +// Return term indexing prefix for field name (ie: "filename" -> "XSFN") bool RclConfig::getFieldPrefix(const string& fld, string &pfx) { - if (!mimeconf->get(fld, pfx, "prefixes")) { - LOGDEB(("getFieldPrefix: no prefix defined for '%s'\n", fld.c_str())); - return false; + map::const_iterator pit = m_fldtopref.find(fld); + if (pit != m_fldtopref.end()) { + pfx = pit->second; + return true; + } else { + LOGDEB1(("RclConfig::readFieldsConfig: no prefix for field [%s]\n", + fld.c_str())); + return false; + } +} + +// Return specialisations of field name for search expansion +// (ie: author->[author, from]) +bool RclConfig::getFieldSpecialisations(const string& fld, + list& children, bool top) +{ + string sclds; + children.push_back(fld); + if (m_fields->get(fld, sclds, "specialisations")) { + list clds; + stringToStrings(sclds, clds); + for (list::const_iterator it = clds.begin(); + it != clds.end(); it++) { + getFieldSpecialisations(*it, children, false); + } + } + if (top) { + children.sort(); + children.unique(); } return true; } +// +bool RclConfig::getFieldSpecialisationPrefixes(const string& fld, + list& pfxes) +{ + list clds; + getFieldSpecialisations(fld, clds); + for (list::const_iterator it = clds.begin(); + it != clds.end(); it++) { + string pfx; + if (getFieldPrefix(*it, pfx)) + pfxes.push_back(pfx); + } + pfxes.sort(); + pfxes.unique(); + return true; +} +bool RclConfig::fieldIsStored(const string& fld) +{ + set::const_iterator it = m_storedFields.find(fld); + return it != m_storedFields.end(); +} + 
string RclConfig::getMimeViewerDef(const string &mtype) { string hs; @@ -497,7 +613,7 @@ bool RclConfig::setMimeViewerDef(const string& mt, const string& def) cdirs.push_back(path_cat(m_datadir, "examples")); delete mimeview; - mimeview = new ConfStack("mimeview", cdirs, true); + mimeview = new ConfStack("mimeview", cdirs, true); if (mimeview == 0 || !mimeview->ok()) { m_reason = string("No/bad mimeview in: ") + m_confdir; return false; @@ -727,6 +843,7 @@ void RclConfig::freeAll() delete mimemap; delete mimeconf; delete mimeview; + delete m_fields; delete STOPSUFFIXES; // just in case zeroMe(); @@ -747,9 +864,9 @@ void RclConfig::initFrom(const RclConfig& r) if (r.mimemap) mimemap = new ConfStack(*(r.mimemap)); if (r.mimeconf) - mimeconf = new ConfStack(*(r.mimeconf)); + mimeconf = new ConfStack(*(r.mimeconf)); if (r.mimeview) - mimeview = new ConfStack(*(r.mimeview)); + mimeview = new ConfStack(*(r.mimeview)); if (r.m_stopsuffixes) m_stopsuffixes = new SuffixStore(*((SuffixStore*)r.m_stopsuffixes)); m_maxsufflen = r.m_maxsufflen; diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index bc6c790a..faeb6edf 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -16,19 +16,23 @@ */ #ifndef _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_ -/* @(#$Id: rclconfig.h,v 1.39 2007-11-16 14:28:52 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rclconfig.h,v 1.40 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include #include #include #include +#include +#include #ifndef NO_NAMESPACES using std::list; using std::string; using std::vector; using std::pair; using std::set; +using std::map; +using std::set; #endif #include "conftree.h" @@ -142,6 +146,14 @@ class RclConfig { /** mimeconf: get field prefix from field name */ bool getFieldPrefix(const string& fldname, string &pfx); + /** Get implied meanings for field name (ie: author->[author, from]) */ + bool getFieldSpecialisations(const string& fld, + list& childrens, bool top = 
true); + /** Get prefixes for specialisations of field name */ + bool getFieldSpecialisationPrefixes(const string& fld, + list& pfxes); + bool fieldIsStored(const string& fld); + const set& getStoredFields() {return m_storedFields;} /** mimeview: get/set external viewer exec string(s) for mimetype(s) */ string getMimeViewerDef(const string &mimetype); @@ -181,8 +193,11 @@ class RclConfig { ConfStack *m_conf; // Parsed configuration files ConfStack *mimemap; // The files don't change with keydir, - ConfStack *mimeconf; // but their content may depend on it. - ConfStack *mimeview; // + ConfStack *mimeconf; // but their content may depend on it. + ConfStack *mimeview; // + ConfStack *m_fields; + map m_fldtopref; + set m_storedFields; void *m_stopsuffixes; unsigned int m_maxsufflen; @@ -205,11 +220,13 @@ class RclConfig { mimemap = 0; mimeconf = 0; mimeview = 0; + m_fields = 0; m_stopsuffixes = 0; m_maxsufflen = 0; } /** Free data then zero pointers */ void freeAll(); + bool readFieldsConfig(const string& errloc); }; diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 84b95bf1..a3fb3b03 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.40 2008-09-05 10:36:06 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.41 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -286,13 +286,14 @@ static inline bool getKeyValue(const map& docdata, return false; } -static const string keyab("abstract"); +// These defs are for the Dijon meta array. Rcl::Doc predefined field +// names are used where appropriate. 
In some cases, Rcl::Doc names are +// used inside the Dijon metadata (ex: origcharset) static const string keyau("author"); static const string keycs("charset"); static const string keyct("content"); static const string keyds("description"); static const string keyfn("filename"); -static const string keykw("keywords"); static const string keymd("modificationdate"); static const string keymt("mimetype"); static const string keyoc("origcharset"); @@ -317,8 +318,8 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc) doc.meta[it->first] = it->second; } } - if (doc.meta[keyab].empty() && !doc.meta[keyds].empty()) { - doc.meta[keyab] = doc.meta[keyds]; + if (doc.meta[Rcl::Doc::keyabs].empty() && !doc.meta[keyds].empty()) { + doc.meta[Rcl::Doc::keyabs] = doc.meta[keyds]; doc.meta.erase(keyds); } return true; @@ -353,7 +354,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc, string& ipath) const } else { ipath += isep; } - getKeyValue(docdata, keyau, doc.meta["author"]); + getKeyValue(docdata, keyau, doc.meta[Rcl::Doc::keyau]); getKeyValue(docdata, keymd, doc.dmtime); } diff --git a/src/lib/Makefile b/src/lib/Makefile index c6e9fd96..ec15a09b 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -8,8 +8,8 @@ LIBS = librcl.a all: $(LIBS) -OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o recollq.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o -DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp 
internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp recollq.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp +OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o recollq.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o +DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp recollq.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp 
pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp librcl.a : $(DEPS) $(OBJS) unac.o ar ru librcl.a $(OBJS) unac.o @@ -71,6 +71,8 @@ pathhash.o : ../rcldb/pathhash.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/pathhash.cpp rcldb.o : ../rcldb/rcldb.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldb.cpp +rcldoc.o : ../rcldb/rcldoc.cpp + $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rcldoc.cpp rclquery.o : ../rcldb/rclquery.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/rclquery.cpp searchdata.o : ../rcldb/searchdata.cpp @@ -198,6 +200,9 @@ pathhash.dep.stamp : ../rcldb/pathhash.cpp rcldb.dep.stamp : ../rcldb/rcldb.cpp $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldb.cpp > rcldb.dep touch rcldb.dep.stamp +rcldoc.dep.stamp : ../rcldb/rcldoc.cpp + $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rcldoc.cpp > rcldoc.dep + touch rcldoc.dep.stamp rclquery.dep.stamp : ../rcldb/rclquery.cpp $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/rclquery.cpp > rclquery.dep touch rclquery.dep.stamp @@ -285,6 +290,7 @@ include wasastringtoquery.dep include wasatorcl.dep include pathhash.dep include rcldb.dep +include rcldoc.dep include rclquery.dep include searchdata.dep include stemdb.dep diff --git a/src/lib/mkMake b/src/lib/mkMake index 5d54db75..96a2d8ac 100755 --- a/src/lib/mkMake +++ b/src/lib/mkMake @@ -31,6 +31,7 @@ ${depth}/query/wasastringtoquery.cpp \ ${depth}/query/wasatorcl.cpp \ ${depth}/rcldb/pathhash.cpp \ ${depth}/rcldb/rcldb.cpp \ +${depth}/rcldb/rcldoc.cpp \ ${depth}/rcldb/rclquery.cpp \ ${depth}/rcldb/searchdata.cpp \ ${depth}/rcldb/stemdb.cpp \ diff --git a/src/python/recoll/pyrecoll.cpp b/src/python/recoll/pyrecoll.cpp index 5b894351..71669f75 100644 --- a/src/python/recoll/pyrecoll.cpp +++ b/src/python/recoll/pyrecoll.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.10 2008-08-28 15:44:37 dockes Exp $ (C) 2007 J.F.Dockes"; +static char rcsid[] = "@(#$Id: pyrecoll.cpp,v 1.11 2008-09-08 16:49:10 dockes Exp $ (C) 2007 J.F.Dockes"; #endif @@ 
-394,8 +394,8 @@ static PyGetSetDef Doc_getseters[] = { "fbytes", (void *)"fbytes"}, {"dbytes", (getter)Doc_getmeta, (setter)Doc_setmeta, "dbytes", (void *)"dbytes"}, - {"relevance", (getter)Doc_getmeta, (setter)Doc_setmeta, - "relevance", (void *)"relevance"}, + {"relevancyrating", (getter)Doc_getmeta, (setter)Doc_setmeta, + "relevance", (void *)"relevancyrating"}, {"title", (getter)Doc_getmeta, (setter)Doc_setmeta, "title", (void *)"title"}, {"keywords", (getter)Doc_getmeta, (setter)Doc_setmeta, diff --git a/src/python/samples/recollq.py b/src/python/samples/recollq.py index 0d3e49d5..a5227bea 100755 --- a/src/python/samples/recollq.py +++ b/src/python/samples/recollq.py @@ -3,7 +3,7 @@ import sys import recoll allmeta = ("title", "keywords", "abstract", "url", "mimetype", "mtime", - "ipath", "fbytes", "dbytes", "relevance") + "ipath", "fbytes", "dbytes", "relevancyrating") def dotest(db, q): diff --git a/src/qtgui/preview_w.cpp b/src/qtgui/preview_w.cpp index 4c6c92cc..29b82a0d 100644 --- a/src/qtgui/preview_w.cpp +++ b/src/qtgui/preview_w.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.35 2008-07-01 08:27:58 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: preview_w.cpp,v 1.36 2008-09-08 16:49:10 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -753,8 +753,8 @@ bool Preview::loadFileInCurrentTab(string fn, size_t sz, const Rcl::Doc &idoc, Rcl::Doc doc = idoc; - if (doc.meta["title"].empty()) - doc.meta["title"] = path_getsimple(doc.url); + if (doc.meta[Rcl::Doc::keytt].empty()) + doc.meta[Rcl::Doc::keytt] = path_getsimple(doc.url); setCurTabProps(fn, doc, docnum); diff --git a/src/qtgui/reslist.cpp b/src/qtgui/reslist.cpp index 26c251f0..45fa8566 100644 --- a/src/qtgui/reslist.cpp +++ b/src/qtgui/reslist.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: reslist.cpp,v 1.41 2008-08-26 07:33:05 dockes Exp $ (C) 2005 J.F.Dockes"; 
+static char rcsid[] = "@(#$Id: reslist.cpp,v 1.42 2008-09-08 16:49:10 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include @@ -418,7 +418,7 @@ void ResList::resultPageNext() if (percent == -1) { percent = 0; // Document not available, maybe other further, will go on. - doc.meta["abstract"] = string(tr("Unavailable document").utf8()); + doc.meta[Rcl::Doc::keyabs] = string(tr("Unavailable document").utf8()); } // Determine icon to display if any @@ -442,8 +442,8 @@ void ResList::resultPageNext() printableUrl(rclconfig->getDefCharset(), doc.url, url); // Make title out of file name if none yet - if (doc.meta["title"].empty()) { - doc.meta["title"] = path_getsimple(url); + if (doc.meta[Rcl::Doc::keytt].empty()) { + doc.meta[Rcl::Doc::keytt] = path_getsimple(url); } // Result number @@ -485,7 +485,7 @@ void ResList::resultPageNext() (doc.syntabs || prefs.queryReplaceAbstract)) { abstract = m_docSource->getAbstract(doc); } else { - abstract = doc.meta["abstract"]; + abstract = doc.meta[Rcl::Doc::keyabs]; } // No need to call escapeHtml(), plaintorich handles it list lr; @@ -520,14 +520,14 @@ void ResList::resultPageNext() subs['A'] = !richabst.empty() ? richabst + "
" : ""; subs['D'] = datebuf; subs['I'] = img_name; - subs['K'] = !doc.meta["keywords"].empty() ? - escapeHtml(doc.meta["keywords"]) + "
" : ""; + subs['K'] = !doc.meta[Rcl::Doc::keykw].empty() ? + escapeHtml(doc.meta[Rcl::Doc::keykw]) + "
" : ""; subs['L'] = linksbuf; subs['N'] = numbuf; subs['M'] = doc.mimetype; subs['R'] = perbuf; subs['S'] = sizebuf; - subs['T'] = escapeHtml(doc.meta["title"]); + subs['T'] = escapeHtml(doc.meta[Rcl::Doc::keytt]); subs['U'] = url; string formatted; diff --git a/src/query/docseq.h b/src/query/docseq.h index 0311293a..e23a83db 100644 --- a/src/query/docseq.h +++ b/src/query/docseq.h @@ -16,7 +16,7 @@ */ #ifndef _DOCSEQ_H_INCLUDED_ #define _DOCSEQ_H_INCLUDED_ -/* @(#$Id: docseq.h,v 1.13 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: docseq.h,v 1.14 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include #include @@ -70,7 +70,7 @@ class DocSequence { * The default is to return the input doc's abstract fields, but some * sequences can compute a better value (ie: docseqdb) */ virtual string getAbstract(Rcl::Doc& doc) { - return doc.meta["abstract"]; + return doc.meta[Rcl::Doc::keyabs]; } /** Get estimated total count in results */ diff --git a/src/query/docseqdb.cpp b/src/query/docseqdb.cpp index 426667bc..5435d048 100644 --- a/src/query/docseqdb.cpp +++ b/src/query/docseqdb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.4 2008-06-13 18:22:46 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: docseqdb.cpp,v 1.5 2008-09-08 16:49:10 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -62,10 +62,10 @@ int DocSequenceDb::getResCnt() string DocSequenceDb::getAbstract(Rcl::Doc &doc) { if (!m_q->whatDb()) - return doc.meta["abstract"]; + return doc.meta[Rcl::Doc::keyabs]; string abstract; m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), abstract); - return abstract.empty() ? doc.meta["abstract"] : abstract; + return abstract.empty() ? 
doc.meta[Rcl::Doc::keyabs] : abstract; } list DocSequenceDb::expand(Rcl::Doc &doc) diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp index 22ca6de3..b7510a89 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: recollq.cpp,v 1.13 2008-06-13 18:22:46 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: recollq.cpp,v 1.14 2008-09-08 16:49:10 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -59,6 +59,7 @@ static char usage [] = " -d also dump file contents\n" " -n limit the maximum number of results (0->no limit, default 2000)\n" " -b : basic. Just output urls, no mime types or titles\n" +" -m : dump the whole document meta[] array\n" ; static void Usage(void) @@ -80,6 +81,7 @@ static int op_flags; #define OPT_l 0x100 #define OPT_q 0x200 #define OPT_t 0x400 +#define OPT_m 0x800 int recollq(RclConfig **cfp, int argc, char **argv) { @@ -103,6 +105,7 @@ int recollq(RclConfig **cfp, int argc, char **argv) case 'd': op_flags |= OPT_d; break; case 'f': op_flags |= OPT_f; break; case 'l': op_flags |= OPT_l; break; + case 'm': op_flags |= OPT_m; break; case 'n': op_flags |= OPT_n; if (argc < 2) Usage(); limit = atoi(*(++argv)); if (limit <= 0) limit = INT_MAX; @@ -192,9 +195,16 @@ int recollq(RclConfig **cfp, int argc, char **argv) cout << doc.mimetype.c_str() << "\t" << "[" << doc.url.c_str() << "]" << "\t" - << "[" << doc.meta["title"].c_str() << "]" << "\t" + << "[" << doc.meta[Rcl::Doc::keytt].c_str() << "]" << "\t" << doc.fbytes.c_str() << "\tbytes" << "\t" << endl; + if (op_flags & OPT_m) { + for (map::const_iterator it = doc.meta.begin(); + it != doc.meta.end(); it++) { + cout << it->first << " = " << it->second << endl; + } + } + cout << endl; } if (op_flags & OPT_d) { string fn = doc.url.substr(7); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 9deaedc0..8be74892 100644 --- 
a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.142 2008-09-05 10:34:17 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.143 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -129,6 +129,13 @@ bool Db::Native::subDocs(const string &udi, vector& docids) return false; } +static const string keycap("caption"); +static const string keymtp("mtype"); +static const string keyfmt("fmtime"); +static const string keydmt("dmtime"); +static const string keyoc("origcharset"); +static const string keyurl("url"); + // Turn data record from db into document fields bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc, int percent) @@ -137,30 +144,37 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, ConfSimple parms(&data); if (!parms.ok()) return false; - parms.get(string("url"), doc.url); - parms.get(string("mtype"), doc.mimetype); - parms.get(string("fmtime"), doc.fmtime); - parms.get(string("dmtime"), doc.dmtime); - parms.get(string("origcharset"), doc.origcharset); - parms.get(string("caption"), doc.meta["title"]); - parms.get(string("keywords"), doc.meta["keywords"]); - parms.get(string("abstract"), doc.meta["abstract"]); - parms.get(string("author"), doc.meta["author"]); + parms.get(keyurl, doc.url); + parms.get(keymtp, doc.mimetype); + parms.get(keyfmt, doc.fmtime); + parms.get(keydmt, doc.dmtime); + parms.get(keyoc, doc.origcharset); + parms.get(keycap, doc.meta[Doc::keytt]); + parms.get(Doc::keykw, doc.meta[Doc::keykw]); + parms.get(Doc::keyabs, doc.meta[Doc::keyabs]); // Possibly remove synthetic abstract indicator (if it's there, we // used to index the beginning of the text as abstract). 
doc.syntabs = false; - if (doc.meta["abstract"].find(rclSyntAbs) == 0) { - doc.meta["abstract"] = doc.meta["abstract"].substr(rclSyntAbs.length()); + if (doc.meta[Doc::keyabs].find(rclSyntAbs) == 0) { + doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(rclSyntAbs.length()); doc.syntabs = true; } char buf[20]; sprintf(buf,"%.2f", float(percent) / 100.0); - doc.meta["relevancyrating"] = buf; + doc.meta[Doc::keyrr] = buf; parms.get(string("ipath"), doc.ipath); parms.get(string("fbytes"), doc.fbytes); parms.get(string("dbytes"), doc.dbytes); parms.get(string("sig"), doc.sig); doc.xdocid = docid; + + // Other, not predefined meta fields: + list keys = parms.getNames(string()); + for (list::const_iterator it = keys.begin(); + it != keys.end(); it++) { + if (doc.meta.find(*it) == doc.meta.end()) + parms.get(*it, doc.meta[*it]); + } return true; } @@ -680,21 +694,21 @@ bool Db::fieldToPrefix(const string& fldname, string &pfx) // This is the default table static map fldToPrefs; if (fldToPrefs.empty()) { - fldToPrefs["abstract"] = string(); + fldToPrefs[Doc::keyabs] = string(); fldToPrefs["ext"] = "XE"; fldToPrefs["filename"] = "XSFN"; fldToPrefs["title"] = "S"; - fldToPrefs["caption"] = "S"; + fldToPrefs[keycap] = "S"; fldToPrefs["subject"] = "S"; - fldToPrefs["author"] = "A"; + fldToPrefs[Doc::keyau] = "A"; fldToPrefs["creator"] = "A"; fldToPrefs["from"] = "A"; fldToPrefs["keyword"] = "K"; fldToPrefs["tag"] = "K"; - fldToPrefs["keywords"] = "K"; + fldToPrefs[Doc::keykw] = "K"; fldToPrefs["tags"] = "K"; } @@ -803,6 +817,7 @@ void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen) } static const int MB = 1024 * 1024; +static const string nc("\n\r\x0c"); // Add document in internal form to the database: index the terms in // the title abstract and body and add special terms for file name, @@ -831,35 +846,6 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc doc = idoc; - // The title, author, abstract and keywords fields are special, 
they - // get stored in the document data record. - // Truncate abstract, title and keywords to reasonable lengths. If - // abstract is currently empty, we make up one with the beginning - // of the document. This is then not indexed, but part of the doc - // data so that we can return it to a query without having to - // decode the original file. - bool syntabs = false; - // Note that the map accesses by operator[] create empty entries if they - // don't exist yet. - if (doc.meta["abstract"].empty()) { - syntabs = true; - doc.meta["abstract"] = rclSyntAbs + - neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), "\n\r"); - } else { - doc.meta["abstract"] = - neutchars(truncate_to_word(doc.meta["abstract"], m_idxAbsTruncLen), - "\n\r"); - } - if (doc.meta["title"].empty()) - doc.meta["title"] = doc.utf8fn; - doc.meta["title"] = - neutchars(truncate_to_word(doc.meta["title"], 150), "\n\r"); - doc.meta["author"] = - neutchars(truncate_to_word(doc.meta["author"], 150), "\n\r"); - doc.meta["keywords"] = - neutchars(truncate_to_word(doc.meta["keywords"], 300),"\n\r"); - - Xapian::Document newdocument; mySplitterCB splitData(newdocument, m_stops); TextSplit splitter(&splitData); @@ -882,11 +868,9 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, string pfx; for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) { if (!meta_it->second.empty()) { - if (meta_it->first == "abstract" && syntabs) - continue; if (!fieldToPrefix(meta_it->first, pfx)) { LOGDEB(("Db::add: no prefix for field [%s], no indexing\n", - meta_it->first.c_str())); + meta_it->first.c_str())); continue; } LOGDEB1(("Db::add: field [%s] pfx [%s]: [%s]\n", @@ -908,7 +892,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, else splitData.basepos += splitData.curpos + 100; - // Finally: split and index body text + // Split and index body text LOGDEB2(("Db::add: split body\n")); if (!dumb_string(doc.text, noacc)) { LOGERR(("Db::add: dumb_string failed\n")); 
@@ -958,11 +942,22 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, buf[4] = '\0'; newdocument.add_term("Y" + string(buf)); // Year (YYYY) + + ////////////////////////////////////////////////////////////////// // Document data record. omindex has the following nl separated fields: // - url // - sample // - caption (title limited to 100 chars) // - mime type + // + // The title, author, abstract and keywords fields are special, + // they always get stored in the document data + // record. Configurable other fields can be, too. + // + // We truncate stored fields abstract, title and keywords to + // reasonable lengths and suppress newlines (so that the data + // record can keep a simple syntax) + string record = "url=" + doc.url; record += "\nmtype=" + doc.mimetype; record += "\nfmtime=" + doc.fmtime; @@ -982,20 +977,55 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); record += string("\ndbytes=") + sizebuf; - if (!doc.ipath.empty()) { + if (!doc.ipath.empty()) record += "\nipath=" + doc.ipath; + + if (doc.meta[Doc::keytt].empty()) + doc.meta[Doc::keytt] = doc.utf8fn; + doc.meta[Doc::keytt] = + neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), nc); + if (!doc.meta[Doc::keytt].empty()) + record += "\n" + keycap + "=" + doc.meta[Doc::keytt]; + + doc.meta[Doc::keykw] = + neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), nc); + if (!doc.meta[Doc::keykw].empty()) + record += "\n" + Doc::keykw + "=" + doc.meta[Doc::keykw]; + + // If abstract is empty, we make up one with the beginning of the + // document. This is then not indexed, but part of the doc data so + // that we can return it to a query without having to decode the + // original file. + bool syntabs = false; + // Note that the map accesses by operator[] create empty entries if they + // don't exist yet. 
+ if (doc.meta[Doc::keyabs].empty()) { + syntabs = true; + if (!doc.text.empty()) + doc.meta[Doc::keyabs] = rclSyntAbs + + neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), nc); + } else { + doc.meta[Doc::keyabs] = + neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen), + nc); } - if (!doc.meta["title"].empty()) - record += "\ncaption=" + doc.meta["title"]; - if (!doc.meta["keywords"].empty()) - record += "\nkeywords=" + doc.meta["keywords"]; - if (!doc.meta["abstract"].empty()) - record += "\nabstract=" + doc.meta["abstract"]; - if (!doc.meta["author"].empty()) { - record += "\nauthor=" + doc.meta["author"]; + if (!doc.meta[Doc::keyabs].empty()) + record += "\n" + Doc::keyabs + "=" + doc.meta[Doc::keyabs]; + + RclConfig *config = RclConfig::getMainConfig(); + if (config) { + const set& stored = config->getStoredFields(); + for (set::const_iterator it = stored.begin(); + it != stored.end(); it++) { + if (!doc.meta[*it].empty()) { + string value = + neutchars(truncate_to_word(doc.meta[*it], 150), nc); + record += "\n" + *it + "=" + value; + } + } } record += "\n"; - LOGDEB1(("Newdocument data: %s\n", record.c_str())); + LOGDEB(("Rcl::Db::add: new doc record:\n %s\n", record.c_str())); newdocument.set_data(record); const char *fnc = udi.c_str(); diff --git a/src/rcldb/rcldoc.cpp b/src/rcldb/rcldoc.cpp new file mode 100644 index 00000000..2ace4310 --- /dev/null +++ b/src/rcldb/rcldoc.cpp @@ -0,0 +1,14 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: rcldoc.cpp,v 1.1 2008-09-08 16:49:10 dockes Exp $ (C) 2007 J.F.Dockes"; +#endif + + +#include "rcldoc.h" +namespace Rcl { +const string Doc::keyabs("abstract"); +const string Doc::keyau("author"); +const string Doc::keyfn("filename"); +const string Doc::keykw("keywords"); +const string Doc::keyrr("relevancyrating"); +const string Doc::keytt("title"); +} diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index 7705968a..d0e840ee 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -16,7 +16,7 
@@ */ #ifndef _RCLDOC_H_INCLUDED_ #define _RCLDOC_H_INCLUDED_ -/* @(#$Id: rcldoc.h,v 1.8 2008-08-26 07:33:31 dockes Exp $ (C) 2006 J.F.Dockes */ +/* @(#$Id: rcldoc.h,v 1.9 2008-09-08 16:49:10 dockes Exp $ (C) 2006 J.F.Dockes */ #include #include @@ -51,7 +51,7 @@ class Doc { // Transcoded version of the simple file name for SFN-prefixed // specific file name indexation - // Indexx: set by DbIndexer::processone + // Index: set by DbIndexer::processone string utf8fn; // Internal path for multi-doc files. Ascii @@ -78,11 +78,13 @@ class Doc { // handler. If a fieldname-to-prefix translation exists, the // terms in the value will be indexed with a prefix. // Only some predefined fields are stored in the data record: - // "title", "keywords", "abstract", "author" + // "title", "keywords", "abstract", "author", but if a field name is + // in the "stored" configuration list, it will be stored too. map meta; // Attribute for the "abstract" entry. true if it is just the top - // of doc, not a native document attribute. + // of doc, not a native document attribute. Not stored directly, but + // as an indicative prefix at the beginning of the abstract (ugly hack) bool syntabs; // File size. Index: Set by caller prior to Db::Add. Query: set by @@ -110,7 +112,7 @@ class Doc { // and indexed string text; - int pc; // used by sortseq, convenience + int pc; // relevancy percentage, used by sortseq, convenience unsigned long xdocid; // Opaque: rcldb doc identifier. 
/////////////////////////////////////////////////////////////////// @@ -132,6 +134,12 @@ class Doc { pc = 0; xdocid = 0; } + static const string keyfn; + static const string keyrr; + static const string keyabs; + static const string keyau; + static const string keytt; + static const string keykw; }; diff --git a/src/recollinstall.in b/src/recollinstall.in index 9e7d9826..b0bb1653 100755 --- a/src/recollinstall.in +++ b/src/recollinstall.in @@ -97,6 +97,7 @@ ${INSTALL} -m 0444 \ sampleconf/mimeview \ sampleconf/recoll.conf \ sampleconf/mimemap \ + sampleconf/fields \ ${datadir}/recoll/examples/ || exit 1 ${INSTALL} -m 0755 index/rclmon.sh ${datadir}/recoll/examples/ || exit 1 diff --git a/src/sampleconf/fields b/src/sampleconf/fields new file mode 100644 index 00000000..fa67be5b --- /dev/null +++ b/src/sampleconf/fields @@ -0,0 +1,55 @@ +# @(#$Id: fields,v 1.1 2008-09-08 16:49:10 dockes Exp $ (C) 2007 J.F.Dockes +# Field names configuration. This defines how one may search e.g. for +# author:Hemingway +# Important: +# - the field names MUST be all lowercase here. They can be anycased +# in the documents: + +##################################################### +# This section defines what prefix the terms inside named fields will be +# indexed with (in addition to prefix-less indexing for general search) +# ALL prefixes MUST be all UPPERCASE. Extension prefixes begin with X +# +# The choice of field names is rather arbitrary. Use of any of the aliases +# defined in the following section will yield exactly the same results, +# (both for indexing and search). +[prefixes] + +# Native fields matching omega uses, which we index without an X first +# letter. Don't change these +title = S +author = A +keyword = K + +# extension examples. These are actually used by default by Recoll: +ext = XE +filename = XSFN + +############################ +# Some fields are stored in the document data record inside the index and +# can be returned in result lists.
There is no necessity that stored fields +# should be indexed (have a prefix) (example: url but this one doesn't need +# to be listed here) +# +# Some fields are stored by default, don't add them here, else they will be +# stored twice: title, keywords, abstract, filename, mimetype, url +# "author" used to be stored by default, now set here as optional +[stored] +stored = author + +########################## +# This section defines field names aliases or synonyms. Any right hand side +# value will be turned into the lhs canonic name before further treatment +[aliases] +title = caption subject +author = creator +keyword = keywords tag tags +dmtime = date contentmodified datemodified +mtype = type mimetype contenttype +ext = fileextension + +######################### +# This section defines a hierarchy for field names. Searching for a lhs +# ancestor will be expanded to a search for itself and all rhs descendants +[specialisations] +author = from