From 18b3573358e3e66ee5aeed8e962f9090930fb8c8 Mon Sep 17 00:00:00 2001 From: dockes Date: Mon, 18 Jun 2007 13:04:15 +0000 Subject: [PATCH] implement dynamic field name to prefix translation, query side --- src/common/rclconfig.cpp | 11 +++++++++- src/common/rclconfig.h | 46 +++++++++++++++++++++++---------------- src/rcldb/rcldb.cpp | 47 ++++++++++++++++++++++++++++++++++++++-- src/rcldb/rcldb.h | 4 +++- src/rcldb/searchdata.cpp | 35 ++++-------------------------- src/sampleconf/mimeconf | 39 +++++++++++++++++++++++---------- 6 files changed, 116 insertions(+), 66 deletions(-) diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index 0c7b0c72..d6aaeba6 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.45 2007-06-08 12:31:54 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.46 2007-06-18 13:04:14 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -373,6 +373,15 @@ string RclConfig::getMimeHandlerDef(const std::string &mtype) return hs; } +string RclConfig::getFieldPrefix(const string& fld) +{ + string hs; + if (!mimeconf->get(fld, hs, "prefixes")) { + LOGDEB(("getFieldPrefix: no prefix defined for '%s'\n", fld.c_str())); + } + return hs; +} + string RclConfig::getMimeViewerDef(const string &mtype) { string hs; diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index aa6a1ec5..181f19e8 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -16,7 +16,7 @@ */ #ifndef _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_ -/* @(#$Id: rclconfig.h,v 1.33 2007-06-08 16:47:19 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rclconfig.h,v 1.34 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -35,15 +35,21 @@ using std::pair; class RclConfig { public: + // Constructor: we normally look for a configuration file, except 
+ // if this was specified on the command line and passed through + // argcnf RclConfig(const string *argcnf = 0); - // Main programs should implement this, it avoids having to carry + + // Main programs must implement this, it avoids having to carry // the configuration parameter everywhere. Places where several - // instances might be needed will take care of themselves. + // RclConfig instances might be needed will take care of + // themselves. static RclConfig* getMainConfig(); bool ok() {return m_ok;} const string &getReason() {return m_reason;} - /** Return the directory where this config is stored */ + + /** Return the directory where this configuration is stored */ string getConfDir() {return m_confdir;} /** Set current directory reference, and fetch automatic parameters. */ @@ -113,30 +119,32 @@ class RclConfig { */ bool getUncompressor(const string &mtpe, list& cmd); - /** Use mimemap to compute mimetype */ + /** mimemap: compute mimetype */ string getMimeTypeFromSuffix(const string &suffix); - - /** Get appropriate suffix for mime type. This is inefficient */ + /** mimemap: get a list of all indexable mime types defined */ + list getAllMimeTypes(); + /** mimemap: Get appropriate suffix for mime type. 
This is inefficient */ string getSuffixFromMimeType(const string &mt); - /** Get input filter from mimeconf for mimetype */ + /** mimeconf: get input filter for mimetype */ string getMimeHandlerDef(const string &mimetype); - /** Get external viewer exec string from mimeconf for mimetype */ + /** mimeconf: get icon name for mimetype */ + string getMimeIconName(const string &mtype, string *path = 0); + + /** mimeconf: get list of file categories */ + bool getMimeCategories(list&); + /** mimeconf: get list of mime types for category */ + bool getMimeCatTypes(const string& cat, list&); + + /** mimeconf: get field prefix from field name */ + string getFieldPrefix(const string& fldname); + + /** mimeview: get/set external viewer exec string(s) for mimetype(s) */ string getMimeViewerDef(const string &mimetype); bool getMimeViewerDefs(vector >&); bool setMimeViewerDef(const string& mimetype, const string& cmd); - /** Get icon name from mimeconf for mimetype */ - string getMimeIconName(const string &mtype, string *path = 0); - - /** Get list of file categories from mimeconf */ - bool getMimeCategories(list&); - /** Get list of mime types for category from mimeconf */ - bool getMimeCatTypes(const string& cat, list&); - - /** Get a list of all indexable mime types defined in mimemap */ - list getAllMimeTypes(); /** Find exec file for external filter. 
cmd is the command name from the * command string returned by getMimeHandlerDef */ diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 24b65b0c..345fdab8 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.113 2007-06-14 08:20:13 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.114 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -740,6 +740,43 @@ bool Db::isopen() return m_ndb->m_isopen; } +// Try to translate field specification into field prefix. We have a +// default table used if translations are not in the config for some +// reason (old config not updated ?). We use it only if the config +// translation fails +string Db::fieldToPrefix(const string& fldname) +{ + // This is the default table + static map<string, string> fldToPrefs; + if (fldToPrefs.empty()) { + fldToPrefs["title"] = "S"; + fldToPrefs["caption"] = "S"; + fldToPrefs["subject"] = "S"; + + fldToPrefs["author"] = "A"; + fldToPrefs["creator"] = "A"; + fldToPrefs["from"] = "A"; + + fldToPrefs["keyword"] = "K"; + fldToPrefs["tag"] = "K"; + fldToPrefs["keywords"] = "K"; + fldToPrefs["tags"] = "K"; + } + + string fld(fldname), pfx; + stringtolower(fld); + RclConfig *config = RclConfig::getMainConfig(); + if (config) + pfx = config->getFieldPrefix(fld); + if (pfx.empty()) { + map<string, string>::const_iterator it = fldToPrefs.find(fld); + if (it != fldToPrefs.end()) + pfx = it->second; + } + return pfx; +} + + // The text splitter callback class which receives words from the // splitter and adds postings to the Xapian document. 
class mySplitterCB : public TextSplitCB { @@ -882,7 +919,13 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp) TextSplit splitter(&splitData); - // /////// Split and index terms in document body and auxiliary fields + // Index the title, document text, keywords and other textual + // metadata. These are all indexed as text with positions, as we + // may want to do phrase searches with them (this makes no sense + // for keywords by the way, but wtf). + // + // The order has no importance, and we set a position gap of 100 + // between fields to avoid false proximity matches. string noacc; // Split and index file name as document term(s) diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index abebc6e2..981fa2d0 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -16,7 +16,7 @@ */ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.50 2007-06-08 16:47:19 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.51 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -94,6 +94,8 @@ class Db { /** Return list of configured stop words */ const StopList& getStopList() const {return m_stops;} + /** Field name to prefix translation (ie: author -> 'A') */ + string fieldToPrefix(const string& fldname); /* Update-related methods ******************************************/ diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 19d04cc4..80e4cd51 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.14 2007-06-02 08:30:42 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.15 2007-06-18 13:04:15 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -33,6 +33,7 @@ static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.14 2007-06-02 08:30:42 dockes #include "unacpp.h" #include "utf8iter.h" #include 
"stoplist.h" +#include "rclconfig.h" #ifndef NO_NAMESPACES using namespace std; @@ -460,34 +461,6 @@ bool StringToXapianQ::processUserString(const string &iq, return true; } -// Try to translate field specification into field prefix. This should -// probably be an Rcl::Db method and much more configurable (store -// prefix translation list in config ?) -static string fieldToPrefix(const string& i_field) -{ - static map fldToPrefs; - if (fldToPrefs.empty()) { - fldToPrefs["title"] = "S"; - fldToPrefs["caption"] = "S"; - fldToPrefs["subject"] = "S"; - - fldToPrefs["author"] = "A"; - fldToPrefs["creator"] = "A"; - fldToPrefs["from"] = "A"; - - fldToPrefs["keyword"] = "K"; - fldToPrefs["tag"] = "K"; - fldToPrefs["keywords"] = "K"; - fldToPrefs["tags"] = "K"; - } - string fld(i_field); - stringtolower(fld); - map::const_iterator it = fldToPrefs.find(fld); - if (it != fldToPrefs.end()) - return it->second; - return ""; -} - static const string nullstemlang; // Translate a simple OR, AND, or EXCL search clause. @@ -514,7 +487,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, } string prefix; if (!m_field.empty()) - prefix = fieldToPrefix(m_field); + prefix = db.fieldToPrefix(m_field); list pqueries; // We normally boost the original term in the stem expansion list. Don't @@ -568,7 +541,7 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, string prefix; if (!m_field.empty()) - prefix = fieldToPrefix(m_field); + prefix = db.fieldToPrefix(m_field); // We normally boost the original term in the stem expansion list. Don't // do it if there are wildcards anywhere, this would skew the results. 
diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index e0bcfc5c..62a6d125 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -1,4 +1,4 @@ -# @(#$Id: mimeconf,v 1.28 2007-06-15 11:41:50 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimeconf,v 1.29 2007-06-18 13:04:15 dockes Exp $ (C) 2004 J.F.Dockes # Recoll : associations of mime types to processing filters. # There are different sections for decompression, 'interning' for indexing @@ -109,23 +109,38 @@ texts = \ text/rtf spreadsheets = application/vnd.ms-excel \ - application/vnd.sun.xml.calc \ - application/vnd.sun.xml.calc.template + application/vnd.sun.xml.calc \ + application/vnd.sun.xml.calc.template presentations = application/vnd.ms-powerpoint \ - application/vnd.sun.xml.impress \ - application/vnd.sun.xml.impress.template + application/vnd.sun.xml.impress \ + application/vnd.sun.xml.impress.template media = audio/mpeg \ - image/jpeg \ - image/png \ + image/jpeg \ + image/png \ messages = message/rfc822 \ - text/x-gaim-log \ - text/x-mail \ + text/x-gaim-log \ + text/x-mail \ other = application/vnd.sun.xml.draw \ - application/vnd.sun.xml.draw.template \ - application/vnd.sun.xml.math \ - application/x-fsdirectory + application/vnd.sun.xml.draw.template \ + application/vnd.sun.xml.math \ + application/x-fsdirectory + +[prefixes] + +title = S +caption = S +subject = S + +author = A +creator = A +from = A + +keyword = K +tag = K +keywords = K +tags = K