From e0517a7d13ea42e7cee5675a8b1d92574f28c8ce Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 25 Mar 2015 11:48:59 +0100 Subject: [PATCH] Moved mimemap variable recoll_noindex to recoll.conf noContentSuffixes --- src/common/rclconfig.cpp | 25 ++++-- src/common/rclconfig.h | 4 +- src/doc/user/usermanual.xml | 123 ++++++++++++++++++++++++----- src/qtgui/confgui/confguiindex.cpp | 13 ++- src/sampleconf/mimemap | 15 +--- src/sampleconf/recoll.conf.in | 14 ++++ 6 files changed, 155 insertions(+), 39 deletions(-) diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index 531d4c64..bc8e32bb 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -114,6 +114,7 @@ void RclConfig::zeroMe() { m_stopsuffixes = 0; m_maxsufflen = 0; + m_oldstpsuffstate.init(0); m_stpsuffstate.init(0); m_skpnstate.init(0); m_rmtstate.init(0); @@ -130,7 +131,8 @@ bool RclConfig::isDefaultConfig() const } RclConfig::RclConfig(const string *argcnf) - : m_stpsuffstate(this, "recoll_noindex"), + : m_oldstpsuffstate(this, "recoll_noindex"), + m_stpsuffstate(this, "noContentSuffixes"), m_skpnstate(this, "skippedNames"), m_rmtstate(this, "indexedmimetypes"), m_xmtstate(this, "excludedmimetypes"), @@ -282,7 +284,8 @@ RclConfig::RclConfig(const string *argcnf) m_ok = true; setKeyDir(cstr_null); - m_stpsuffstate.init(mimemap); + m_oldstpsuffstate.init(mimemap); + m_stpsuffstate.init(m_conf); m_skpnstate.init(m_conf); m_rmtstate.init(m_conf); m_xmtstate.init(m_conf); @@ -605,17 +608,24 @@ typedef multiset SuffixStore; bool RclConfig::inStopSuffixes(const string& fni) { LOGDEB2(("RclConfig::inStopSuffixes(%s)\n", fni.c_str())); - // Beware: needrecompute() needs to be called always. 2nd test stays back. 
- if (m_stpsuffstate.needrecompute() || m_stopsuffixes == 0) { + // Beware: both needrecompute() must always be called, so the second one + // is evaluated first in the || below (short-circuit cannot skip it) + bool needrecompute = m_stpsuffstate.needrecompute(); + needrecompute = m_oldstpsuffstate.needrecompute() || needrecompute; + if (needrecompute || m_stopsuffixes == 0) { // Need to initialize the suffixes delete STOPSUFFIXES; if ((m_stopsuffixes = new SuffixStore) == 0) { LOGERR(("RclConfig::inStopSuffixes: out of memory\n")); return false; } - list stoplist; + vector stoplist; stringToStrings(m_stpsuffstate.savedvalue, stoplist); - for (list::const_iterator it = stoplist.begin(); + vector ostoplist; + stringToStrings(m_oldstpsuffstate.savedvalue, ostoplist); + stoplist.reserve(stoplist.size() + ostoplist.size()); + stoplist.insert(stoplist.end(), ostoplist.begin(), ostoplist.end()); + for (vector::const_iterator it = stoplist.begin(); it != stoplist.end(); it++) { STOPSUFFIXES->insert(SfString(stringtolower(*it))); if (m_maxsufflen < it->length()) @@ -1461,7 +1471,8 @@ void RclConfig::initFrom(const RclConfig& r) m_maxsufflen = r.m_maxsufflen; m_defcharset = r.m_defcharset; - m_stpsuffstate.init(mimemap); + m_oldstpsuffstate.init(mimemap); + m_stpsuffstate.init(m_conf); m_skpnstate.init(m_conf); m_rmtstate.init(m_conf); m_xmtstate.init(m_conf); diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index b9db0d20..afcd0cf6 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -81,7 +81,8 @@ class RclConfig { RclConfig(const string *argcnf = 0); RclConfig(const RclConfig &r) - : m_stpsuffstate(this, "recoll_noindex"), + : m_oldstpsuffstate(this, "recoll_noindex"), + m_stpsuffstate(this, "noContentSuffixes"), m_skpnstate(this, "skippedNames"), m_rmtstate(this, "indexedmimetypes"), m_xmtstate(this, "excludedmimetypes"), @@ -334,6 +335,7 @@ class RclConfig { void *m_stopsuffixes; unsigned int m_maxsufflen; + ParamStale m_oldstpsuffstate; // Values from user mimemap, now obsolete ParamStale
m_stpsuffstate; ParamStale m_skpnstate; diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index e3f1f788..ea3e5f40 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -3,7 +3,7 @@ Recoll"> http://www.recoll.org/features.html"> - + Xapian"> ]> @@ -22,7 +22,7 @@ - 2005-2014 + 2005-2015 Jean-Francois Dockes @@ -274,6 +274,14 @@ aspects of the indexing processes and configuration, with links to detailed sections. + Depending on your data, temporary files may be needed during + indexing, some of them possibly quite big. You can use the + RECOLL_TMPDIR or TMPDIR environment + variables to determine where they are created (the default is to + use /tmp). Using TMPDIR has + the nice property that it may also be taken into account by + auxiliary commands executed by recollindex. + Indexing modes @@ -5122,6 +5130,73 @@ except: + + Environment variables + + + + RECOLL_CONFDIR + Defines the main configuration + directory. + + + + RECOLL_TMPDIR, TMPDIR + Locations for temporary files, in this order + of priority. The default if none of these is set is to use + /tmp. Big temporary files may be created + during indexing, mostly for decompressing, and also for + processing, e.g. email attachments. + + + + RECOLL_CONFTOP, RECOLL_CONFMID + Allow adding configuration directories with + priorities below and above the user directory (see the + Configuration overview section above for details). + + + + RECOLL_EXTRA_DBS, + RECOLL_ACTIVE_EXTRA_DBS + + Help for setting up external indexes. See this paragraph for + explanations. + + + + + RECOLL_DATADIR + Defines a replacement for the default location + of Recoll data files, normally found in, e.g., + /usr/share/recoll. + + + + RECOLL_FILTERSDIR + Defines a replacement for the default location + of Recoll filters, normally found in, e.g., + /usr/share/recoll/filters. + + + + ASPELL_PROG + aspell program to use for + creating the spelling dictionary.
The result has to be + compatible with the libaspell which &RCL; + is using. + + + + VARNAME + Blabla + + + + + + The main configuration file, recoll.conf @@ -5188,12 +5263,29 @@ skippedNames = #* bin CVS Cache cache* caughtspam tmp .thumbnails .svn \ Not even the file names are indexed for patterns in this list. See the - recoll_noindex variable in - mimemap for an alternative + noContentSuffixes variable for an alternative approach which indexes the file names. + noContentSuffixes + This is a list of file name endings (not + wildcard expressions, nor dot-delimited suffixes). Only the + names of matching files will be indexed (no attempt at MIME + type identification, no decompression, no content + indexing). This can be redefined for + subdirectories, and edited from the GUI. The default value is: + +noContentSuffixes = .md5 .map \ + .o .lib .dll .a .sys .exe .com \ + .mpp .mpt .vsd \ + .img .img.gz .img.bz2 .img.xz .image .image.gz .image.bz2 .image.xz \ + .dat .bak .rdf .log.gz .log .db .msf .pid \ + ,v ~ # + + + + skippedPaths and daemSkippedPaths @@ -6049,21 +6141,14 @@ x-my-tag = mailmytag should be handled specially, which is possible because they are usually all located in one place. - mimemap also has a - recoll_noindex variable which is a list of - suffixes. Matching files will be skipped (which avoids - unnecessary decompressions or file - executions). This is partially redundant with - skippedNames in the main configuration - file, with a few differences: it will not affect directories, - it cannot be made dependant on the file-system location (it is - a configuration-wide parameter), and the file names will still - be indexed (not even the file names are indexed for patterns - in skippedNames. - recoll_noindex is used mostly for things - known to be unindexable by a given &RCL; version. Having it - there avoids cluttering the more user-oriented and locally - customized skippedNames. 
+ The recoll_noindex + mimemap variable has been moved to + recoll.conf and renamed to + noContentSuffixes, while keeping the same + function, as of &RCL; version 1.21. For older &RCL; versions, + see the documentation for noContentSuffixes + but use recoll_noindex in + mimemap. diff --git a/src/qtgui/confgui/confguiindex.cpp b/src/qtgui/confgui/confguiindex.cpp index 4cef94c3..140cc2b5 100644 --- a/src/qtgui/confgui/confguiindex.cpp +++ b/src/qtgui/confgui/confguiindex.cpp @@ -454,6 +454,17 @@ ConfSubPanelW::ConfSubPanelW(QWidget *parent, ConfNull *config, m_widgets.push_back(eexcm); gl1->addWidget(eexcm, gridy, 0); + ConfParamSLW *encs = new ConfParamSLW( + m_groupbox, + ConfLink(new ConfLinkRclRep(config, "noContentSuffixes", &m_sk)), + QObject::tr("Ignored endings"), + QObject::tr("These are file name endings for files which will be " + "indexed by name only \n(no MIME type identification " + "attempt, no decompression, no content indexing).")); + encs->setFsEncoding(true); + m_widgets.push_back(encs); + gl1->addWidget(encs, gridy++, 1); + vector args; args.push_back("-l"); ExecCmd ex; @@ -484,7 +495,7 @@ ConfSubPanelW::ConfSubPanelW(QWidget *parent, ConfNull *config, "and the value from the NLS environnement is used." ), charsets); m_widgets.push_back(e21); - gl1->addWidget(e21, gridy++, 1); + gl1->addWidget(e21, gridy++, 0); ConfParamBoolW *e3 = new ConfParamBoolW( m_groupbox, diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap index 6c24576b..4b2cddd0 100644 --- a/src/sampleconf/mimemap +++ b/src/sampleconf/mimemap @@ -164,17 +164,10 @@ .mht = application/x-mimehtml .mhtml = application/x-mimehtml -# A list of suffixes (name endings) that we don't want to touch at all. -# Having these explicitely listed speeds things up a bit by avoiding -# unneeded decompression or 'file' calls. File names still get indexed if -# indexallfilenames is set (so this is different from skippedNames).
It's a -# bit unconsistent to have it listed among the suffix translations, but no -# problem in practice. -recoll_noindex = .md5 .map \ - .o .lib .dll .a .sys .exe .com \ - .mpp .mpt .vsd \ - .dat .bak .rdf .log.gz .log .db .msf .pid \ - ,v ~ # +# Note: recoll_noindex has been obsoleted and moved to recoll.conf as +# noContentSuffixes. recoll_noindex from your personal mimemap file is +# still taken into account for now, but you should move its contents to the +# new recoll.conf variable. # Special handling of .txt files inside ~/.gaim and ~/.purple directories [~/.gaim] diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index 307211cd..83629029 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -31,6 +31,20 @@ skippedNames = #* bin CVS Cache cache* .cache caughtspam tmp \ # Explicitely adding /media/xxx to the topdirs will override this. skippedPaths = /media +# List of suffixes for which we don't try mime type identification (and +# don't uncompress or index content obviously). This complements the now +# obsoleted mimemap recoll_noindex list, which will go away in a future +# release (the move from mimemap to recoll.conf allows editing the list +# through the GUI). This is different from skippedNames because these are +# name ending matches only (not wildcard patterns), and the file name +# itself gets indexed normally. +noContentSuffixes = .md5 .map \ + .o .lib .dll .a .sys .exe .com \ + .mpp .mpt .vsd \ + .img .img.gz .img.bz2 .img.xz .image .image.gz .image.bz2 .image.xz \ + .dat .bak .rdf .log.gz .log .db .msf .pid \ + ,v ~ # + # Same for real time indexing. The idea here is that there is stuff that # you might want to initially index but not monitor. If daemSkippedPaths is # not set, the daemon uses skippedPaths.