Moved mimemap variable recoll_noindex to recoll.conf noContentSuffixes

This commit is contained in:
Jean-Francois Dockes 2015-03-25 11:48:59 +01:00
parent 34e1a25d31
commit e0517a7d13
6 changed files with 155 additions and 39 deletions

View File

@ -114,6 +114,7 @@ void RclConfig::zeroMe() {
m_stopsuffixes = 0; m_stopsuffixes = 0;
m_maxsufflen = 0; m_maxsufflen = 0;
m_oldstpsuffstate.init(0);
m_stpsuffstate.init(0); m_stpsuffstate.init(0);
m_skpnstate.init(0); m_skpnstate.init(0);
m_rmtstate.init(0); m_rmtstate.init(0);
@ -130,7 +131,8 @@ bool RclConfig::isDefaultConfig() const
} }
RclConfig::RclConfig(const string *argcnf) RclConfig::RclConfig(const string *argcnf)
: m_stpsuffstate(this, "recoll_noindex"), : m_oldstpsuffstate(this, "recoll_noindex"),
m_stpsuffstate(this, "noContentSuffixes"),
m_skpnstate(this, "skippedNames"), m_skpnstate(this, "skippedNames"),
m_rmtstate(this, "indexedmimetypes"), m_rmtstate(this, "indexedmimetypes"),
m_xmtstate(this, "excludedmimetypes"), m_xmtstate(this, "excludedmimetypes"),
@ -282,7 +284,8 @@ RclConfig::RclConfig(const string *argcnf)
m_ok = true; m_ok = true;
setKeyDir(cstr_null); setKeyDir(cstr_null);
m_stpsuffstate.init(mimemap); m_oldstpsuffstate.init(mimemap);
m_stpsuffstate.init(m_conf);
m_skpnstate.init(m_conf); m_skpnstate.init(m_conf);
m_rmtstate.init(m_conf); m_rmtstate.init(m_conf);
m_xmtstate.init(m_conf); m_xmtstate.init(m_conf);
@ -605,17 +608,24 @@ typedef multiset<SfString, SuffCmp> SuffixStore;
bool RclConfig::inStopSuffixes(const string& fni) bool RclConfig::inStopSuffixes(const string& fni)
{ {
LOGDEB2(("RclConfig::inStopSuffixes(%s)\n", fni.c_str())); LOGDEB2(("RclConfig::inStopSuffixes(%s)\n", fni.c_str()));
// Beware: needrecompute() needs to be called always. 2nd test stays back. // Beware: both needrecompute() need to be called always hence the
if (m_stpsuffstate.needrecompute() || m_stopsuffixes == 0) { // bizarre way we do things
bool needrecompute = m_stpsuffstate.needrecompute();
needrecompute = needrecompute || m_oldstpsuffstate.needrecompute();
if (needrecompute || m_stopsuffixes == 0) {
// Need to initialize the suffixes // Need to initialize the suffixes
delete STOPSUFFIXES; delete STOPSUFFIXES;
if ((m_stopsuffixes = new SuffixStore) == 0) { if ((m_stopsuffixes = new SuffixStore) == 0) {
LOGERR(("RclConfig::inStopSuffixes: out of memory\n")); LOGERR(("RclConfig::inStopSuffixes: out of memory\n"));
return false; return false;
} }
list<string> stoplist; vector<string> stoplist;
stringToStrings(m_stpsuffstate.savedvalue, stoplist); stringToStrings(m_stpsuffstate.savedvalue, stoplist);
for (list<string>::const_iterator it = stoplist.begin(); vector<string> ostoplist;
stringToStrings(m_oldstpsuffstate.savedvalue, ostoplist);
stoplist.resize(stoplist.size() + ostoplist.size());
stoplist.insert(stoplist.end(), ostoplist.begin(), ostoplist.end());
for (vector<string>::const_iterator it = stoplist.begin();
it != stoplist.end(); it++) { it != stoplist.end(); it++) {
STOPSUFFIXES->insert(SfString(stringtolower(*it))); STOPSUFFIXES->insert(SfString(stringtolower(*it)));
if (m_maxsufflen < it->length()) if (m_maxsufflen < it->length())
@ -1461,7 +1471,8 @@ void RclConfig::initFrom(const RclConfig& r)
m_maxsufflen = r.m_maxsufflen; m_maxsufflen = r.m_maxsufflen;
m_defcharset = r.m_defcharset; m_defcharset = r.m_defcharset;
m_stpsuffstate.init(mimemap); m_oldstpsuffstate.init(mimemap);
m_stpsuffstate.init(m_conf);
m_skpnstate.init(m_conf); m_skpnstate.init(m_conf);
m_rmtstate.init(m_conf); m_rmtstate.init(m_conf);
m_xmtstate.init(m_conf); m_xmtstate.init(m_conf);

View File

@ -81,7 +81,8 @@ class RclConfig {
RclConfig(const string *argcnf = 0); RclConfig(const string *argcnf = 0);
RclConfig(const RclConfig &r) RclConfig(const RclConfig &r)
: m_stpsuffstate(this, "recoll_noindex"), : m_oldstpsuffstate(this, "recoll_noindex"),
m_stpsuffstate(this, "noContentSuffixes"),
m_skpnstate(this, "skippedNames"), m_skpnstate(this, "skippedNames"),
m_rmtstate(this, "indexedmimetypes"), m_rmtstate(this, "indexedmimetypes"),
m_xmtstate(this, "excludedmimetypes"), m_xmtstate(this, "excludedmimetypes"),
@ -334,6 +335,7 @@ class RclConfig {
void *m_stopsuffixes; void *m_stopsuffixes;
unsigned int m_maxsufflen; unsigned int m_maxsufflen;
ParamStale m_oldstpsuffstate; // Values from user mimemap, now obsolete
ParamStale m_stpsuffstate; ParamStale m_stpsuffstate;
ParamStale m_skpnstate; ParamStale m_skpnstate;

View File

@ -3,7 +3,7 @@
<!ENTITY RCL "<application>Recoll</application>"> <!ENTITY RCL "<application>Recoll</application>">
<!ENTITY RCLAPPS "<ulink url='http://www.recoll.org/features.html'>http://www.recoll.org/features.html</ulink>"> <!ENTITY RCLAPPS "<ulink url='http://www.recoll.org/features.html'>http://www.recoll.org/features.html</ulink>">
<!ENTITY RCLVERSION "1.20"> <!ENTITY RCLVERSION "1.21">
<!ENTITY XAP "<application>Xapian</application>"> <!ENTITY XAP "<application>Xapian</application>">
<!ENTITY WIKI "http://bitbucket.org/medoc/recoll/wiki/"> <!ENTITY WIKI "http://bitbucket.org/medoc/recoll/wiki/">
]> ]>
@ -22,7 +22,7 @@
</author> </author>
<copyright> <copyright>
<year>2005-2014</year> <year>2005-2015</year>
<holder role="mailto:jfd@recoll.org">Jean-Francois Dockes</holder> <holder role="mailto:jfd@recoll.org">Jean-Francois Dockes</holder>
</copyright> </copyright>
<abstract> <abstract>
@ -274,6 +274,14 @@
aspects of the indexing processes and configuration, with links aspects of the indexing processes and configuration, with links
to detailed sections.</para> to detailed sections.</para>
<para>Depending on your data, temporary files may be needed during
indexing, some of them possibly quite big. You can use the
<envar>RECOLL_TMPDIR</envar> or <envar>TMPDIR</envar> environment
variables to determine where they are created (the default is to
use <filename>/tmp</filename>). Using <envar>TMPDIR</envar> has
the nice property that it may also be taken into account by
auxiliary commands executed by <command>recollindex</command>.</para>
<sect2 id="RCL.INDEXING.INTRODUCTION.MODES"> <sect2 id="RCL.INDEXING.INTRODUCTION.MODES">
<title>Indexing modes</title> <title>Indexing modes</title>
@ -5122,6 +5130,73 @@ except:
</itemizedlist> </itemizedlist>
</para> </para>
<sect2 id="RCL.INSTALL.CONFIG.ENVIR">
<title>Environment variables</title>
<variablelist>
<varlistentry>
<term><varname>RECOLL_CONFDIR</varname></term>
<listitem><para>Defines the main configuration
directory.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>RECOLL_TMPDIR, TMPDIR</varname></term>
<listitem><para>Locations for temporary files, in this order
of priority. The default if none of these is set is to use
<filename>/tmp</filename>. Big temporary files may be created
during indexing, mostly for decompressing, and also for
processing, e.g. email attachments.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>RECOLL_CONFTOP, RECOLL_CONFMID</varname></term>
<listitem><para>Allow adding configuration directories with
priorities below and above the user directory (see the
Configuration overview section above for details).</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>RECOLL_EXTRA_DBS,
RECOLL_ACTIVE_EXTRA_DBS</varname></term>
<listitem><para>
Help for setting up external indexes. See <link
linkend="RCL.SEARCH.GUI.MULTIDB">this paragraph</link> for
explanations.
</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>RECOLL_DATADIR</varname></term>
<listitem><para>Defines a replacement for the default location
of Recoll data files (normally found in, e.g.,
<filename>/usr/share/recoll</filename>).</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>RECOLL_FILTERSDIR</varname></term>
<listitem><para>Defines a replacement for the default location
of Recoll filters (normally found in, e.g.,
<filename>/usr/share/recoll/filters</filename>).</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>ASPELL_PROG</varname></term>
<listitem><para><command>aspell</command> program to use for
creating the spelling dictionary. The result has to be
compatible with the <filename>libaspell</filename> which &RCL;
is using.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>VARNAME</varname></term>
<listitem><para>Blabla</para></listitem>
</varlistentry>
</variablelist>
</sect2>
<sect2 id="RCL.INSTALL.CONFIG.RECOLLCONF"> <sect2 id="RCL.INSTALL.CONFIG.RECOLLCONF">
<title>The main configuration file, recoll.conf</title> <title>The main configuration file, recoll.conf</title>
@ -5188,12 +5263,29 @@ skippedNames = #* bin CVS Cache cache* caughtspam tmp .thumbnails .svn \
<para>Not even the file names are indexed for patterns <para>Not even the file names are indexed for patterns
in this list. See the in this list. See the
<varname>recoll_noindex</varname> variable in <varname>noContentSuffixes</varname> variable for an alternative
<filename>mimemap</filename> for an alternative
approach which indexes the file names.</para> approach which indexes the file names.</para>
</listitem> </listitem>
</varlistentry> </varlistentry>
<varlistentry><term><varname>noContentSuffixes</varname></term>
<listitem><para>This is a list of file name endings (not
wildcard expressions, nor dot-delimited suffixes). Only the
names of matching files will be indexed (no attempt at MIME
type identification, no decompression, no content
indexing). This can be redefined for
subdirectories, and edited from the GUI. The default value is:
<programlisting>
noContentSuffixes = .md5 .map \
.o .lib .dll .a .sys .exe .com \
.mpp .mpt .vsd \
.img .img.gz .img.bz2 .img.xz .image .image.gz .image.bz2 .image.xz \
.dat .bak .rdf .log.gz .log .db .msf .pid \
,v ~ #
</programlisting>
</para></listitem>
</varlistentry>
<varlistentry><term><varname>skippedPaths</varname> and <varlistentry><term><varname>skippedPaths</varname> and
<varname>daemSkippedPaths</varname> </term> <varname>daemSkippedPaths</varname> </term>
<listitem> <listitem>
@ -6049,21 +6141,14 @@ x-my-tag = mailmytag
should be handled specially, which is possible because they should be handled specially, which is possible because they
are usually all located in one place.</para> are usually all located in one place.</para>
<para><filename>mimemap</filename> also has a <para>The <varname>recoll_noindex</varname>
<varname>recoll_noindex</varname> variable which is a list of <filename>mimemap</filename> variable has been moved to
suffixes. Matching files will be skipped (which avoids <filename>recoll.conf</filename> and renamed to
unnecessary decompressions or <command>file</command> <varname>noContentSuffixes</varname>, while keeping the same
executions). This is partially redundant with function, as of &RCL; version 1.21. For older &RCL; versions,
<varname>skippedNames</varname> in the main configuration see the documentation for <varname>noContentSuffixes</varname>
file, with a few differences: it will not affect directories, but use <varname>recoll_noindex</varname> in
it cannot be made dependant on the file-system location (it is <filename>mimemap</filename>.</para>
a configuration-wide parameter), and the file names will still
be indexed (not even the file names are indexed for patterns
in <varname>skippedNames</varname>.
<varname>recoll_noindex</varname> is used mostly for things
known to be unindexable by a given &RCL; version. Having it
there avoids cluttering the more user-oriented and locally
customized <varname>skippedNames</varname>.</para>
</sect2> </sect2>

View File

@ -454,6 +454,17 @@ ConfSubPanelW::ConfSubPanelW(QWidget *parent, ConfNull *config,
m_widgets.push_back(eexcm); m_widgets.push_back(eexcm);
gl1->addWidget(eexcm, gridy, 0); gl1->addWidget(eexcm, gridy, 0);
ConfParamSLW *encs = new ConfParamSLW(
m_groupbox,
ConfLink(new ConfLinkRclRep(config, "noContentSuffixes", &m_sk)),
QObject::tr("Ignored endings"),
QObject::tr("These are file name endings for files which will be "
"indexed by name only \n(no MIME type identification "
"attempt, no decompression, no content indexing)."));
encs->setFsEncoding(true);
m_widgets.push_back(encs);
gl1->addWidget(encs, gridy++, 1);
vector<string> args; vector<string> args;
args.push_back("-l"); args.push_back("-l");
ExecCmd ex; ExecCmd ex;
@ -484,7 +495,7 @@ ConfSubPanelW::ConfSubPanelW(QWidget *parent, ConfNull *config,
"and the value from the NLS environnement is used." "and the value from the NLS environnement is used."
), charsets); ), charsets);
m_widgets.push_back(e21); m_widgets.push_back(e21);
gl1->addWidget(e21, gridy++, 1); gl1->addWidget(e21, gridy++, 0);
ConfParamBoolW *e3 = new ConfParamBoolW( ConfParamBoolW *e3 = new ConfParamBoolW(
m_groupbox, m_groupbox,

View File

@ -164,17 +164,10 @@
.mht = application/x-mimehtml .mht = application/x-mimehtml
.mhtml = application/x-mimehtml .mhtml = application/x-mimehtml
# A list of suffixes (name endings) that we don't want to touch at all. # Note: recoll_noindex has been obsoleted and moved to recoll.conf as
# Having these explicitely listed speeds things up a bit by avoiding # noContentSuffixes. recoll_noindex from your personal mimemap file is
# unneeded decompression or 'file' calls. File names still get indexed if # still taken into account for now, but you should move its contents to the
# indexallfilenames is set (so this is different from skippedNames). It's a # new recoll.conf variable.
# bit unconsistent to have it listed among the suffix translations, but no
# problem in practice.
recoll_noindex = .md5 .map \
.o .lib .dll .a .sys .exe .com \
.mpp .mpt .vsd \
.dat .bak .rdf .log.gz .log .db .msf .pid \
,v ~ #
# Special handling of .txt files inside ~/.gaim and ~/.purple directories # Special handling of .txt files inside ~/.gaim and ~/.purple directories
[~/.gaim] [~/.gaim]

View File

@ -31,6 +31,20 @@ skippedNames = #* bin CVS Cache cache* .cache caughtspam tmp \
# Explicitly adding /media/xxx to the topdirs will override this. # Explicitly adding /media/xxx to the topdirs will override this.
skippedPaths = /media skippedPaths = /media
# List of suffixes for which we don't try mime type identification (and
# don't uncompress or index content obviously). This complements the now
# obsoleted mimemap recoll_noindex list, which will go away in a future
# release (the move from mimemap to recoll.conf allows editing the list
# through the GUI). This is different from skippedNames because these are
# name ending matches only (not wildcard patterns), and the file name
# itself gets indexed normally.
noContentSuffixes = .md5 .map \
.o .lib .dll .a .sys .exe .com \
.mpp .mpt .vsd \
.img .img.gz .img.bz2 .img.xz .image .image.gz .image.bz2 .image.xz \
.dat .bak .rdf .log.gz .log .db .msf .pid \
,v ~ #
# Same for real time indexing. The idea here is that there is stuff that # Same for real time indexing. The idea here is that there is stuff that
# you might want to initially index but not monitor. If daemSkippedPaths is # you might want to initially index but not monitor. If daemSkippedPaths is
# not set, the daemon uses skippedPaths. # not set, the daemon uses skippedPaths.