changed stopsuffixes processing

This commit is contained in:
dockes 2006-12-20 09:54:18 +00:00
parent 2bba8a159d
commit 1c80f0d67c
4 changed files with 133 additions and 51 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.38 2006-12-19 08:40:50 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.39 2006-12-20 09:54:17 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -24,6 +24,8 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.38 2006-12-19 08:40:50 dockes E
#include <errno.h> #include <errno.h>
#include <langinfo.h> #include <langinfo.h>
#include <set>
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#ifdef __FreeBSD__ #ifdef __FreeBSD__
@ -42,6 +44,13 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.38 2006-12-19 08:40:50 dockes E
using namespace std; using namespace std;
#endif /* NO_NAMESPACES */ #endif /* NO_NAMESPACES */
#ifndef MIN
#define MIN(A,B) (((A)<(B)) ? (A) : (B))
#endif
#ifndef MAX
#define MAX(A,B) (((A)>(B)) ? (A) : (B))
#endif
RclConfig::RclConfig(const string *argcnf) RclConfig::RclConfig(const string *argcnf)
{ {
zeroMe(); zeroMe();
@ -211,16 +220,81 @@ std::list<string> RclConfig::getAllMimeTypes()
return lst; return lst;
} }
const list<string>* RclConfig::getStopSuffixes() // Things for suffix comparison. We define a string class and string
// comparison with suffix-only sensitivity
/**
 * String wrapper whose equality test is only sensitive to the tail of
 * the strings.
 *
 * Two SfString objects compare equal when, walking both strings backwards
 * from their last character, no differing character is met before one of
 * them is exhausted, i.e. when one string is a suffix of the other.
 * As a consequence, an empty string compares equal to any other string.
 */
class SfString {
public:
    // Not explicit on purpose: a plain string can then be used wherever an
    // SfString is expected (e.g. as a lookup key).
    SfString(const string& s) : m_str(s) {}

    /**
     * Suffix-only equality.
     * @param s2 the other string.
     * @return true if the shorter string matches the end of the longer one.
     */
    bool operator==(const SfString& s2) const {
        string::const_reverse_iterator r1 = m_str.rbegin(), re1 = m_str.rend(),
            r2 = s2.m_str.rbegin(), re2 = s2.m_str.rend();
        // Compare from the end, stop as soon as either string runs out
        while (r1 != re1 && r2 != re2) {
            if (*r1 != *r2) {
                return false;
            }
            ++r1; ++r2;
        }
        return true;
    }

    string m_str;
};
/**
 * Ordering predicate for SfString objects, with suffix-only sensitivity.
 *
 * The strings are compared character by character starting from their
 * ends. The first differing character decides the order. If one string
 * is exhausted before a difference is found (one is a suffix of the
 * other, or they are identical), neither is considered smaller than the
 * other, so they are equivalent for the purpose of ordered lookups.
 */
class SuffCmp {
public:
    /**
     * @return true if s1 sorts before s2, false otherwise (including when
     *   one string is a suffix of the other).
     *
     * The call operator is const: ordered containers invoke the comparison
     * object through a const reference.
     */
    bool operator()(const SfString& s1, const SfString& s2) const {
        //cout << "Comparing " << s1.m_str << " and " << s2.m_str << endl;
        string::const_reverse_iterator
            r1 = s1.m_str.rbegin(), re1 = s1.m_str.rend(),
            r2 = s2.m_str.rbegin(), re2 = s2.m_str.rend();
        while (r1 != re1 && r2 != re2) {
            if (*r1 != *r2) {
                return *r1 < *r2;
            }
            ++r1; ++r2;
        }
        // Ran out of characters on one side: equivalent
        return false;
    }
};
typedef multiset<SfString, SuffCmp> SuffixStore;
#define STOPSUFFIXES ((SuffixStore *)m_stopsuffixes)
/**
 * Check if a file name ends with one of the "stop" suffixes.
 *
 * On first call, the suffix store is built from the "recoll_noindex"
 * value read from mimemap: each suffix is lowercased and inserted, and
 * the length of the longest one is recorded in m_maxsufflen. The store is
 * then reused as is by later calls.
 *
 * @param fni file name to test. The comparison is done on a lowercased
 *   copy of its tail.
 * @return true if the end of fni matches a stored suffix, false otherwise
 *   (also false if the store could not be allocated).
 */
bool RclConfig::inStopSuffixes(const string& fni)
{
    if (m_stopsuffixes == 0) {
        // Need to initialize the suffixes
        if ((m_stopsuffixes = new SuffixStore) == 0) {
            LOGERR(("RclConfig::inStopSuffixes: out of memory\n"));
            return false;
        }
        string stp;
        list<string> stoplist;
        if (mimemap && mimemap->get("recoll_noindex", stp, m_keydir)) {
            stringToStrings(stp, stoplist);
        }
        for (list<string>::const_iterator it = stoplist.begin();
             it != stoplist.end(); ++it) {
            string lower(*it);
            stringtolower(lower);
            STOPSUFFIXES->insert(SfString(lower));
            if (m_maxsufflen < lower.length())
                m_maxsufflen = static_cast<unsigned int>(lower.length());
        }
    }

    // Only the last m_maxsufflen characters of the name can take part in
    // a match. The start offset is computed without subtracting unsigned
    // values blindly: for a name shorter than the longest suffix the
    // difference would wrap around to a huge offset and the substring
    // constructor would throw std::out_of_range.
    const string::size_type start =
        fni.length() > m_maxsufflen ? fni.length() - m_maxsufflen : 0;
    string fn(fni, start);
    stringtolower(fn);

    SuffixStore::const_iterator it = STOPSUFFIXES->find(fn);
    if (it != STOPSUFFIXES->end()) {
        LOGDEB2(("RclConfig::inStopSuffixes: Found (%s) [%s]\n",
                 fni.c_str(), (*it).m_str.c_str()));
        return true;
    } else {
        LOGDEB2(("RclConfig::inStopSuffixes: not found [%s]\n", fni.c_str()));
        return false;
    }
}
string RclConfig::getMimeTypeFromSuffix(const string &suff) string RclConfig::getMimeTypeFromSuffix(const string &suff)
@ -244,6 +318,17 @@ string RclConfig::getSuffixFromMimeType(const string &mt)
return ""; return "";
} }
void RclConfig::freeAll()
{
delete m_conf;
delete mimemap;
delete mimeconf;
delete mimeview;
delete STOPSUFFIXES;
// just in case
zeroMe();
}
string RclConfig::getMimeHandlerDef(const std::string &mtype) string RclConfig::getMimeHandlerDef(const std::string &mtype)
{ {
string hs; string hs;
@ -495,8 +580,9 @@ void RclConfig::initFrom(const RclConfig& r)
mimeconf = new ConfStack<ConfTree>(*(r.mimeconf)); mimeconf = new ConfStack<ConfTree>(*(r.mimeconf));
if (r.mimeview) if (r.mimeview)
mimeview = new ConfStack<ConfTree>(*(r.mimeview)); mimeview = new ConfStack<ConfTree>(*(r.mimeview));
if (r.stopsuffixes) if (r.m_stopsuffixes)
stopsuffixes = new std::list<std::string>(*(r.stopsuffixes)); m_stopsuffixes = new SuffixStore(*((SuffixStore*)r.m_stopsuffixes));
m_maxsufflen = r.m_maxsufflen;
defcharset = r.defcharset; defcharset = r.defcharset;
guesscharset = r.guesscharset; guesscharset = r.guesscharset;
} }

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _RCLCONFIG_H_INCLUDED_ #ifndef _RCLCONFIG_H_INCLUDED_
#define _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_
/* @(#$Id: rclconfig.h,v 1.28 2006-12-16 15:30:02 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: rclconfig.h,v 1.29 2006-12-20 09:54:17 dockes Exp $ (C) 2004 J.F.Dockes */
#include <list> #include <list>
#include <string> #include <string>
@ -35,7 +35,7 @@ using std::pair;
class RclConfig { class RclConfig {
public: public:
RclConfig(const string *argcnf=0); RclConfig(const string *argcnf = 0);
bool ok() {return m_ok;} bool ok() {return m_ok;}
const string &getReason() {return m_reason;} const string &getReason() {return m_reason;}
/** Return the directory where this config is stored */ /** Return the directory where this config is stored */
@ -83,12 +83,12 @@ class RclConfig {
list<string> getSkippedNames(); list<string> getSkippedNames();
/** /**
* Get list of ignored suffixes from mimemap * Check if file name should be ignored because of suffix
* *
* The list is initialized on first call, and not changed for subsequent * The list of ignored suffixes is initialized on first call, and
* setKeydirs. * not changed for subsequent setKeydirs.
*/ */
const list<string>* getStopSuffixes(); bool inStopSuffixes(const string& fn);
/** /**
* Check in mimeconf if input mime type is a compressed one, and * Check in mimeconf if input mime type is a compressed one, and
@ -153,7 +153,8 @@ class RclConfig {
ConfStack<ConfTree> *mimeconf; // but their content may depend on it. ConfStack<ConfTree> *mimeconf; // but their content may depend on it.
ConfStack<ConfTree> *mimeview; // ConfStack<ConfTree> *mimeview; //
list<string> *stopsuffixes; void *m_stopsuffixes;
unsigned int m_maxsufflen;
// Parameters auto-fetched on setkeydir // Parameters auto-fetched on setkeydir
string defcharset; // These are stored locally to avoid string defcharset; // These are stored locally to avoid
@ -171,18 +172,11 @@ class RclConfig {
mimemap = 0; mimemap = 0;
mimeconf = 0; mimeconf = 0;
mimeview = 0; mimeview = 0;
stopsuffixes = 0; m_stopsuffixes = 0;
m_maxsufflen = 0;
} }
/** Free data then zero pointers */ /** Free data then zero pointers */
void freeAll() { void freeAll();
delete m_conf;
delete mimemap;
delete mimeconf;
delete mimeview;
delete stopsuffixes;
// just in case
zeroMe();
}
}; };

View File

@ -24,7 +24,7 @@
Dockes</holder> Dockes</holder>
</copyright> </copyright>
<releaseinfo>$Id: usermanual.sgml,v 1.30 2006-12-18 12:06:11 dockes Exp $</releaseinfo> <releaseinfo>$Id: usermanual.sgml,v 1.31 2006-12-20 09:54:17 dockes Exp $</releaseinfo>
<abstract> <abstract>
<para>This document introduces full text search notions <para>This document introduces full text search notions
@ -1528,7 +1528,7 @@ RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh stop
agents like <application>thunderbird</application> agents like <application>thunderbird</application>
usually store messages in hidden directories, and you usually store messages in hidden directories, and you
probably want this indexed. One possible solution is to probably want this indexed. One possible solution is to
have <userinput>.*</userinput> in have <filename>.*</filename> in
<literal>skippedNames</literal>, and add things like <literal>skippedNames</literal>, and add things like
<filename>~/.thunderbird</filename> or <filename>~/.thunderbird</filename> or
<filename>~/.evolution</filename> in <filename>~/.evolution</filename> in
@ -1656,12 +1656,19 @@ RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh stop
<para><filename>mimemap</filename> also has a <para><filename>mimemap</filename> also has a
<literal>recoll_noindex</literal> variable which is a list of <literal>recoll_noindex</literal> variable which is a list of
suffixes. Matching files will be skipped (avoids unnecessary suffixes. Matching files will be skipped (which avoids
decompressions or <command>file</command> executions). This is unnecessary decompressions or <command>file</command>
partially redundant with <literal>skippedNames</literal> in executions). This is partially redundant with
the main configuration file, with two differences: it will not <literal>skippedNames</literal> in the main configuration
affect directories, and it can be changed for any file, with two differences: it will not affect directories,
sub-directory.</para> and it cannot be made dependent on the file-system location
(it is a configuration-wide parameter). You could accomplish
with <literal>skippedNames</literal> anything that
<literal>recoll_noindex</literal> does. The latter is used
mostly for things known to be unindexable by a given &RCL;
version. Having it there avoids cluttering the more
user-oriented and locally customized
<literal>skippedNames</literal>.</para>
</sect2> </sect2>
@ -1672,13 +1679,14 @@ RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh stop
different mime types are handled for indexing, and which icons different mime types are handled for indexing, and which icons
are displayed in the <command>recoll</command> result lists.</para> are displayed in the <command>recoll</command> result lists.</para>
<para>Changing the indexing parameters is probably not a <para>Changing the parameters in the [index] section is
good idea except if you are a &RCL; developers.</para> probably not a good idea except if you are a &RCL;
developer.</para>
<para>You can change the icons which are displayed by <para>The [icons] section allows you to change the icons which
<command>recoll</command> in the result lists (the values are are displayed by <command>recoll</command> in the result
the basenames of the png images inside the lists (the values are the basenames of the png images inside
<filename>iconsdir</filename> directory (specified in the <filename>iconsdir</filename> directory (specified in
<filename>recoll.conf</filename>).</para> <filename>recoll.conf</filename>).</para>
</sect2> </sect2>

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.19 2006-12-19 08:40:50 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.20 2006-12-20 09:54:18 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -117,19 +117,12 @@ string mimetype(const string &fn, const struct stat *stp,
if (cfg == 0) if (cfg == 0)
return ""; return "";
const list<string>* stoplist = cfg->getStopSuffixes(); if (cfg->inStopSuffixes(fn)) {
if (stoplist && !stoplist->empty()) { LOGDEB(("mimetype: fn [%s] in stopsuffixes\n", fn.c_str()));
for (list<string>::const_iterator it = stoplist->begin(); return "";
it != stoplist->end(); it++) {
if (!stringisuffcmp(fn, *it)) {
LOGDEB(("mimetype: fn %s in stoplist (%s)\n", fn.c_str(),
it->c_str()));
return "";
}
}
} }
// Look for suffix in mimetype map // First look for suffix in mimetype map
string::size_type dot = fn.find_last_of("."); string::size_type dot = fn.find_last_of(".");
string suff; string suff;
if (dot != string::npos) { if (dot != string::npos) {
@ -142,6 +135,7 @@ string mimetype(const string &fn, const struct stat *stp,
return mtype; return mtype;
} }
// Then examine data
return mimetypefromdata(fn, usfc); return mimetypefromdata(fn, usfc);
} }