changed stopsuffixes processing

This commit is contained in:
dockes 2006-12-20 09:54:18 +00:00
parent 2bba8a159d
commit 1c80f0d67c
4 changed files with 133 additions and 51 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.38 2006-12-19 08:40:50 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.39 2006-12-20 09:54:17 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -24,6 +24,8 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.38 2006-12-19 08:40:50 dockes E
#include <errno.h>
#include <langinfo.h>
#include <set>
#include <sys/types.h>
#include <sys/stat.h>
#ifdef __FreeBSD__
@ -42,6 +44,13 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.38 2006-12-19 08:40:50 dockes E
using namespace std;
#endif /* NO_NAMESPACES */
// Two-argument minimum / maximum helpers, defined only when the
// including environment has not already provided them.
// Each argument is expanded twice, so arguments must be free of side
// effects. Note that with unsigned operands a difference such as
// (a - b) wraps around before the comparison takes place, so
// MAX(0, a - b) does not clamp the result at zero.
#ifndef MIN
#define MIN(A,B) (((A)<(B)) ? (A) : (B))
#endif
#ifndef MAX
#define MAX(A,B) (((A)>(B)) ? (A) : (B))
#endif
RclConfig::RclConfig(const string *argcnf)
{
zeroMe();
@ -211,16 +220,81 @@ std::list<string> RclConfig::getAllMimeTypes()
return lst;
}
const list<string>* RclConfig::getStopSuffixes()
// Things for suffix comparison. We define a string class and string
// comparison with suffix-only sensitivity
// String wrapper with suffix-only equality: two values compare equal
// when, reading both strings from their last character backwards, no
// differing character is met before one of them is exhausted. In other
// words the shorter string is a suffix of the longer one (an empty
// string therefore compares equal to anything).
class SfString {
public:
    // Intentionally not explicit: a plain string can be passed wherever
    // an SfString is expected.
    SfString(const std::string& s) : m_str(s) {}

    // Suffix-only equality. Declared const so that it can be used on
    // const objects, e.g. elements read back from a container.
    bool operator==(const SfString& s2) const {
        std::string::const_reverse_iterator r1 = m_str.rbegin(),
            re1 = m_str.rend();
        std::string::const_reverse_iterator r2 = s2.m_str.rbegin(),
            re2 = s2.m_str.rend();
        for (; r1 != re1 && r2 != re2; ++r1, ++r2) {
            if (*r1 != *r2) {
                return false;
            }
        }
        // One of the strings ran out without any mismatch.
        return true;
    }

    std::string m_str;
};
class SuffCmp {
public:
int operator()(const SfString& s1, const SfString& s2) {
//cout << "Comparing " << s1.m_str << " and " << s2.m_str << endl;
string::const_reverse_iterator
r1 = s1.m_str.rbegin(), re1 = s1.m_str.rend(),
r2 = s2.m_str.rbegin(), re2 = s2.m_str.rend();
while (r1 != re1 && r2 != re2) {
if (*r1 != *r2) {
return *r1 < *r2 ? 1 : 0;
}
++r1; ++r2;
}
return 0;
}
};
// Container type for the stop suffixes: SfString values kept in the
// order defined by SuffCmp.
typedef multiset<SfString, SuffCmp> SuffixStore;
// Typed access to the m_stopsuffixes member: the member is cast to a
// SuffixStore pointer each time the macro is expanded.
#define STOPSUFFIXES ((SuffixStore *)m_stopsuffixes)
// Check whether a file name ends with one of the stop suffixes.
//
// On the first call the suffix store is created and filled from the
// "recoll_noindex" value read from mimemap (looked up with m_keydir);
// every suffix is lowercased before insertion and the length of the
// longest one is remembered in m_maxsufflen. Later calls reuse the
// store as is.
//
// @param fni file name to test; it is lowercased before comparison.
// @return true if the tail of fni matches a stored suffix, false
//         otherwise (also false if the store could not be allocated).
bool RclConfig::inStopSuffixes(const string& fni)
{
    if (m_stopsuffixes == 0) {
        // Need to initialize the suffixes
        if ((m_stopsuffixes = new SuffixStore) == 0) {
            LOGERR(("RclConfig::inStopSuffixes: out of memory\n"));
            return false;
        }
        string stp;
        list<string> stoplist;
        if (mimemap && mimemap->get("recoll_noindex", stp, m_keydir)) {
            stringToStrings(stp, stoplist);
        }
        for (list<string>::const_iterator it = stoplist.begin();
             it != stoplist.end(); ++it) {
            string lower(*it);
            stringtolower(lower);
            STOPSUFFIXES->insert(SfString(lower));
            if (m_maxsufflen < lower.length())
                m_maxsufflen = lower.length();
        }
    }

    // Only the last m_maxsufflen characters can take part in a match.
    // The start offset is computed with an explicit comparison: the
    // lengths are unsigned, so subtracting first would wrap around for
    // names shorter than the longest suffix and the substring
    // construction would then throw std::out_of_range.
    const string::size_type fnlen = fni.length();
    const string::size_type start =
        fnlen > m_maxsufflen ? fnlen - m_maxsufflen : 0;
    string fn(fni, start);
    stringtolower(fn);

    SuffixStore::const_iterator it = STOPSUFFIXES->find(fn);
    if (it != STOPSUFFIXES->end()) {
        LOGDEB2(("RclConfig::inStopSuffixes: Found (%s) [%s]\n",
                 fni.c_str(), (*it).m_str.c_str()));
        return true;
    } else {
        LOGDEB2(("RclConfig::inStopSuffixes: not found [%s]\n", fni.c_str()));
        return false;
    }
}
string RclConfig::getMimeTypeFromSuffix(const string &suff)
@ -244,6 +318,17 @@ string RclConfig::getSuffixFromMimeType(const string &mt)
return "";
}
void RclConfig::freeAll()
{
delete m_conf;
delete mimemap;
delete mimeconf;
delete mimeview;
delete STOPSUFFIXES;
// just in case
zeroMe();
}
string RclConfig::getMimeHandlerDef(const std::string &mtype)
{
string hs;
@ -495,8 +580,9 @@ void RclConfig::initFrom(const RclConfig& r)
mimeconf = new ConfStack<ConfTree>(*(r.mimeconf));
if (r.mimeview)
mimeview = new ConfStack<ConfTree>(*(r.mimeview));
if (r.stopsuffixes)
stopsuffixes = new std::list<std::string>(*(r.stopsuffixes));
if (r.m_stopsuffixes)
m_stopsuffixes = new SuffixStore(*((SuffixStore*)r.m_stopsuffixes));
m_maxsufflen = r.m_maxsufflen;
defcharset = r.defcharset;
guesscharset = r.guesscharset;
}

View File

@ -16,7 +16,7 @@
*/
#ifndef _RCLCONFIG_H_INCLUDED_
#define _RCLCONFIG_H_INCLUDED_
/* @(#$Id: rclconfig.h,v 1.28 2006-12-16 15:30:02 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rclconfig.h,v 1.29 2006-12-20 09:54:17 dockes Exp $ (C) 2004 J.F.Dockes */
#include <list>
#include <string>
@ -35,7 +35,7 @@ using std::pair;
class RclConfig {
public:
RclConfig(const string *argcnf=0);
RclConfig(const string *argcnf = 0);
bool ok() {return m_ok;}
const string &getReason() {return m_reason;}
/** Return the directory where this config is stored */
@ -83,12 +83,12 @@ class RclConfig {
list<string> getSkippedNames();
/**
* Get list of ignored suffixes from mimemap
* Check if file name should be ignored because of suffix
*
* The list is initialized on first call, and not changed for subsequent
* setKeydirs.
* The list of ignored suffixes is initialized on first call, and
* not changed for subsequent setKeydirs.
*/
const list<string>* getStopSuffixes();
bool inStopSuffixes(const string& fn);
/**
* Check in mimeconf if input mime type is a compressed one, and
@ -153,7 +153,8 @@ class RclConfig {
ConfStack<ConfTree> *mimeconf; // but their content may depend on it.
ConfStack<ConfTree> *mimeview; //
list<string> *stopsuffixes;
void *m_stopsuffixes;
unsigned int m_maxsufflen;
// Parameters auto-fetched on setkeydir
string defcharset; // These are stored locally to avoid
@ -171,18 +172,11 @@ class RclConfig {
mimemap = 0;
mimeconf = 0;
mimeview = 0;
stopsuffixes = 0;
m_stopsuffixes = 0;
m_maxsufflen = 0;
}
/** Free data then zero pointers */
void freeAll() {
delete m_conf;
delete mimemap;
delete mimeconf;
delete mimeview;
delete stopsuffixes;
// just in case
zeroMe();
}
void freeAll();
};

View File

@ -24,7 +24,7 @@
Dockes</holder>
</copyright>
<releaseinfo>$Id: usermanual.sgml,v 1.30 2006-12-18 12:06:11 dockes Exp $</releaseinfo>
<releaseinfo>$Id: usermanual.sgml,v 1.31 2006-12-20 09:54:17 dockes Exp $</releaseinfo>
<abstract>
<para>This document introduces full text search notions
@ -1528,7 +1528,7 @@ RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh stop
agents like <application>thunderbird</application>
usually store messages in hidden directories, and you
probably want this indexed. One possible solution is to
have <userinput>.*</userinput> in
have <filename>.*</filename> in
<literal>skippedNames</literal>, and add things like
<filename>~/.thunderbird</filename> or
<filename>~/.evolution</filename> in
@ -1656,12 +1656,19 @@ RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh stop
<para><filename>mimemap</filename> also has a
<literal>recoll_noindex</literal> variable which is a list of
suffixes. Matching files will be skipped (avoids unnecessary
decompressions or <command>file</command> executions). This is
partially redundant with <literal>skippedNames</literal> in
the main configuration file, with two differences: it will not
affect directories, and it can be changed for any
sub-directory.</para>
suffixes. Matching files will be skipped (which avoids
unnecessary decompressions or <command>file</command>
executions). This is partially redundant with
<literal>skippedNames</literal> in the main configuration
file, with two differences: it will not affect directories,
and it cannot be made dependent on the file-system location
(it is a configuration-wide parameter). You could accomplish
with <literal>skippedNames</literal> anything that
<literal>recoll_noindex</literal> does. The latter is used
mostly for things known to be unindexable by a given &RCL;
version. Having it there avoids cluttering the more
user-oriented and locally customized
<literal>skippedNames</literal>.</para>
</sect2>
@ -1672,14 +1679,15 @@ RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh stop
different mime types are handled for indexing, and which icons
are displayed in the <command>recoll</command> result lists.</para>
<para>Changing the indexing parameters is probably not a
good idea except if you are a &RCL; developers.</para>
<para>Changing the parameters in the [index] section is
probably not a good idea except if you are a &RCL;
developer.</para>
<para>You can change the icons which are displayed by
<command>recoll</command> in the result lists (the values are
the basenames of the png images inside the
<filename>iconsdir</filename> directory (specified in
<filename>recoll.conf</filename>).</para>
<para>The [icons] section allows you to change the icons which
are displayed by <command>recoll</command> in the result
lists (the values are the basenames of the png images inside
the <filename>iconsdir</filename> directory, specified in
<filename>recoll.conf</filename>).</para>
</sect2>
<sect2 id="rclinstall.config.mimeview">

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.19 2006-12-19 08:40:50 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.20 2006-12-20 09:54:18 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -117,19 +117,12 @@ string mimetype(const string &fn, const struct stat *stp,
if (cfg == 0)
return "";
const list<string>* stoplist = cfg->getStopSuffixes();
if (stoplist && !stoplist->empty()) {
for (list<string>::const_iterator it = stoplist->begin();
it != stoplist->end(); it++) {
if (!stringisuffcmp(fn, *it)) {
LOGDEB(("mimetype: fn %s in stoplist (%s)\n", fn.c_str(),
it->c_str()));
return "";
}
}
if (cfg->inStopSuffixes(fn)) {
LOGDEB(("mimetype: fn [%s] in stopsuffixes\n", fn.c_str()));
return "";
}
// Look for suffix in mimetype map
// First look for suffix in mimetype map
string::size_type dot = fn.find_last_of(".");
string suff;
if (dot != string::npos) {
@ -142,6 +135,7 @@ string mimetype(const string &fn, const struct stat *stp,
return mtype;
}
// Then examine data
return mimetypefromdata(fn, usfc);
}