From 1c80f0d67c458295412e95a7ab6eb601793234fe Mon Sep 17 00:00:00 2001 From: dockes Date: Wed, 20 Dec 2006 09:54:18 +0000 Subject: [PATCH] changed stopsuffixes processing --- src/common/rclconfig.cpp | 100 ++++++++++++++++++++++++++++++++--- src/common/rclconfig.h | 28 ++++------ src/doc/user/usermanual.sgml | 38 +++++++------ src/index/mimetype.cpp | 18 +++---- 4 files changed, 133 insertions(+), 51 deletions(-) diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index f33abbbf..d0f24cea 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.38 2006-12-19 08:40:50 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.39 2006-12-20 09:54:17 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -24,6 +24,8 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.38 2006-12-19 08:40:50 dockes E #include #include +#include + #include #include #ifdef __FreeBSD__ @@ -42,6 +44,13 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.38 2006-12-19 08:40:50 dockes E using namespace std; #endif /* NO_NAMESPACES */ +#ifndef MIN +#define MIN(A,B) (((A)<(B)) ? (A) : (B)) +#endif +#ifndef MAX +#define MAX(A,B) (((A)>(B)) ? (A) : (B)) +#endif + RclConfig::RclConfig(const string *argcnf) { zeroMe(); @@ -211,16 +220,81 @@ std::list RclConfig::getAllMimeTypes() return lst; } -const list* RclConfig::getStopSuffixes() +// Things for suffix comparison. 
We define a string class and string +// comparison with suffix-only sensitivity +class SfString { +public: + SfString(const string& s) : m_str(s) {} + bool operator==(const SfString& s2) { + string::const_reverse_iterator r1 = m_str.rbegin(), re1 = m_str.rend(), + r2 = s2.m_str.rbegin(), re2 = s2.m_str.rend(); + while (r1 != re1 && r2 != re2) { + if (*r1 != *r2) { + return 0; + } + ++r1; ++r2; + } + return 1; + } + string m_str; +}; + +class SuffCmp { +public: + int operator()(const SfString& s1, const SfString& s2) { + //cout << "Comparing " << s1.m_str << " and " << s2.m_str << endl; + string::const_reverse_iterator + r1 = s1.m_str.rbegin(), re1 = s1.m_str.rend(), + r2 = s2.m_str.rbegin(), re2 = s2.m_str.rend(); + while (r1 != re1 && r2 != re2) { + if (*r1 != *r2) { + return *r1 < *r2 ? 1 : 0; + } + ++r1; ++r2; + } + return 0; + } +}; +typedef multiset SuffixStore; + +#define STOPSUFFIXES ((SuffixStore *)m_stopsuffixes) + +bool RclConfig::inStopSuffixes(const string& fni) { - if (stopsuffixes == 0 && (stopsuffixes = new list) != 0) { + if (m_stopsuffixes == 0) { + // Need to initialize the suffixes + if ((m_stopsuffixes = new SuffixStore) == 0) { + LOGERR(("RclConfig::inStopSuffixes: out of memory\n")); + return false; + } string stp; + list stoplist; if (mimemap && mimemap->get("recoll_noindex", stp, m_keydir)) { - stringToStrings(stp, *stopsuffixes); + stringToStrings(stp, stoplist); + } + for (list::const_iterator it = stoplist.begin(); + it != stoplist.end(); it++) { + string lower(*it); + stringtolower(lower); + STOPSUFFIXES->insert(SfString(lower)); + if (m_maxsufflen < lower.length()) + m_maxsufflen = lower.length(); } } - return stopsuffixes; + string fn(fni, // keep only the last m_maxsufflen chars; lengths are unsigned, so compare instead of subtracting blindly (a wrapped offset makes the string constructor throw) + (fni.length() > m_maxsufflen) ? fni.length() - m_maxsufflen : 0, + MIN(fni.length(), m_maxsufflen)); + stringtolower(fn); + SuffixStore::const_iterator it = STOPSUFFIXES->find(fn); + if (it != STOPSUFFIXES->end()) { + LOGDEB2(("RclConfig::inStopSuffixes: Found (%s) [%s]\n", + fni.c_str(), (*it).m_str.c_str())); + return true; 
+ } else { + LOGDEB2(("RclConfig::inStopSuffixes: not found [%s]\n", fni.c_str())); + return false; + } } string RclConfig::getMimeTypeFromSuffix(const string &suff) @@ -244,6 +318,17 @@ string RclConfig::getSuffixFromMimeType(const string &mt) return ""; } +void RclConfig::freeAll() +{ + delete m_conf; + delete mimemap; + delete mimeconf; + delete mimeview; + delete STOPSUFFIXES; + // just in case + zeroMe(); +} + string RclConfig::getMimeHandlerDef(const std::string &mtype) { string hs; @@ -495,8 +580,9 @@ void RclConfig::initFrom(const RclConfig& r) mimeconf = new ConfStack(*(r.mimeconf)); if (r.mimeview) mimeview = new ConfStack(*(r.mimeview)); - if (r.stopsuffixes) - stopsuffixes = new std::list(*(r.stopsuffixes)); + if (r.m_stopsuffixes) + m_stopsuffixes = new SuffixStore(*((SuffixStore*)r.m_stopsuffixes)); + m_maxsufflen = r.m_maxsufflen; defcharset = r.defcharset; guesscharset = r.guesscharset; } diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index 85acad73..8712ea93 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -16,7 +16,7 @@ */ #ifndef _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_ -/* @(#$Id: rclconfig.h,v 1.28 2006-12-16 15:30:02 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rclconfig.h,v 1.29 2006-12-20 09:54:17 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -35,7 +35,7 @@ using std::pair; class RclConfig { public: - RclConfig(const string *argcnf=0); + RclConfig(const string *argcnf = 0); bool ok() {return m_ok;} const string &getReason() {return m_reason;} /** Return the directory where this config is stored */ @@ -83,12 +83,12 @@ class RclConfig { list getSkippedNames(); /** - * Get list of ignored suffixes from mimemap + * Check if file name should be ignored because of suffix * - * The list is initialized on first call, and not changed for subsequent - * setKeydirs. + * The list of ignored suffixes is initialized on first call, and + * not changed for subsequent setKeydirs. 
*/ - const list* getStopSuffixes(); + bool inStopSuffixes(const string& fn); /** * Check in mimeconf if input mime type is a compressed one, and @@ -153,7 +153,8 @@ class RclConfig { ConfStack *mimeconf; // but their content may depend on it. ConfStack *mimeview; // - list *stopsuffixes; + void *m_stopsuffixes; + unsigned int m_maxsufflen; // Parameters auto-fetched on setkeydir string defcharset; // These are stored locally to avoid @@ -171,18 +172,11 @@ class RclConfig { mimemap = 0; mimeconf = 0; mimeview = 0; - stopsuffixes = 0; + m_stopsuffixes = 0; + m_maxsufflen = 0; } /** Free data then zero pointers */ - void freeAll() { - delete m_conf; - delete mimemap; - delete mimeconf; - delete mimeview; - delete stopsuffixes; - // just in case - zeroMe(); - } + void freeAll(); }; diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml index 83a70a86..a3e1bfdf 100644 --- a/src/doc/user/usermanual.sgml +++ b/src/doc/user/usermanual.sgml @@ -24,7 +24,7 @@ Dockes - $Id: usermanual.sgml,v 1.30 2006-12-18 12:06:11 dockes Exp $ + $Id: usermanual.sgml,v 1.31 2006-12-20 09:54:17 dockes Exp $ This document introduces full text search notions @@ -1528,7 +1528,7 @@ RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh stop agents like thunderbird usually store messages in hidden directories, and you probably want this indexed. One possible solution is to - have .* in + have .* in skippedNames, and add things like ~/.thunderbird or ~/.evolution in @@ -1656,12 +1656,19 @@ RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh stop mimemap also has a recoll_noindex variable which is a list of - suffixes. Matching files will be skipped (avoids unnecessary - decompressions or file executions). This is - partially redundant with skippedNames in - the main configuration file, with two differences: it will not - affect directories, and it can be changed for any - sub-directory. + suffixes. 
Matching files will be skipped (which avoids + unnecessary decompressions or file + executions). This is partially redundant with + skippedNames in the main configuration + file, with two differences: it will not affect directories, + and it cannot be made dependent on the file-system location + (it is a configuration-wide parameter). You could accomplish + with skippedNames anything that + recoll_noindex does. The latter is used + mostly for things known to be unindexable by a given &RCL; + version. Having it there avoids cluttering the more + user-oriented and locally customized + skippedNames. @@ -1672,14 +1679,15 @@ RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh stop different mime types are handled for indexing, and which icons are displayed in the recoll result lists. - Changing the indexing parameters is probably not a - good idea except if you are a &RCL; developers. + Changing the parameters in the [index] section is + probably not a good idea except if you are a &RCL; + developer. - You can change the icons which are displayed by - recoll in the result lists (the values are - the basenames of the png images inside the - iconsdir directory (specified in - recoll.conf). + The [icons] section allows you to change the icons which + are displayed by recoll in the result + lists (the values are the basenames of the png images inside + the iconsdir directory (specified in + recoll.conf)). 
diff --git a/src/index/mimetype.cpp b/src/index/mimetype.cpp index d2dc4ed6..2be54785 100644 --- a/src/index/mimetype.cpp +++ b/src/index/mimetype.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.19 2006-12-19 08:40:50 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.20 2006-12-20 09:54:18 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -117,19 +117,12 @@ string mimetype(const string &fn, const struct stat *stp, if (cfg == 0) return ""; - const list* stoplist = cfg->getStopSuffixes(); - if (stoplist && !stoplist->empty()) { - for (list::const_iterator it = stoplist->begin(); - it != stoplist->end(); it++) { - if (!stringisuffcmp(fn, *it)) { - LOGDEB(("mimetype: fn %s in stoplist (%s)\n", fn.c_str(), - it->c_str())); - return ""; - } - } + if (cfg->inStopSuffixes(fn)) { + LOGDEB(("mimetype: fn [%s] in stopsuffixes\n", fn.c_str())); + return ""; } - // Look for suffix in mimetype map + // First look for suffix in mimetype map string::size_type dot = fn.find_last_of("."); string suff; if (dot != string::npos) { @@ -142,6 +135,7 @@ string mimetype(const string &fn, const struct stat *stp, return mtype; } + // Then examine data return mimetypefromdata(fn, usfc); }