From 548a4c1a27a77d2fb3aadfc1b459a3c9103fce6e Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 10 Nov 2005 08:47:49 +0000 Subject: [PATCH] add config parameter to decide if we use the file command as a final step of mimetype identification --- src/index/Makefile | 10 ++- src/index/indexer.cpp | 89 +++++++++++--------- src/index/indexer.h | 16 +++- src/index/mimetype.cpp | 148 +++++++++++++++++++++------------- src/index/mimetype.h | 4 +- src/internfile/internfile.cpp | 17 +++- src/sampleconf/recoll.conf | 7 +- 7 files changed, 190 insertions(+), 101 deletions(-) diff --git a/src/index/Makefile b/src/index/Makefile index 916b6f04..45a79ba8 100644 --- a/src/index/Makefile +++ b/src/index/Makefile @@ -3,7 +3,7 @@ include ../mk/sysconf BIGLIB = ../lib/librcl.a MIMELIB = ../bincimapmime/libmime.a -PROGS = recollindex csguess +PROGS = recollindex csguess mimetype all: $(PROGS) RECOLLINDEX_OBJS= recollindex.o $(BIGLIB) $(MIMELIB) @@ -21,6 +21,14 @@ trcsguess.o : csguess.cpp $(CXX) $(CXXFLAGS) -DTEST_CSGUESS -c -o trcsguess.o \ csguess.cpp +MIMETYPE_OBJS= trmimetype.o $(BIGLIB) +mimetype : $(MIMETYPE_OBJS) + $(CXX) $(CXXFLAGS) -o mimetype $(MIMETYPE_OBJS) \ + $(LIBICONV) +trmimetype.o : mimetype.cpp + $(CXX) $(CXXFLAGS) -DTEST_MIMETYPE -c -o trmimetype.o \ + mimetype.cpp + clean: rm -f *.o $(PROGS) diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 7be8e43d..87cf996c 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.13 2005-11-05 14:40:50 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.14 2005-11-10 08:47:49 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -15,7 +15,6 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.13 2005-11-05 14:40:50 dockes Exp #include "conftree.h" #include "rclconfig.h" #include "fstreewalk.h" -#include "mimetype.h" #include "rcldb.h" #include "readfile.h" #include "indexer.h" @@ -32,11 +31,12 @@ 
using namespace std; #define deleteZ(X) {delete X;X = 0;} #endif -/** - * Bunch holder for data used while indexing a directory tree. This also the - * tree walker callback object (the processone method gets called for every - * file or directory). - */ +/// A class to index a list of top directories into one database. +/// +/// Inherits FsTreeWalkerCB so that its processone() method is +/// called by the file-system tree walk code for each file and +/// directory, and keeps all state used while indexing a +/// directory tree. class DbIndexer : public FsTreeWalkerCB { FsTreeWalker walker; RclConfig *config; @@ -45,9 +45,9 @@ class DbIndexer : public FsTreeWalkerCB { Rcl::Db db; string tmpdir; public: + /// Constructor does nothing but store parameters DbIndexer(RclConfig *cnf, const string &dbd, list *top) - : config(cnf), dbdir(dbd), topdirs(top) - { } + : config(cnf), dbdir(dbd), topdirs(top) {} virtual ~DbIndexer() { // Maybe clean up temporary directory @@ -60,19 +60,21 @@ class DbIndexer : public FsTreeWalkerCB { } } + /// Start indexing. + bool index(); + + /// Tree walker callback method FsTreeWalker::Status processone(const std::string &, const struct stat *, FsTreeWalker::CbFlag); - - // The top level entry point. - bool index(); }; -// Top level file system tree index method for updating a given database. -// -// We create the temporary directory, open the database, then call a -// file system walk for each top-level directory. -// When walking is done, we create the stem databases and close the main db. +/// Top level file system tree index method for updating a given database. +/// +/// We create the temporary directory, open the database, then call a +/// file system walk for each top-level directory. +/// When walking is done, we create the stem databases and close the +/// main db. 
bool DbIndexer::index() { string tdir; @@ -90,9 +92,13 @@ bool DbIndexer::index() it != topdirs->end(); it++) { LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(), dbdir.c_str())); + + // Set the current directory in config so that subsequent + // getConfParams() will get local values config->setKeyDir(*it); - // Set up skipped patterns for this subtree + // Set up skipped patterns for this subtree. This probably should be + // done in the directory change code in processone() instead. { walker.clearSkippedNames(); string skipped; @@ -106,6 +112,7 @@ bool DbIndexer::index() } } + // Walk the directory tree if (walker.walk(*it, *this) != FsTreeWalker::FtwOk) { LOGERR(("DbIndexer::index: error while indexing %s\n", it->c_str())); @@ -113,6 +120,9 @@ bool DbIndexer::index() return false; } } + + // Get rid of all database entries that don't exist in the + // filesystem anymore. db.purge(); // Create stemming databases @@ -135,22 +145,23 @@ bool DbIndexer::index() } -/** - * This function gets called for every file and directory found by the - * tree walker. It checks with the db if the file has changed and needs to - * be reindexed. If so, it calls internfile() which will identify the - * file type and call an appropriate handler to create documents in - * internal form, which we then add to the database. - * - * Accent and majuscule handling are performed by the db module when doing - * the actual indexing work. The Rcl::Doc created by internfile() - contains pretty raw utf8 data. - */ +/// This method gets called for every file and directory found by the +/// tree walker. +/// +/// It checks with the db if the file has changed and needs to be +/// reindexed. If so, it calls internfile() which will identify the +/// file type and call an appropriate handler to convert the document into +/// internal format, which we then add to the database. +/// +/// Accent and majuscule handling are performed by the db module when doing +/// the actual indexing work. 
The Rcl::Doc created by internfile() +/// contains pretty raw utf8 data. FsTreeWalker::Status DbIndexer::processone(const std::string &fn, const struct stat *stp, - FsTreeWalker::CbFlag flg) + FsTreeWalker::CbFlag flg) { - // If we're changing directories, possibly adjust parameters. + // If we're changing directories, possibly adjust parameters (set + // the current directory in configuration object) if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwDirReturn) { config->setKeyDir(fn); @@ -189,9 +200,13 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, return FsTreeWalker::FtwOk; } -ConfIndexer::~ConfIndexer() +//////////////////////////////////////////////////////////////////////////// +// ConfIndexer methods: ConfIndexer is the top-level object, that can index +// multiple directories to multiple databases. + +ConfIndexer::~ConfIndexer() { - deleteZ(indexer); + deleteZ(dbindexer); } bool ConfIndexer::index() @@ -245,12 +260,12 @@ bool ConfIndexer::index() //} //cout << endl; - indexer = new DbIndexer(config, dbit->first, &dbit->second); - if (!indexer->index()) { - deleteZ(indexer); + dbindexer = new DbIndexer(config, dbit->first, &dbit->second); + if (!dbindexer->index()) { + deleteZ(dbindexer); return false; } - deleteZ(indexer); + deleteZ(dbindexer); } return true; } diff --git a/src/index/indexer.h b/src/index/indexer.h index 6eddbbb7..ad6d04aa 100644 --- a/src/index/indexer.h +++ b/src/index/indexer.h @@ -1,21 +1,31 @@ #ifndef _INDEXER_H_INCLUDED_ #define _INDEXER_H_INCLUDED_ -/* @(#$Id: indexer.h,v 1.5 2005-03-17 15:35:49 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: indexer.h,v 1.6 2005-11-10 08:47:49 dockes Exp $ (C) 2004 J.F.Dockes */ #include "rclconfig.h" + +/** + * An internal class to process all directories indexed into the same database. + */ class DbIndexer; /** * The file system indexing object. Processes the configuration, then invokes * file system walking to populate/update the database(s).
+ * + * Multiple top-level directories can be listed in the + * configuration. Each can be indexed to a different + * database. Directories are first grouped by database, then an + * internal class (DbIndexer) is used to process each group. */ class ConfIndexer { RclConfig *config; - DbIndexer *indexer; // Internal object used to store opaque private data + DbIndexer *dbindexer; // Object to process directories for a given db public: enum runStatus {IndexerOk, IndexerError}; - ConfIndexer(RclConfig *cnf) : config(cnf), indexer(0) {} + ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {} virtual ~ConfIndexer(); + /** Worker function: does the actual indexing */ bool index(); }; diff --git a/src/index/mimetype.cpp b/src/index/mimetype.cpp index 3b94179f..cff38e67 100644 --- a/src/index/mimetype.cpp +++ b/src/index/mimetype.cpp @@ -1,12 +1,13 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.9 2005-04-07 09:05:39 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.10 2005-11-10 08:47:49 dockes Exp $ (C) 2004 J.F.Dockes"; #endif +#ifndef TEST_MIMETYPE #include - #include -using std::string; #include + +using std::string; using std::list; #include "mimetype.h" @@ -16,44 +17,72 @@ using std::list; #include "smallut.h" #include "idfile.h" -// The system 'file' utility is not that great for us. For exemple it -// will mistake mail folders for simple text files if there is no -// 'Received' header, which would be the case, for exemple in a 'Sent' -// folder. Also "file -i" does not exist on all systems -static string mimetypefromdata(const string &fn) +#define USE_SYSTEM_FILE_COMMAND + +/// Identification of file from contents. This is called for files with +/// unrecognized extensions (none, or not known either for indexing or +/// stop list) +/// +/// The system 'file' utility is not that great for us.
For example it +/// will mistake mail folders for simple text files if there is no +/// 'Received' header, which would be the case, for example in a 'Sent' +/// folder. Also "file -i" does not exist on all systems, and it's +/// quite costly. +/// So we first call the internal file identifier, which currently +/// only knows about mail, but in which we can add the more +/// current/interesting file types. +/// As a last resort we execute 'file' + +static string mimetypefromdata(const string &fn, bool usfc) { string mime; -#ifdef USE_SYSTEM_FILE_UTILITY - list args; - args.push_back("-i"); - args.push_back(fn); - ExecCmd ex; - string result; - string cmd = "file"; - int status = ex.doexec(cmd, args, 0, &result); - if (status) { - LOGERR(("mimetypefromdata: doexec: status 0x%x\n", status)); - return ""; - } - // LOGDEB(("mimetypefromdata: %s [%s]\n", result.c_str(), fn.c_str())); - list res; - ConfTree::stringToStrings(result, res); - if (res.size() <= 1) - return ""; - list::iterator it = res.begin(); - it++; - mime = *it; - - if (mime.length() > 0 && !isalpha(mime[mime.length() - 1])) - mime.erase(mime.length() -1); -#else + // In any case first try the internal identifier mime = idFile(fn.c_str()); + +#ifdef USE_SYSTEM_FILE_COMMAND + if (usfc && mime == "") { + // Last resort: use "file -i" + list args; + + args.push_back("-i"); + args.push_back(fn); + ExecCmd ex; + string result; + string cmd = "file"; + int status = ex.doexec(cmd, args, 0, &result); + if (status) { + LOGERR(("mimetypefromdata: doexec: status 0x%x\n", status)); + return ""; + } + // LOGDEB(("mimetypefromdata: %s [%s]\n", result.c_str(), fn.c_str())); + + // The result of 'file' execution begins with the file name + // which may contain spaces.
We happen to know its size, so + // strip it: + result = result.substr(fn.size()); + // Now looks like ": text/plain; charset=us-ascii" + // Split it, and take second field + list res; + ConfTree::stringToStrings(result, res); + if (res.size() <= 1) + return ""; + list::iterator it = res.begin(); + it++; + mime = *it; + // Remove possible punctuation at the end + if (mime.length() > 0 && !isalpha(mime[mime.length() - 1])) + mime.erase(mime.length() -1); + } #endif + return mime; } -string mimetype(const string &fn, ConfTree *mtypes) +/// Guess mime type, first from suffix, then from file data. We also +/// have a list of suffixes that we don't touch at all (ie: .jpg, +/// etc...) +string mimetype(const string &fn, ConfTree *mtypes, bool usfc) { if (mtypes == 0) return ""; @@ -94,35 +123,46 @@ string mimetype(const string &fn, ConfTree *mtypes) } // Look at file data ? Only when no suffix or always ? - //if (suff.empty()) // causes problems with shifted files, like - // messages.1, messages.2 etc... - return mimetypefromdata(fn); - - return ""; +#if 0 + // Don't do this only for empty suffixes: would cause problems + // with shifted files, like messages.1, messages.2 etc... 
And others too + if (suff.empty()) +#endif + return mimetypefromdata(fn, usfc); } -#ifdef _TEST_MIMETYPE_ +#else // TEST-> + #include -const char *tvec[] = { - "/toto/tutu", - "/", - "toto.txt", - "toto.TXT", - "toto.C.txt", - "toto.C1", - "", -}; -const int n = sizeof(tvec) / sizeof(char*); + +#include "debuglog.h" +#include "rclconfig.h" +#include "rclinit.h" +#include "mimetype.h" + using namespace std; int main(int argc, const char **argv) { - mapmtypes; - mtypes[".txt"] = "text/plain"; + string reason; + RclConfig *config = recollinit(0, 0, reason); - for (int i = 0; i < n; i++) { - cout << tvec[i] << " -> " << mimetype(string(tvec[i]), mtypes) << endl; + if (config == 0 || !config->ok()) { + string str = "Configuration problem: "; + str += reason; + fprintf(stderr, "%s\n", str.c_str()); + exit(1); } + + while (--argc > 0) { + string filename = *++argv; + cout << filename << " -> " << + mimetype(filename, config->getMimeMap(), true) << endl; + + } + return 0; } -#endif + + +#endif // TEST diff --git a/src/index/mimetype.h b/src/index/mimetype.h index da22e165..44b55e28 100644 --- a/src/index/mimetype.h +++ b/src/index/mimetype.h @@ -1,6 +1,6 @@ #ifndef _MIMETYPE_H_INCLUDED_ #define _MIMETYPE_H_INCLUDED_ -/* @(#$Id: mimetype.h,v 1.2 2004-12-14 17:54:16 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mimetype.h,v 1.3 2005-11-10 08:47:49 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include "conftree.h" @@ -11,7 +11,7 @@ * This may imply more than matching the suffix, the name must be usable * to actually access file data. 
*/ -string mimetype(const std::string &filename, ConfTree *mtypes); +string mimetype(const std::string &filename, ConfTree *mtypes, bool usfc); #endif /* _MIMETYPE_H_INCLUDED_ */ diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 409d5b4f..484cc361 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.6 2005-11-08 21:02:55 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.7 2005-11-10 08:47:49 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -86,7 +86,18 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf, const string& td) : fn(f), config(cnf), tdir(td), handler(0) { - mime = mimetype(fn, config->getMimeMap()); + // Note that we are actually going to access the file, so that it's ok + // to check this config variable at every call even if it can only change + // when we change directories + string usfc; + int usfci; + if (!cnf->getConfParam("usesystemfilecommand", usfc)) + usfci = 0; + else + usfci = atoi(usfc.c_str()) ? 1 : 0; + LOGDEB1(("FileInterner::FileInterner: usfci now %d\n", usfci)); + + mime = mimetype(fn, config->getMimeMap(), usfci); if (mime.empty()) { // No mime type: not listed in our map, or present in stop list LOGDEB(("FileInterner::FileInterner: (no mime) [%s]\n", fn.c_str())); @@ -104,7 +115,7 @@ FileInterner::FileInterner(const std::string &f, RclConfig *cnf, LOGDEB(("internfile: after ucomp: tdir %s, tfile %s\n", tdir.c_str(), tfile.c_str())); fn = tfile; - mime = mimetype(fn, config->getMimeMap()); + mime = mimetype(fn, config->getMimeMap(), usfci); if (mime.empty()) { // No mime type ?? pass on. 
LOGDEB(("internfile: (no mime) [%s]\n", fn.c_str())); diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index 229f64ef..773f36f5 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -1,4 +1,4 @@ -# @(#$Id: recoll.conf,v 1.5 2005-10-20 11:38:53 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: recoll.conf,v 1.6 2005-11-10 08:47:49 dockes Exp $ (C) 2004 J.F.Dockes # Recoll default configuration file. This should be copied to # ~/.recoll/recoll.conf @@ -40,6 +40,11 @@ defaultlanguage = english # Guessing charsets usually does not work well guesscharset = 0 +# Should we use the system's 'file -i' command as a final step in file type +# identification? This may be useful, but will usually cause the +# indexing of many bogus 'text' files +usesystemfilecommand = 1 + # You could specify different parameters for a subdirectory like this. No # tilde substitution there for now, sorry: #[/home/me/englishdocs/plain]