From 11bb233ba5e06cb390781f0a8979bc5b493da83f Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 7 Apr 2005 09:05:39 +0000 Subject: [PATCH] replaced /usr/bin/file exec with internal code --- src/index/mimetype.cpp | 24 ++++---- src/lib/Makefile | 6 +- src/sampleconf/mimemap | 10 ++-- src/utils/Makefile | 7 +++ src/utils/idfile.cpp | 127 +++++++++++++++++++++++++++++++++++++++++ src/utils/idfile.h | 12 ++++ 6 files changed, 169 insertions(+), 17 deletions(-) create mode 100644 src/utils/idfile.cpp create mode 100644 src/utils/idfile.h diff --git a/src/index/mimetype.cpp b/src/index/mimetype.cpp index de502ea2..3b94179f 100644 --- a/src/index/mimetype.cpp +++ b/src/index/mimetype.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.8 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.9 2005-04-07 09:05:39 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -14,9 +14,16 @@ using std::list; #include "execmd.h" #include "conftree.h" #include "smallut.h" +#include "idfile.h" +// The system 'file' utility is not that great for us. For example it +// will mistake mail folders for simple text files if there is no +// 'Received' header, which would be the case, for example in a 'Sent' +// folder. 
Also "file -i" does not exist on all systems static string mimetypefromdata(const string &fn) { + string mime; +#ifdef USE_SYSTEM_FILE_UTILITY list args; args.push_back("-i"); @@ -36,11 +43,13 @@ static string mimetypefromdata(const string &fn) return ""; list::iterator it = res.begin(); it++; - string mime = *it; + mime = *it; if (mime.length() > 0 && !isalpha(mime[mime.length() - 1])) mime.erase(mime.length() -1); - +#else + mime = idFile(fn.c_str()); +#endif return mime; } @@ -64,8 +73,8 @@ string mimetype(const string &fn, ConfTree *mtypes) continue; if (!stringicmp(fn.substr(fn.length() - it->length(),string::npos), *it)) { - LOGINFO(("mimetype: fn %s in stoplist (%s)\n", fn.c_str(), - it->c_str())); + LOGDEB(("mimetype: fn %s in stoplist (%s)\n", fn.c_str(), + it->c_str())); return ""; } } @@ -85,11 +94,6 @@ string mimetype(const string &fn, ConfTree *mtypes) } // Look at file data ? Only when no suffix or always ? - // Also 'file' is not that great for us. For exemple it will - // mistake mail folders for simple text files if there is no 'Received' - // header, which would be the case, for exemple in a 'Sent' folder. Also - // I'm not sure that file -i exists on all systems - //if (suff.empty()) // causes problems with shifted files, like // messages.1, messages.2 etc... 
return mimetypefromdata(fn); diff --git a/src/lib/Makefile b/src/lib/Makefile index ffcf7b43..a7d53bfb 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -8,13 +8,13 @@ all: $(LIBS) OBJS = conftree.o csguess.o debuglog.o \ execmd.o wipedir.o \ - fstreewalk.o html.o mail.o htmlparse.o indexer.o internfile.o \ + fstreewalk.o html.o mail.o htmlparse.o idfile.o indexer.o internfile.o \ mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \ rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \ textsplit.o transcode.o \ unacpp.o unac.o SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \ - ../utils/execmd.cpp ../utils/wipedir.cpp \ + ../utils/execmd.cpp ../utils/idfile.cpp ../utils/wipedir.cpp \ ../utils/fstreewalk.cpp ../common/html.cpp ../common/mail.cpp \ ../common/htmlparse.cpp \ ../index/indexer.cpp ../common/internfile.cpp \ @@ -48,6 +48,8 @@ html.o : ../common/html.cpp $(CXX) $(CXXFLAGS) -c $< htmlparse.o : ../common/htmlparse.cpp $(CXX) $(CXXFLAGS) -c $< +idfile.o : ../utils/idfile.cpp + $(CXX) $(CXXFLAGS) -c $< indexer.o : ../index/indexer.cpp $(CXX) $(CXXFLAGS) -c $< internfile.o : ../common/internfile.cpp diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap index 626170a5..f75ea825 100644 --- a/src/sampleconf/mimemap +++ b/src/sampleconf/mimemap @@ -1,4 +1,4 @@ -# @(#$Id: mimemap,v 1.4 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimemap,v 1.5 2005-04-07 09:05:39 dockes Exp $ (C) 2004 J.F.Dockes # Recoll: associations of file name extensions to mime types .txt = text/plain @@ -41,8 +41,8 @@ # suffixes listed in there speeds up things quite a lot by avoiding # unneeded decompression or 'file' calls recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz \ - .c .h .cpp .m4 .tcl .js .sh .pl .awk \ - .o .lib .dll .a \ - .dat .bak .rdf .log .db .ini .gnm .gnumeric \ - .jpg .gif .bmp .xpm + .c .h .cpp .m4 .tcl .js .sh .pl .awk \ + .o .lib .dll .a \ + .dat .bak .rdf .log .db .ini .gnm .gnumeric .msf \ + .jpg .gif .bmp 
.xpm diff --git a/src/utils/Makefile b/src/utils/Makefile index 07d08238..4365d71d 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -33,6 +33,13 @@ trtranscode.o : ../utils/transcode.cpp $(CXX) $(CXXFLAGS) -DTEST_TRANSCODE -c -o trtranscode.o \ transcode.cpp +IDFILE_OBJS= tridfile.o $(BIGLIB) +idfile : $(IDFILE_OBJS) + $(CXX) $(CXXFLAGS) -o idfile $(IDFILE_OBJS) $(LIBICONV) +tridfile.o : ../utils/idfile.cpp + $(CXX) $(CXXFLAGS) -DTEST_IDFILE -c -o tridfile.o \ + idfile.cpp + MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB) trmimeparse : $(MIMEPARSE_OBJS) $(CXX) $(CXXFLAGS) -o trmimeparse $(MIMEPARSE_OBJS) $(LIBICONV) diff --git a/src/utils/idfile.cpp b/src/utils/idfile.cpp new file mode 100644 index 00000000..e48af12d --- /dev/null +++ b/src/utils/idfile.cpp @@ -0,0 +1,127 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: idfile.cpp,v 1.1 2005-04-07 09:05:39 dockes Exp $ (C) 2005 J.F.Dockes"; +#endif +#ifndef TEST_IDFILE +#include // for access(2) +#include + +#include +#include + +#include "debuglog.h" + +using namespace std; + + +// Mail headers we compare to: +static const char *mailhs[] = {"From: ", "Received: ", "Message-Id: ", "To: ", + "Date: ", "Subject: ", "Status: "}; +static const int mailhsl[] = {6, 10, 12, 4, 6, 9, 8}; +static const int nmh = sizeof(mailhs) / sizeof(char *); + +const int wantnhead = 3; + +string idFile(const char *fn) +{ + ifstream input; + input.open(fn, ios::in); + if (!input.is_open()) { + LOGERR(("idFile: could not open [%s]\n", fn)); + return string(""); + } + + bool line1HasFrom = false; + int lookslikemail = 0; + + // emacs VM sometimes inserts very long lines with continuations or + // not (for folder information). 
This forces us to look at many + // lines and long ones + for (int lnum = 1; lnum < 200; lnum++) { + +#define LL 1024 + char cline[LL+1]; + cline[LL] = 0; + input.getline(cline, LL-1); + if (input.fail()) { + if (input.bad()) { + LOGERR(("idfile: error while reading [%s]\n", fn)); + return string(""); + } + // Must be eof, or a line that overflowed the buffer (getline sets failbit in both cases) + break; + } + + LOGDEB2(("idfile: lnum %d : [%s]\n", lnum, cline)); + // Check for a few things that can't be found in a mail file, + // (optimization to get a quick negative) + + // Lines must begin with whitespace or have a colon at offset + // 70 or less (hope no one comes up with a longer header + // name ! + if (!isspace(cline[0])) { + char *cp = strchr(cline, ':'); + if (cp == 0 || (cp - cline) > 70) { + LOGDEB2(("idfile: can't be mail header line: [%s]\n", cline)); + break; + } + } + + int ll = strlen(cline); + if (ll > 1000) { + LOGDEB2(("idFile: Line too long\n")); + return string(""); + } + if (lnum == 1) { + if (!strncmp("From ", cline, 5)) { + line1HasFrom = true; + continue; + } + } + + for (int i = 0; i < nmh; i++) { + if (!strncasecmp(mailhs[i], cline, mailhsl[i])) { + //fprintf(stderr, "Got [%s]\n", mailhs[i]); + lookslikemail++; + break; + } + } + if (lookslikemail >= wantnhead) + break; + } + if (line1HasFrom) + lookslikemail++; + + if (lookslikemail >= wantnhead) + return line1HasFrom ? 
string("text/x-mail") : string("message/rfc822"); + + return string(""); +} + + +#else + +#include +#include + +#include +#include + +using namespace std; + +#include "debuglog.h" +#include "idfile.h" + +int main(int argc, char **argv) +{ + if (argc != 2) { + cerr << "Usage: idfile filename" << endl; + exit(1); + } + DebugLog::getdbl()->setloglevel(DEBDEB1); + DebugLog::setfilename("stderr"); + string mime = idFile(argv[1]); + cout << argv[1] << " : " << mime << endl; + exit(0); +} + +#endif diff --git a/src/utils/idfile.h b/src/utils/idfile.h new file mode 100644 index 00000000..71d4535a --- /dev/null +++ b/src/utils/idfile.h @@ -0,0 +1,12 @@ +#ifndef _IDFILE_H_INCLUDED_ +#define _IDFILE_H_INCLUDED_ +/* @(#$Id: idfile.h,v 1.1 2005-04-07 09:05:39 dockes Exp $ (C) 2004 J.F.Dockes */ + +#include + +// Return mime type for file or empty string. The system's file utility does +// a bad job on mail folders. idFile only looks for mail file types for now, +// but this may change +extern std::string idFile(const char *fn); + +#endif /* _IDFILE_H_INCLUDED_ */