replaced /usr/bin/file exec with internal code

This commit is contained in:
dockes 2005-04-07 09:05:39 +00:00
parent 1293f0d834
commit 11bb233ba5
6 changed files with 169 additions and 17 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.8 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.9 2005-04-07 09:05:39 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <ctype.h>
@ -14,9 +14,16 @@ using std::list;
#include "execmd.h"
#include "conftree.h"
#include "smallut.h"
#include "idfile.h"
// The system 'file' utility is not that great for us. For example, it
// will mistake mail folders for simple text files if there is no
// 'Received' header, which would be the case, for example, in a 'Sent'
// folder. Also, "file -i" does not exist on all systems
static string mimetypefromdata(const string &fn)
{
string mime;
#ifdef USE_SYSTEM_FILE_UTILITY
list<string> args;
args.push_back("-i");
@ -36,11 +43,13 @@ static string mimetypefromdata(const string &fn)
return "";
list<string>::iterator it = res.begin();
it++;
string mime = *it;
mime = *it;
if (mime.length() > 0 && !isalpha(mime[mime.length() - 1]))
mime.erase(mime.length() -1);
#else
mime = idFile(fn.c_str());
#endif
return mime;
}
@ -64,8 +73,8 @@ string mimetype(const string &fn, ConfTree *mtypes)
continue;
if (!stringicmp(fn.substr(fn.length() - it->length(),string::npos),
*it)) {
LOGINFO(("mimetype: fn %s in stoplist (%s)\n", fn.c_str(),
it->c_str()));
LOGDEB(("mimetype: fn %s in stoplist (%s)\n", fn.c_str(),
it->c_str()));
return "";
}
}
@ -85,11 +94,6 @@ string mimetype(const string &fn, ConfTree *mtypes)
}
// Look at file data ? Only when no suffix or always ?
// Also 'file' is not that great for us. For exemple it will
// mistake mail folders for simple text files if there is no 'Received'
// header, which would be the case, for exemple in a 'Sent' folder. Also
// I'm not sure that file -i exists on all systems
//if (suff.empty()) // causes problems with shifted files, like
// messages.1, messages.2 etc...
return mimetypefromdata(fn);

View File

@ -8,13 +8,13 @@ all: $(LIBS)
OBJS = conftree.o csguess.o debuglog.o \
execmd.o wipedir.o \
fstreewalk.o html.o mail.o htmlparse.o indexer.o internfile.o \
fstreewalk.o html.o mail.o htmlparse.o idfile.o indexer.o internfile.o \
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \
textsplit.o transcode.o \
unacpp.o unac.o
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
../utils/execmd.cpp ../utils/wipedir.cpp \
../utils/execmd.cpp ../utils/idfile.cpp ../utils/wipedir.cpp \
../utils/fstreewalk.cpp ../common/html.cpp ../common/mail.cpp \
../common/htmlparse.cpp \
../index/indexer.cpp ../common/internfile.cpp \
@ -48,6 +48,8 @@ html.o : ../common/html.cpp
$(CXX) $(CXXFLAGS) -c $<
htmlparse.o : ../common/htmlparse.cpp
$(CXX) $(CXXFLAGS) -c $<
idfile.o : ../utils/idfile.cpp
$(CXX) $(CXXFLAGS) -c $<
indexer.o : ../index/indexer.cpp
$(CXX) $(CXXFLAGS) -c $<
internfile.o : ../common/internfile.cpp

View File

@ -1,4 +1,4 @@
# @(#$Id: mimemap,v 1.4 2005-04-05 09:35:35 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: mimemap,v 1.5 2005-04-07 09:05:39 dockes Exp $ (C) 2004 J.F.Dockes
# Recoll: associations of file name extensions to mime types
.txt = text/plain
@ -41,8 +41,8 @@
# suffixes listed in there speeds up things quite a lot by avoiding
# unneeded decompression or 'file' calls
recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz \
.c .h .cpp .m4 .tcl .js .sh .pl .awk \
.o .lib .dll .a \
.dat .bak .rdf .log .db .ini .gnm .gnumeric \
.jpg .gif .bmp .xpm
.c .h .cpp .m4 .tcl .js .sh .pl .awk \
.o .lib .dll .a \
.dat .bak .rdf .log .db .ini .gnm .gnumeric .msf \
.jpg .gif .bmp .xpm

View File

@ -33,6 +33,13 @@ trtranscode.o : ../utils/transcode.cpp
$(CXX) $(CXXFLAGS) -DTEST_TRANSCODE -c -o trtranscode.o \
transcode.cpp
IDFILE_OBJS= tridfile.o $(BIGLIB)
idfile : $(IDFILE_OBJS)
$(CXX) $(CXXFLAGS) -o idfile $(IDFILE_OBJS) $(LIBICONV)
tridfile.o : ../utils/idfile.cpp
$(CXX) $(CXXFLAGS) -DTEST_IDFILE -c -o tridfile.o \
idfile.cpp
MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB)
trmimeparse : $(MIMEPARSE_OBJS)
$(CXX) $(CXXFLAGS) -o trmimeparse $(MIMEPARSE_OBJS) $(LIBICONV)

127
src/utils/idfile.cpp Normal file
View File

@ -0,0 +1,127 @@
#ifndef lint
static char rcsid[] = "@(#$Id: idfile.cpp,v 1.1 2005-04-07 09:05:39 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#ifndef TEST_IDFILE
#include <ctype.h>
#include <string.h>  // for strchr(3), strlen(3), strncmp(3)
#include <strings.h> // for strncasecmp(3)
#include <unistd.h> // for access(2)
#include <fstream>
#include <sstream>
#include <string>
#include "debuglog.h"
using namespace std;
// Header field prefixes that idFile() counts as evidence of mail content.
// Each entry includes the colon and the space that follows it.
static const char *mailhs[] = {
    "From: ",
    "Received: ",
    "Message-Id: ",
    "To: ",
    "Date: ",
    "Subject: ",
    "Status: "
};
// mailhsl[i] is the number of characters of mailhs[i] to compare
static const int mailhsl[] = {
    6,  // From:
    10, // Received:
    12, // Message-Id:
    4,  // To:
    6,  // Date:
    9,  // Subject:
    8   // Status:
};
// Number of entries in the tables above
static const int nmh = sizeof(mailhs) / sizeof(mailhs[0]);
// How many pieces of evidence are needed to decide that a file is mail
const int wantnhead = 3;
// Try to identify a file as mail by looking at its first lines.
//
// Evidence is counted as follows: each line starting with one of the
// mailhs[] header prefixes (compared case-insensitively) counts for one,
// and an mbox "From " separator on the very first line counts for one
// more. The file is considered mail when the count reaches wantnhead.
//
// @param fn path of the file to examine.
// @return "text/x-mail" if the file looks like mail and its first line is
//   a "From " separator, "message/rfc822" if it looks like mail without
//   such a line, and an empty string if the file could not be opened or
//   read, contains a line longer than 1000 characters, or does not look
//   like mail.
string idFile(const char *fn)
{
    ifstream input;
    input.open(fn, ios::in);
    if (!input.is_open()) {
        LOGERR(("idFile: could not open [%s]\n", fn));
        return string("");
    }

    bool line1HasFrom = false;
    int lookslikemail = 0;

    // Size of the line buffer (not counting the final null byte)
    const int LL = 1024;

    // emacs VM sometimes inserts very long lines with continuations or
    // not (for folder information). This forces us to look at many
    // lines and long ones. At most 199 lines are examined.
    for (int lnum = 1; lnum < 200; lnum++) {
        char cline[LL+1];
        cline[LL] = 0;
        input.getline(cline, LL-1);
        if (input.fail()) {
            if (input.bad()) {
                LOGERR(("idfile: error while reading [%s]\n", fn));
                return string("");
            }
            // Not a read error: either end of file, or a line which did
            // not fit in the buffer (getline() sets failbit in both
            // cases). Stop reading and decide from what was seen so far.
            break;
        }
        LOGDEB2(("idfile: lnum %d : [%s]\n", lnum, cline));

        // An mbox folder begins with a "From " separator line. This is
        // not a header field, so it must not be subjected to the colon
        // test below.
        const bool isFromLine = (lnum == 1 && !strncmp("From ", cline, 5));

        // Check for a few things that can't be found in a mail file
        // (optimization to get a quick negative).
        // Header lines must begin with whitespace (continuation) or have
        // a colon at an offset of at most 70 (hope no one comes up with
        // a longer header name!). An empty line fails the test too.
        // The cast is needed because calling isspace() with a negative
        // value is undefined, and plain char may be signed.
        if (!isFromLine && !isspace(static_cast<unsigned char>(cline[0]))) {
            const char *cp = strchr(cline, ':');
            if (cp == 0 || (cp - cline) > 70) {
                LOGDEB2(("idfile: can't be mail header line: [%s]\n", cline));
                break;
            }
        }

        int ll = strlen(cline);
        if (ll > 1000) {
            LOGDEB2(("idFile: Line too long\n"));
            return string("");
        }

        if (isFromLine) {
            // Counted after the loop, so that the header lines alone
            // decide when to stop reading
            line1HasFrom = true;
            continue;
        }

        for (int i = 0; i < nmh; i++) {
            if (!strncasecmp(mailhs[i], cline, mailhsl[i])) {
                //fprintf(stderr, "Got [%s]\n", mailhs[i]);
                lookslikemail++;
                break;
            }
        }
        if (lookslikemail >= wantnhead)
            break;
    }

    if (line1HasFrom)
        lookslikemail++;

    if (lookslikemail >= wantnhead)
        return line1HasFrom ? string("text/x-mail") : string("message/rfc822");

    return string("");
}
#else
#include <string>
#include <iostream>
#include <unistd.h>
#include <fcntl.h>
using namespace std;
#include "debuglog.h"
#include "idfile.h"
// Test driver, compiled when TEST_IDFILE is defined: prints the name of
// the file given on the command line followed by what idFile() returns
// for it.
//
// Usage: idfile filename
// @return 1 if the argument count is wrong, else 0.
int main(int argc, char **argv)
{
    if (argc != 2) {
        cerr << "Usage: idfile filename" << endl;
        // Return instead of calling exit(): same process status, and no
        // dependency on <stdlib.h>, which this file does not include
        return 1;
    }

    // NOTE(review): presumably sends debug-level log messages to stderr;
    // confirm against debuglog.h
    DebugLog::getdbl()->setloglevel(DEBDEB1);
    DebugLog::setfilename("stderr");

    string mime = idFile(argv[1]);
    cout << argv[1] << " : " << mime << endl;
    return 0;
}
#endif

12
src/utils/idfile.h Normal file
View File

@ -0,0 +1,12 @@
#ifndef _IDFILE_H_INCLUDED_
#define _IDFILE_H_INCLUDED_
/* @(#$Id: idfile.h,v 1.1 2005-04-07 09:05:39 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
// Return mime type for file or empty string. The system's file utility does
// a bad job on mail folders. idFile only looks for mail file types for now,
// but this may change.
//
// @param fn path of the file to examine.
// @return "text/x-mail" when the file looks like mail and begins with a
//   "From " line, "message/rfc822" when it looks like mail without such a
//   line, and an empty string when the file can't be opened or read or is
//   not recognized.
extern std::string idFile(const char *fn);
#endif /* _IDFILE_H_INCLUDED_ */