*** empty log message ***

This commit is contained in:
dockes 2005-04-04 13:18:47 +00:00
parent 04b279dcd5
commit 50b927f65c
8 changed files with 149 additions and 61 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.9 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.10 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -89,6 +89,22 @@ bool DbIndexer::index()
it != topdirs->end(); it++) {
LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(),
dbdir.c_str()));
config->setKeyDir(*it);
// Set up skipped patterns for this subtree
{
walker.clearSkippedNames();
string skipped;
if (config->getConfParam("skippedNames", skipped)) {
list<string> skpl;
ConfTree::stringToStrings(skipped, skpl);
list<string>::const_iterator it;
for (it = skpl.begin(); it != skpl.end(); it++) {
walker.addSkippedName(*it);
}
}
}
if (walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
LOGERR(("DbIndexer::index: error while indexing %s\n",
it->c_str()));

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.6 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.7 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <ctype.h>
@ -82,8 +82,12 @@ string mimetype(const string &fn, ConfTree *mtypes)
return mtype;
}
// Look at file data ? Only when no suffix
if (suff.empty())
// Look at file data ? Only when no suffix or always
// Also 'file' is not that great for us. For example it will
// mistake mail folders for simple text files if there is no 'Received'
// header, which would be the case, for example, in a 'Sent' folder. Also
// I'm not sure that file -i exists on all systems
//if (suff.empty())
return mimetypefromdata(fn);
return "";
}

View File

@ -63,11 +63,11 @@ MimeHandlerHtml::worker1(RclConfig *conf, const string &,
string charset;
if (!charsethint.empty()) {
charset = charsethint;
if (conf->getGuessCharset()) {
} else if (conf->getGuessCharset()) {
charset = csguess(htext, conf->getDefCharset());
} else
charset = conf->getDefCharset();
}
// - We first try to convert from the default configured charset
// (which may depend on the current directory) to utf-8. If this
@ -75,7 +75,7 @@ MimeHandlerHtml::worker1(RclConfig *conf, const string &,
// - During parsing, if we find a charset parameter, and it differs from
// what we started with, we abort and restart with the parameter value
// instead of the configuration one.
LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
MyHtmlParser pres;
for (int pass = 0; pass < 2; pass++) {

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.2 2005-03-31 10:04:07 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.3 2005-04-04 13:18:46 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <stdio.h>
@ -46,7 +46,7 @@ MimeHandler::Status
MimeHandlerMail::worker(RclConfig *cnf, const string &fn,
const string &mtype, Rcl::Doc &docout, string& ipath)
{
LOGDEB(("MimeHandlerMail::worker: %s [%s]\n", mtype.c_str(), fn.c_str()));
LOGDEB2(("MimeHandlerMail::worker: %s [%s]\n", mtype.c_str(), fn.c_str()));
conf = cnf;
if (!stringlowercmp("message/rfc822", mtype)) {
@ -75,7 +75,7 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
if (ipath != "") {
sscanf(ipath.c_str(), "%d", &mtarg);
}
LOGDEB(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(),
LOGDEB2(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(),
mtarg));
FILE *fp;
@ -125,7 +125,6 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
}
}
msgnum++;
LOGDEB(("MimeHandlerMail::processmbox: got msg %d\n", msgnum));
fseek(fp, end, SEEK_SET);
} while (mtarg > 0 && msgnum < mtarg);
@ -173,25 +172,37 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc,
}
// Handle some headers. We should process rfc2047 encoding here
// Also there should be no 8bit chars, but there sometimes are. So
// we transcode as if from iso-8859-1, which is better than
// getting utf8 conversion errors later on
Binc::HeaderItem hi;
string transcoded;
if (doc.h.getFirstHeader("Subject", hi)) {
docout.title = hi.getValue();
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
docout.title = transcoded;
}
if (doc.h.getFirstHeader("From", hi)) {
docout.text += string("From: ") + hi.getValue() + string("\n");
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
docout.text += string("From: ") + transcoded + string("\n");
}
if (doc.h.getFirstHeader("To", hi)) {
docout.text += string("To: ") + hi.getValue() + string("\n");
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
docout.text += string("To: ") + transcoded + string("\n");
}
if (doc.h.getFirstHeader("Date", hi)) {
docout.text += string("Date: ") + hi.getValue() + string("\n");
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
docout.text += string("Date: ") + transcoded + string("\n");
}
if (doc.h.getFirstHeader("Subject", hi)) {
transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
docout.text += string("Subject: ") + transcoded + string("\n");
}
LOGDEB(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n",
LOGDEB2(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n",
doc.isMultipart(), doc.getSubType().c_str()));
walkmime(conf, docout.text, doc, 0);
LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str()));
//LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str()));
return MimeHandler::MHDone;
}
@ -206,13 +217,14 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc,
}
if (doc.isMultipart()) {
LOGDEB(("walkmime: ismultipart %d subtype '%s'\n",
LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n",
doc.isMultipart(), doc.getSubType().c_str()));
// We only handle alternative and mixed for now. For
// We only handle alternative, related and mixed for now. For
// alternative, we look for a text/plain part, else html and
// process it For mixed, we process each part.
// process it. For mixed and related, we process each part.
std::vector<Binc::MimePart>::iterator it;
if (!stringicmp("mixed", doc.getSubType())) {
if (!stringicmp("mixed", doc.getSubType()) ||
!stringicmp("related", doc.getSubType())) {
for (it = doc.members.begin(); it != doc.members.end();it++) {
walkmime(cnf, out, *it, depth+1);
}
@ -247,18 +259,32 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc,
if (doc.h.getFirstHeader("Content-Type", hi)) {
ctt = hi.getValue();
}
LOGDEB(("walkmime:content-type: %s\n", ctt.c_str()));
LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
MimeHeaderValue content_type;
parseMimeHeaderValue(ctt, content_type);
if (stringlowercmp("text/plain", content_type.value) &&
stringlowercmp("text/html", content_type.value)) {
return;
}
string charset = "us-ascii";
// Normally the default charset is us-ascii. But it happens that
// 8 bit chars exist in a message that is stated as us-ascii. E.g. the
// mailer used by yahoo support ('KANA') does this. We could convert
// to iso-8859 only if the transfer-encoding is 8 bit, or test for
// actual 8 bit chars, but what the heck, let's use 8859-1 as default
string charset = "iso-8859-1";
map<string,string>::const_iterator it;
it = content_type.params.find(string("charset"));
if (it != content_type.params.end())
charset = it->second;
if (charset.empty() ||
!stringlowercmp("us-ascii", charset) ||
!stringlowercmp("default", charset) ||
!stringlowercmp("x-user-defined", charset) ||
!stringlowercmp("x-unknown", charset) ||
!stringlowercmp("unknown", charset) ) {
charset = "iso-8859-1";
}
// Content disposition
string ctd = "inline";
@ -277,7 +303,7 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc,
cte = hi.getValue();
}
LOGDEB(("walkmime: final: body start offset %d, length %d\n",
LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
doc.getBodyStartOffset(), doc.getBodyLength()));
string body;
doc.getBody(body, 0, doc.bodylength);

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.25 2005-03-31 10:04:07 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.26 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -216,7 +216,8 @@ bool Rcl::dumb_string(const string &in, string &out)
return true;
if (!unac_cpp(in, inter)) {
LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
return false;
// Ok, no need to stop the whole show
inter = "";
}
out.reserve(inter.length());
for (unsigned int i = 0; i < inter.length(); i++) {
@ -268,7 +269,7 @@ truncate_to_word(string & input, string::size_type maxlen)
bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
{
LOGDEB(("Rcl::Db::add: fn %s\n", fn.c_str()));
LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str()));
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
@ -288,7 +289,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
TextSplit splitter(&splitData);
string noacc;
if (!unac_cpp(doc.title, noacc)) {
if (!dumb_string(doc.title, noacc)) {
LOGERR(("Rcl::Db::add: unac failed\n"));
return false;
}

View File

@ -1,4 +1,4 @@
# @(#$Id: mimeconf,v 1.5 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes
# @(#$Id: mimeconf,v 1.6 2005-04-04 13:18:47 dockes Exp $ (C) 2004 J.F.Dockes
# Recoll : associations of mime types to processing filters.
# There are different sections for decompression, 'interning' for indexing
@ -49,7 +49,8 @@ application/vnd.sun.xml.writer.template = exec rclsoff
# External viewers, launched when you double-click a result entry
[view]
text/plain = xemacs %f
text/html = firefox -remote "openFile(%u)"
#text/html = firefox -remote "openFile(%u)"
text/html = firefox %u
application/pdf = xpdf %f
application/postscript = gv %f
application/msword = openoffice-1.1.3-swriter %f

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.3 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.4 2005-04-04 13:18:47 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_FSTREEWALK
@ -7,8 +7,10 @@ static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.3 2005-02-10 15:21:12 dockes E
#include <dirent.h>
#include <sys/stat.h>
#include <errno.h>
#include <fnmatch.h>
#include <sstream>
#include <list>
#include "debuglog.h"
#include "pathut.h"
@ -19,6 +21,7 @@ using namespace std;
class FsTreeWalker::Internal {
Options options;
stringstream reason;
list<string> skippedNames;
int errors;
void logsyserr(const char *call, const string &param)
{
@ -53,6 +56,18 @@ int FsTreeWalker::getErrCnt()
return data->errors;
}
// Append one pattern to the walker's list of skipped names. During
// walk(), each directory entry name is tested against these patterns
// with fnmatch() and matching entries are not processed.
// Always returns true.
bool FsTreeWalker::addSkippedName(const string& pattern)
{
    data->skippedNames.insert(data->skippedNames.end(), pattern);
    return true;
}
// Drop every skipped-name pattern previously registered with
// addSkippedName(), leaving the pattern list empty.
void FsTreeWalker::clearSkippedNames()
{
    data->skippedNames.erase(data->skippedNames.begin(),
                             data->skippedNames.end());
}
FsTreeWalker::Status FsTreeWalker::walk(const string &top,
FsTreeWalkerCB& cb)
{
@ -94,10 +109,23 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top,
struct dirent *ent;
while ((ent = readdir(d)) != 0) {
// We do process hidden files for now
// We do process hidden files for now, only skip . and ..
if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
continue;
if (!data->skippedNames.empty()) {
list<string>::const_iterator it;
for (it = data->skippedNames.begin();
it != data->skippedNames.end(); it++) {
if (fnmatch(it->c_str(), ent->d_name, 0) == 0) {
//fprintf(stderr,
//"Skipping [%s] because of pattern match\n", ent->d_name);
goto skip;
}
}
}
{
string fn = top;
path_cat(fn, ent->d_name);
@ -125,6 +153,10 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top,
goto out;
}
}
}
skip: ;
// We skip other file types (devices etc...)
}

View File

@ -1,6 +1,6 @@
#ifndef _FSTREEWALK_H_INCLUDED_
#define _FSTREEWALK_H_INCLUDED_
/* @(#$Id: fstreewalk.h,v 1.2 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: fstreewalk.h,v 1.3 2005-04-04 13:18:47 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
@ -22,6 +22,14 @@ class FsTreeWalker {
Status walk(const std::string &dir, FsTreeWalkerCB& cb);
std::string getReason();
int getErrCnt();
bool addSkippedName(const std::string &pattern); // Add a pattern
// for directory
// entries (file
// or dir) to be
// ignored (ie:
// #* , *~)
void clearSkippedNames(); // Clear all patterns
private:
class Internal;
Internal *data;