diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 083f3f74..8646878b 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.9 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.10 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -89,6 +89,22 @@ bool DbIndexer::index() it != topdirs->end(); it++) { LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(), dbdir.c_str())); + config->setKeyDir(*it); + + // Set up skipped patterns for this subtree + { + walker.clearSkippedNames(); + string skipped; + if (config->getConfParam("skippedNames", skipped)) { + list skpl; + ConfTree::stringToStrings(skipped, skpl); + list::const_iterator it; + for (it = skpl.begin(); it != skpl.end(); it++) { + walker.addSkippedName(*it); + } + } + } + if (walker.walk(*it, *this) != FsTreeWalker::FtwOk) { LOGERR(("DbIndexer::index: error while indexing %s\n", it->c_str())); diff --git a/src/index/mimetype.cpp b/src/index/mimetype.cpp index be609854..5fbc2905 100644 --- a/src/index/mimetype.cpp +++ b/src/index/mimetype.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.6 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.7 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -82,8 +82,12 @@ string mimetype(const string &fn, ConfTree *mtypes) return mtype; } - // Look at file data ? Only when no suffix - if (suff.empty()) + // Look at file data ? Only when no suffix or always + // Also 'file' is not that great for us. For example it will + // mistake mail folders for simple text files if there is no 'Received' + // header, which would be the case, for example in a 'Sent' folder.
Also + // I'm not sure that file -i exists on all systems + //if (suff.empty()) return mimetypefromdata(fn); return ""; } diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index a6c0cc82..bdfa588e 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -63,11 +63,11 @@ MimeHandlerHtml::worker1(RclConfig *conf, const string &, string charset; if (!charsethint.empty()) { charset = charsethint; - if (conf->getGuessCharset()) { - charset = csguess(htext, conf->getDefCharset()); - } else - charset = conf->getDefCharset(); - } + } else if (conf->getGuessCharset()) { + charset = csguess(htext, conf->getDefCharset()); + } else + charset = conf->getDefCharset(); + // - We first try to convert from the default configured charset // (which may depend of the current directory) to utf-8. If this @@ -75,7 +75,7 @@ MimeHandlerHtml::worker1(RclConfig *conf, const string &, // - During parsing, if we find a charset parameter, and it differs from // what we started with, we abort and restart with the parameter value // instead of the configuration one. 
- LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str())); + LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str())); MyHtmlParser pres; for (int pass = 0; pass < 2; pass++) { diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index d2b797fd..002e1103 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.2 2005-03-31 10:04:07 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.3 2005-04-04 13:18:46 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include @@ -46,7 +46,7 @@ MimeHandler::Status MimeHandlerMail::worker(RclConfig *cnf, const string &fn, const string &mtype, Rcl::Doc &docout, string& ipath) { - LOGDEB(("MimeHandlerMail::worker: %s [%s]\n", mtype.c_str(), fn.c_str())); + LOGDEB2(("MimeHandlerMail::worker: %s [%s]\n", mtype.c_str(), fn.c_str())); conf = cnf; if (!stringlowercmp("message/rfc822", mtype)) { @@ -75,7 +75,7 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath) if (ipath != "") { sscanf(ipath.c_str(), "%d", &mtarg); } - LOGDEB(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(), + LOGDEB2(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(), mtarg)); FILE *fp; @@ -125,7 +125,6 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath) } } msgnum++; - LOGDEB(("MimeHandlerMail::processmbox: got msg %d\n", msgnum)); fseek(fp, end, SEEK_SET); } while (mtarg > 0 && msgnum < mtarg); @@ -173,25 +172,37 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc, } // Handle some headers. We should process rfc2047 encoding here + // Also there should be no 8bit chars, but there sometimes are. 
So + // we transcode as if from iso-8859-1, which is better than + // getting utf8 conversion errors later on Binc::HeaderItem hi; + string transcoded; if (doc.h.getFirstHeader("Subject", hi)) { - docout.title = hi.getValue(); + transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8"); + docout.title = transcoded; } if (doc.h.getFirstHeader("From", hi)) { - docout.text += string("From: ") + hi.getValue() + string("\n"); + transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8"); + docout.text += string("From: ") + transcoded + string("\n"); } if (doc.h.getFirstHeader("To", hi)) { - docout.text += string("To: ") + hi.getValue() + string("\n"); + transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8"); + docout.text += string("To: ") + transcoded + string("\n"); } if (doc.h.getFirstHeader("Date", hi)) { - docout.text += string("Date: ") + hi.getValue() + string("\n"); + transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8"); + docout.text += string("Date: ") + transcoded + string("\n"); + } + if (doc.h.getFirstHeader("Subject", hi)) { + transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8"); + docout.text += string("Subject: ") + transcoded + string("\n"); } - LOGDEB(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n", + LOGDEB2(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n", doc.isMultipart(), doc.getSubType().c_str())); walkmime(conf, docout.text, doc, 0); - LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str())); + //LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str())); return MimeHandler::MHDone; } @@ -206,13 +217,14 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc, } if (doc.isMultipart()) { - LOGDEB(("walkmime: ismultipart %d subtype '%s'\n", + LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n", doc.isMultipart(), doc.getSubType().c_str())); - // We only handle alternative and mixed for now. 
For + // We only handle alternative, related and mixed for now. For // alternative, we look for a text/plain part, else html and - // process it For mixed, we process each part. + // process it. For mixed and related, we process each part. std::vector::iterator it; - if (!stringicmp("mixed", doc.getSubType())) { + if (!stringicmp("mixed", doc.getSubType()) || + !stringicmp("related", doc.getSubType())) { for (it = doc.members.begin(); it != doc.members.end();it++) { walkmime(cnf, out, *it, depth+1); } @@ -247,19 +259,33 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc, if (doc.h.getFirstHeader("Content-Type", hi)) { ctt = hi.getValue(); } - LOGDEB(("walkmime:content-type: %s\n", ctt.c_str())); + LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str())); MimeHeaderValue content_type; parseMimeHeaderValue(ctt, content_type); if (stringlowercmp("text/plain", content_type.value) && stringlowercmp("text/html", content_type.value)) { return; } - string charset = "us-ascii"; + + // Normally the default charset is us-ascii. But it happens that + // 8 bit chars exist in a message that is stated as us-ascii. Ie the + // mailer used by yahoo support ('KANA') does this.
We could convert + // to iso-8859 only if the transfer-encoding is 8 bit, or test for + // actual 8 bit chars, but what the heck, let's use 8859-1 as default + string charset = "iso-8859-1"; map::const_iterator it; it = content_type.params.find(string("charset")); if (it != content_type.params.end()) charset = it->second; - + if (charset.empty() || + !stringlowercmp("us-ascii", charset) || + !stringlowercmp("default", charset) || + !stringlowercmp("x-user-defined", charset) || + !stringlowercmp("x-unknown", charset) || + !stringlowercmp("unknown", charset) ) { + charset = "iso-8859-1"; + } + // Content disposition string ctd = "inline"; if (doc.h.getFirstHeader("Content-Disposition", hi)) { @@ -277,8 +303,8 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc, cte = hi.getValue(); } - LOGDEB(("walkmime: final: body start offset %d, length %d\n", - doc.getBodyStartOffset(), doc.getBodyLength())); + LOGDEB2(("walkmime: final: body start offset %d, length %d\n", + doc.getBodyStartOffset(), doc.getBodyLength())); string body; doc.getBody(body, 0, doc.bodylength); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 318a2352..ff046764 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.25 2005-03-31 10:04:07 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.26 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -216,7 +216,8 @@ bool Rcl::dumb_string(const string &in, string &out) return true; if (!unac_cpp(in, inter)) { LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str())); - return false; + // Ok, no need to stop the whole show + inter = ""; } out.reserve(inter.length()); for (unsigned int i = 0; i < inter.length(); i++) { @@ -268,7 +269,7 @@ truncate_to_word(string & input, string::size_type maxlen) bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) { - LOGDEB(("Rcl::Db::add: fn
%s\n", fn.c_str())); + LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str())); if (pdata == 0) return false; Native *ndb = (Native *)pdata; @@ -288,7 +289,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) TextSplit splitter(&splitData); string noacc; - if (!unac_cpp(doc.title, noacc)) { + if (!dumb_string(doc.title, noacc)) { LOGERR(("Rcl::Db::add: unac failed\n")); return false; } diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index 3c608d15..86713020 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -1,4 +1,4 @@ -# @(#$Id: mimeconf,v 1.5 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes +# @(#$Id: mimeconf,v 1.6 2005-04-04 13:18:47 dockes Exp $ (C) 2004 J.F.Dockes # Recoll : associations of mime types to processing filters. # There are different sections for decompression, 'interning' for indexing @@ -49,7 +49,8 @@ application/vnd.sun.xml.writer.template = exec rclsoff # External viewers, launched when you double-click a result entry [view] text/plain = xemacs %f -text/html = firefox -remote "openFile(%u)" +#text/html = firefox -remote "openFile(%u)" +text/html = firefox %u application/pdf = xpdf %f application/postscript = gv %f application/msword = openoffice-1.1.3-swriter %f diff --git a/src/utils/fstreewalk.cpp b/src/utils/fstreewalk.cpp index 33cbdb71..07f7671b 100644 --- a/src/utils/fstreewalk.cpp +++ b/src/utils/fstreewalk.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.3 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.4 2005-04-04 13:18:47 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_FSTREEWALK @@ -7,8 +7,10 @@ static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.3 2005-02-10 15:21:12 dockes E #include #include #include +#include #include +#include #include "debuglog.h" #include "pathut.h" @@ -19,6 +21,7 @@ using namespace std; class FsTreeWalker::Internal { Options options; stringstream reason; + list 
skippedNames; int errors; void logsyserr(const char *call, const string ¶m) { @@ -53,6 +56,18 @@ int FsTreeWalker::getErrCnt() return data->errors; } +bool FsTreeWalker::addSkippedName(const string& pattern) +{ + data->skippedNames.push_back(pattern); + return true; +} + +void FsTreeWalker::clearSkippedNames() +{ + data->skippedNames.clear(); +} + + FsTreeWalker::Status FsTreeWalker::walk(const string &top, FsTreeWalkerCB& cb) { @@ -94,37 +109,54 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top, struct dirent *ent; while ((ent = readdir(d)) != 0) { - // We do process hidden files for now + // We do process hidden files for now, only skip . and .. if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) continue; - string fn = top; - path_cat(fn, ent->d_name); + if (!data->skippedNames.empty()) { + list::const_iterator it; + for (it = data->skippedNames.begin(); + it != data->skippedNames.end(); it++) { + if (fnmatch(it->c_str(), ent->d_name, 0) == 0) { + //fprintf(stderr, + //"Skipping [%s] because of pattern match\n", ent->d_name); + goto skip; + } + } + } - struct stat st; - int statret = (data->options & FtwFollow) ? stat(fn.c_str(), &st) : - lstat(fn.c_str(), &st); - if (statret == -1) { - data->logsyserr("stat", fn); - continue; - } - if (S_ISDIR(st.st_mode)) { - if (data->options & FtwNoRecurse) { - status = cb.processone(fn, &st, FtwDirEnter); - } else { - status=walk(fn, cb); + { + string fn = top; + path_cat(fn, ent->d_name); + + struct stat st; + int statret = (data->options & FtwFollow) ? 
stat(fn.c_str(), &st) : + lstat(fn.c_str(), &st); + if (statret == -1) { + data->logsyserr("stat", fn); + continue; } - if (status & (FtwStop|FtwError)) - goto out; - if ((status = cb.processone(top, &st, FtwDirReturn)) - & (FtwStop|FtwError)) - goto out; - } else if (S_ISREG(st.st_mode)) { - if ((status = cb.processone(fn, &st, FtwRegular)) & - (FtwStop|FtwError)) { - goto out; + if (S_ISDIR(st.st_mode)) { + if (data->options & FtwNoRecurse) { + status = cb.processone(fn, &st, FtwDirEnter); + } else { + status=walk(fn, cb); + } + if (status & (FtwStop|FtwError)) + goto out; + if ((status = cb.processone(top, &st, FtwDirReturn)) + & (FtwStop|FtwError)) + goto out; + } else if (S_ISREG(st.st_mode)) { + if ((status = cb.processone(fn, &st, FtwRegular)) & + (FtwStop|FtwError)) { + goto out; + } } } + + skip: ; + // We skip other file types (devices etc...) } diff --git a/src/utils/fstreewalk.h b/src/utils/fstreewalk.h index 855c1269..6244e768 100644 --- a/src/utils/fstreewalk.h +++ b/src/utils/fstreewalk.h @@ -1,6 +1,6 @@ #ifndef _FSTREEWALK_H_INCLUDED_ #define _FSTREEWALK_H_INCLUDED_ -/* @(#$Id: fstreewalk.h,v 1.2 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: fstreewalk.h,v 1.3 2005-04-04 13:18:47 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -22,6 +22,14 @@ class FsTreeWalker { Status walk(const std::string &dir, FsTreeWalkerCB& cb); std::string getReason(); int getErrCnt(); + bool addSkippedName(const std::string &pattern); // Add a pattern + // for directory + // entries (file + // or dir) to be + // ignored (ie: + // #* , *~) + void clearSkippedNames(); // Clear all patterns + private: class Internal; Internal *data;