*** empty log message ***

2005-04-04 13:18:47 +00:00 · 2005-04-04 13:18:47 +00:00 · 50b927f65c
commit 50b927f65c
parent 04b279dcd5
8 changed files with 149 additions and 61 deletions
--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: indexer.cpp,v 1.9 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: indexer.cpp,v 1.10 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -89,6 +89,22 @@ bool DbIndexer::index()
 	 it != topdirs->end(); it++) {
 	LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(), 
 		dbdir.c_str()));
+	config->setKeyDir(*it);
+
+	// Set up skipped patterns for this subtree
+	{
+	    walker.clearSkippedNames();
+	    string skipped; 
+	    if (config->getConfParam("skippedNames", skipped)) {
+		list<string> skpl;
+		ConfTree::stringToStrings(skipped, skpl);
+		list<string>::const_iterator it;
+		for (it = skpl.begin(); it != skpl.end(); it++) {
+		    walker.addSkippedName(*it);
+		}
+	    }
+	}
+
 	if (walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
 	    LOGERR(("DbIndexer::index: error while indexing %s\n", 
 		    it->c_str()));
--- a/src/index/mimetype.cpp
+++ b/src/index/mimetype.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.6 2005-03-25 09:40:27 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mimetype.cpp,v 1.7 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif

 #include <ctype.h>
@ -82,8 +82,12 @@ string mimetype(const string &fn, ConfTree *mtypes)
 	    return mtype;
    }

-    // Look at file data ? Only when no suffix
-    if (suff.empty())
+    // Look at file data ? Only when no suffix or always
+    // Also 'file' is not that great for us. For example it will 
+    // mistake mail folders for simple text files if there is no 'Received' 
+    // header, which would be the case, for example in a 'Sent' folder. Also
+    // I'm not sure that file -i exists on all systems
+    //if (suff.empty())
 	return mimetypefromdata(fn);
    return "";
 }
--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@ -63,11 +63,11 @@ MimeHandlerHtml::worker1(RclConfig *conf, const string &,
    string charset;
    if (!charsethint.empty()) {
 	charset = charsethint;
-	if (conf->getGuessCharset()) {
+    } else if (conf->getGuessCharset()) {
 	charset = csguess(htext, conf->getDefCharset());
    } else
 	charset = conf->getDefCharset();
-    }
+

    // - We first try to convert from the default configured charset
    //   (which may depend of the current directory) to utf-8. If this
@ -75,7 +75,7 @@ MimeHandlerHtml::worker1(RclConfig *conf, const string &,
    // - During parsing, if we find a charset parameter, and it differs from
    //   what we started with, we abort and restart with the parameter value
    //   instead of the configuration one.
-    LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
+    LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));

    MyHtmlParser pres;
    for (int pass = 0; pass < 2; pass++) {
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.2 2005-03-31 10:04:07 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.3 2005-04-04 13:18:46 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif

 #include <stdio.h>
@ -46,7 +46,7 @@ MimeHandler::Status
 MimeHandlerMail::worker(RclConfig *cnf, const string &fn, 
 			const string &mtype, Rcl::Doc &docout, string& ipath)
 {
-    LOGDEB(("MimeHandlerMail::worker: %s [%s]\n", mtype.c_str(), fn.c_str()));
+    LOGDEB2(("MimeHandlerMail::worker: %s [%s]\n", mtype.c_str(), fn.c_str()));
    conf = cnf;

    if (!stringlowercmp("message/rfc822", mtype)) {
@ -75,7 +75,7 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
    if (ipath != "") {
 	sscanf(ipath.c_str(), "%d", &mtarg);
    }
-    LOGDEB(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(),
+    LOGDEB2(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(),
 	    mtarg));

    FILE *fp;
@ -125,7 +125,6 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
 	    }
 	}
 	msgnum++;
-	LOGDEB(("MimeHandlerMail::processmbox: got msg %d\n", msgnum));
 	fseek(fp, end, SEEK_SET);
    } while (mtarg > 0 && msgnum < mtarg);

@ -173,25 +172,37 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc,
    }

    // Handle some headers. We should process rfc2047 encoding here
+    // Also there should be no 8bit chars, but there sometimes are. So
+    // we transcode as if from iso-8859-1, which is better than
+    // getting utf8 conversion errors later on
    Binc::HeaderItem hi;
+    string transcoded;
    if (doc.h.getFirstHeader("Subject", hi)) {
-	docout.title = hi.getValue();
+	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
+	docout.title = transcoded;
    }
    if (doc.h.getFirstHeader("From", hi)) {
-	docout.text += string("From: ") + hi.getValue() + string("\n");
+	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
+	docout.text += string("From: ") + transcoded + string("\n");
    }
    if (doc.h.getFirstHeader("To", hi)) {
-	docout.text += string("To: ") + hi.getValue() + string("\n");
+	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
+	docout.text += string("To: ") + transcoded + string("\n");
    }
    if (doc.h.getFirstHeader("Date", hi)) {
-	docout.text += string("Date: ") + hi.getValue() + string("\n");
+	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
+	docout.text += string("Date: ") + transcoded + string("\n");
+    }
+    if (doc.h.getFirstHeader("Subject", hi)) {
+	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
+	docout.text += string("Subject: ") + transcoded + string("\n");
    }

-    LOGDEB(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n", 
+    LOGDEB2(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n",
 	    doc.isMultipart(), doc.getSubType().c_str()));
    walkmime(conf, docout.text, doc, 0);

-    LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str()));
+    //LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str()));
    return MimeHandler::MHDone;
 }

@ -206,13 +217,14 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc,
    }

    if (doc.isMultipart()) {
-	LOGDEB(("walkmime: ismultipart %d subtype '%s'\n", 
+	LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n", 
 		doc.isMultipart(), doc.getSubType().c_str()));
-	// We only handle alternative and mixed for now. For
+	// We only handle alternative, related and mixed for now. For
 	// alternative, we look for a text/plain part, else html and
-	// process it For mixed, we process each part.
+	// process it. For mixed and related, we process each part.
 	std::vector<Binc::MimePart>::iterator it;
-	if (!stringicmp("mixed", doc.getSubType())) {
+	if (!stringicmp("mixed", doc.getSubType()) || 
+	    !stringicmp("related", doc.getSubType())) {
 	    for (it = doc.members.begin(); it != doc.members.end();it++) {
 		walkmime(cnf, out, *it, depth+1);
 	    }
@ -247,18 +259,32 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc,
 	if (doc.h.getFirstHeader("Content-Type", hi)) {
 	    ctt = hi.getValue();
 	}
-	LOGDEB(("walkmime:content-type: %s\n", ctt.c_str()));
+	LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
 	MimeHeaderValue content_type;
 	parseMimeHeaderValue(ctt, content_type);
 	if (stringlowercmp("text/plain", content_type.value) && 
 	    stringlowercmp("text/html", content_type.value)) {
 	    return;
 	}
-	string charset = "us-ascii";
+
+	// Normally the default charset is us-ascii. But it happens that
+	// 8 bit chars exist in a message that is stated as us-ascii. E.g. the 
+	// mailer used by yahoo support ('KANA') does this. We could convert 
+	// to iso-8859 only if the transfer-encoding is 8 bit, or test for
+	// actual 8 bit chars, but what the heck, let's use 8859-1 as default
+	string charset = "iso-8859-1";
 	map<string,string>::const_iterator it;
 	it = content_type.params.find(string("charset"));
 	if (it != content_type.params.end())
 	    charset = it->second;
+	if (charset.empty() || 
+	    !stringlowercmp("us-ascii", charset) || 
+	    !stringlowercmp("default", charset) || 
+	    !stringlowercmp("x-user-defined", charset) || 
+	    !stringlowercmp("x-unknown", charset) || 
+	    !stringlowercmp("unknown", charset) ) {
+	    charset = "iso-8859-1";
+	}
 	    
 	// Content disposition
 	string ctd = "inline";
@ -277,7 +303,7 @@ static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc,
 	    cte = hi.getValue();
 	} 

-	LOGDEB(("walkmime: final: body start offset %d, length %d\n", 
+	LOGDEB2(("walkmime: final: body start offset %d, length %d\n", 
 		 doc.getBodyStartOffset(), doc.getBodyLength()));
 	string body;
 	doc.getBody(body, 0, doc.bodylength);
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.25 2005-03-31 10:04:07 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.26 2005-04-04 13:18:46 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -216,7 +216,8 @@ bool Rcl::dumb_string(const string &in, string &out)
 	return true;
    if (!unac_cpp(in, inter)) {
 	LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
-	return false;
+	// Ok, no need to stop the whole show
+	inter = "";
    }
    out.reserve(inter.length());
    for (unsigned int i = 0; i < inter.length(); i++) {
@ -268,7 +269,7 @@ truncate_to_word(string & input, string::size_type maxlen)

 bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
 {
-    LOGDEB(("Rcl::Db::add: fn %s\n", fn.c_str()));
+    LOGDEB1(("Rcl::Db::add: fn %s\n", fn.c_str()));
    if (pdata == 0)
 	return false;
    Native *ndb = (Native *)pdata;
@ -288,7 +289,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
    TextSplit splitter(&splitData);

    string noacc;
-    if (!unac_cpp(doc.title, noacc)) {
+    if (!dumb_string(doc.title, noacc)) {
 	LOGERR(("Rcl::Db::add: unac failed\n"));
 	return false;
    }
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -1,4 +1,4 @@
-# @(#$Id: mimeconf,v 1.5 2005-03-25 09:40:28 dockes Exp $  (C) 2004 J.F.Dockes
+# @(#$Id: mimeconf,v 1.6 2005-04-04 13:18:47 dockes Exp $  (C) 2004 J.F.Dockes

 # Recoll : associations of mime types to processing filters.
 # There are different sections for decompression, 'interning' for indexing
@ -49,7 +49,8 @@ application/vnd.sun.xml.writer.template = exec rclsoff
 # External viewers, launched when you double-click a result entry
 [view]
 text/plain = xemacs %f
-text/html = firefox -remote "openFile(%u)"
+#text/html = firefox -remote "openFile(%u)"
+text/html = firefox %u
 application/pdf  = xpdf %f
 application/postscript = gv %f
 application/msword = openoffice-1.1.3-swriter %f
--- a/src/utils/fstreewalk.cpp
+++ b/src/utils/fstreewalk.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.3 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.4 2005-04-04 13:18:47 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif

 #ifndef TEST_FSTREEWALK
@ -7,8 +7,10 @@ static char rcsid[] = "@(#$Id: fstreewalk.cpp,v 1.3 2005-02-10 15:21:12 dockes E
 #include <dirent.h>
 #include <sys/stat.h>
 #include <errno.h>
+#include <fnmatch.h>

 #include <sstream>
+#include <list>

 #include "debuglog.h"
 #include "pathut.h"
@ -19,6 +21,7 @@ using namespace std;
 class FsTreeWalker::Internal {
    Options options;
    stringstream reason;
+    list<string> skippedNames;
    int errors;
    void logsyserr(const char *call, const string &param) 
    {
@ -53,6 +56,18 @@ int FsTreeWalker::getErrCnt()
    return data->errors;
 }

+bool FsTreeWalker::addSkippedName(const string& pattern)
+{
+    data->skippedNames.push_back(pattern);
+    return true;
+}
+
+void FsTreeWalker::clearSkippedNames()
+{
+    data->skippedNames.clear();
+}
+
+
 FsTreeWalker::Status FsTreeWalker::walk(const string &top, 
 					FsTreeWalkerCB& cb)
 {
@ -94,10 +109,23 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top,

    struct dirent *ent;
    while ((ent = readdir(d)) != 0) {
-	// We do process hidden files for now
+	// We do process hidden files for now, only skip . and ..
 	if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) 
 	    continue;

+	if (!data->skippedNames.empty()) {
+	    list<string>::const_iterator it;
+	    for (it = data->skippedNames.begin(); 
+		 it != data->skippedNames.end(); it++) {
+		if (fnmatch(it->c_str(), ent->d_name, 0) == 0) {
+		    //fprintf(stderr, 
+		    //"Skipping [%s] because of pattern match\n", ent->d_name);
+		    goto skip;
+		}
+	    }
+	}
+
+	{
 	    string fn = top;
 	    path_cat(fn, ent->d_name);

@ -125,6 +153,10 @@ FsTreeWalker::Status FsTreeWalker::walk(const string &top,
 		    goto out;
 		}
 	    }
+	}
+
+    skip: ;
+
 	// We skip other file types (devices etc...)
    }

--- a/src/utils/fstreewalk.h
+++ b/src/utils/fstreewalk.h
@ -1,6 +1,6 @@
 #ifndef _FSTREEWALK_H_INCLUDED_
 #define _FSTREEWALK_H_INCLUDED_
-/* @(#$Id: fstreewalk.h,v 1.2 2005-02-10 15:21:12 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: fstreewalk.h,v 1.3 2005-04-04 13:18:47 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>

@ -22,6 +22,14 @@ class FsTreeWalker {
    Status walk(const std::string &dir, FsTreeWalkerCB& cb);
    std::string getReason();
    int getErrCnt();
+    bool addSkippedName(const std::string &pattern); // Add a pattern
+						     // for directory
+						     // entries (file
+						     // or dir) to be
+						     // ignored (e.g.
+						     // #* , *~)
+    void clearSkippedNames(); // Clear all patterns
+
 private:
    class Internal;
    Internal *data;