Dijon filters 1st step: mostly working needs check and optim

2006-12-15 12:40:24 +00:00 · 2006-12-15 12:40:24 +00:00 · 33c95ef1ba
commit 33c95ef1ba
parent 1973c06346
22 changed files with 979 additions and 480 deletions
--- a/src/internfile/Filter.h
+++ b/src/internfile/Filter.h
@ -0,0 +1,164 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _DIJON_FILTER_H
+#define _DIJON_FILTER_H
+
+#include <string>
+#include <set>
+#include <map>
+
+namespace Dijon
+{
+    class Filter;
+
+    /** Provides the list of MIME types supported by the filter(s).
+     * Presumably fills the set passed by reference - TODO confirm.
+     * This function is exported by dynamically loaded filter libraries.
+     */
+    typedef bool (get_filter_types_func)(std::set<std::string> &);
+    /** Returns what data should be passed to the filter(s).
+     * Output is cast from Filter::DataInput to int for convenience.
+     * This function is exported by dynamically loaded filter libraries.
+     * The aim is to let the client application know beforehand whether
+     * it should load documents or not.
+     */
+    typedef int (get_filter_data_input_func)(void);
+    /** Returns a Filter that handles the given MIME type.
+     * The Filter object is allocated with new.
+     * This function is exported by dynamically loaded filter libraries
+     * and serves as a factory for Filter objects, so that the client
+     * application doesn't have to know which Filter sub-types handle
+     * which MIME types.
+     */
+    typedef Filter *(get_filter_func)(const std::string &);
+
+    /// Filter interface.
+    class Filter
+    {
+    public:
+	/// Builds an empty filter. The MIME type is not used by the base class.
+	Filter(const std::string &mime_type) {}
+	/// Destroys the filter.
+	virtual ~Filter() {}
+
+
+	// Enumerations.
+
+	/** What data a filter supports as input.
+	 * It can be either the whole document data, its file name, or its URI.
+	 */
+	typedef enum { DOCUMENT_DATA=0, DOCUMENT_FILE_NAME, DOCUMENT_URI } DataInput;
+
+	/** Input properties supported by the filter.
+	 * - DEFAULT_CHARSET is the charset supplied by the client application
+	 * (presumably assumed when the document does not state its own).
+	 * - OPERATING_MODE can be set to either "view" or "index".
+	 */
+	typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE } Properties;
+
+
+	// Information.
+
+	/// Returns what data the filter requires as input.
+	virtual DataInput get_required_data_input(void) const = 0;
+
+
+	// Initialization.
+
+	/** Sets a property, prior to calling set_document_XXX().
+	 * Returns false if the property is not supported.
+	 */
+	virtual bool set_property(Properties prop_name, const std::string &prop_value) = 0;
+
+	/** (Re)initializes the filter with the given data, passed either as a
+	 * pointer/length pair or as a string. For the pointer form, the caller
+	 * should ensure the pointer is valid until the Filter object is
+	 * destroyed, as some filters may not need to do a deep copy of the data.
+	 * Returns false if this input is not supported or an error occurred.
+	 */
+	virtual bool set_document_data(const char *data_ptr, unsigned int data_length) = 0;
+	virtual bool set_document_string(const std::string &data) = 0;
+
+	/** (Re)initializes the filter with the given file.
+	 * Returns false if this input is not supported or an error occurred.
+	 */
+	virtual bool set_document_file(const std::string &file_path) = 0;
+
+	/** (Re)initializes the filter with the given URI.
+	 * Returns false if this input is not supported or an error occurred.
+	 */
+	virtual bool set_document_uri(const std::string &uri) = 0;
+
+
+	// Going from one nested document to the next.
+
+	/** Returns true if there are nested documents left to extract.
+	 * Returns false if the end of the parent document was reached
+	 * or an error occurred.
+	 */
+	virtual bool has_documents(void) const = 0;
+
+	/** Moves to the next nested document.
+	 * Returns false if there are none left.
+	 */
+	virtual bool next_document(void) = 0;
+
+	/** Skips to the nested document with the given ipath.
+	 * Returns false if no such document exists.
+	 */
+	virtual bool skip_to_document(const std::string &ipath) = 0;
+
+
+	// Accessing documents' contents.
+
+	/// Returns the message for the most recent error that has occurred.
+	virtual std::string get_error(void) const = 0;
+
+	/** Returns a dictionary of metadata extracted from the current document.
+	 * Metadata fields may include one or more of the following :
+	 * content, title, ipath, mimetype, language, charset, author, creator,
+	 * publisher, modificationdate, creationdate, size
+	 * Special considerations apply :
+	 * - content may contain binary data, watch out !
+	 * - ipath is an internal path to the nested document that can be
+	 * later passed to skip_to_document(). It may be empty if the parent
+	 * document's type doesn't allow embedding, in which case the filter
+	 * should only return one document.
+	 * - mimetype should be text/plain if the document could be handled
+	 * internally, empty if unknown. If any other value, it is expected
+	 * that the client application can pass the nested document's content
+	 * to another filter that supports this particular type.
+	 */
+	const std::map<std::string, std::string> &get_meta_data(void) const
+	{
+	    return m_metaData;
+	}
+
+    protected:
+	/// Metadata dictionary, filled by the concrete filters.
+	std::map<std::string, std::string> m_metaData;
+
+    private:
+	/// Filter objects cannot be copied: private so outside copies fail.
+	Filter(const Filter &other);
+	/// Filter objects cannot be copied: private so outside copies fail.
+	Filter& operator=(const Filter& other);
+
+    };
+}
+
+#endif // _DIJON_FILTER_H
--- a/src/internfile/Makefile
+++ b/src/internfile/Makefile
@ -1,9 +1,9 @@
-# @(#$Id: Makefile,v 1.1 2006-11-15 07:27:42 dockes Exp $  (C) 2005 J.F.Dockes
+# @(#$Id: Makefile,v 1.2 2006-12-15 12:40:02 dockes Exp $  (C) 2005 J.F.Dockes
 depth = ..
 include $(depth)/mk/sysconf

 # Only test executables get build in here
-PROGS = internfile unacpp textsplit rclconfig
+PROGS = internfile

 all: $(BIGLIB) $(PROGS) 

--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: internfile.cpp,v 1.18 2006-12-13 09:13:18 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: internfile.cpp,v 1.19 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -32,12 +32,14 @@ using namespace std;
 #endif /* NO_NAMESPACES */

 #include "internfile.h"
+#include "rcldoc.h"
 #include "mimetype.h"
 #include "debuglog.h"
 #include "mimehandler.h"
 #include "execmd.h"
 #include "pathut.h"
 #include "wipedir.h"
+#include "rclconfig.h"

 // Execute the command to uncompress a file into a temporary one.
 static bool uncompressfile(RclConfig *conf, const string& ifn, 
@ -106,98 +108,262 @@ void FileInterner::tmpcleanup()
 // internfile
 FileInterner::FileInterner(const std::string &f, RclConfig *cnf, 
 			   const string& td, const string *imime)
-    : m_fn(f), m_cfg(cnf), m_tdir(td), m_handler(0)
+    : m_cfg(cnf), m_fn(f), m_forPreview(imime?true:false), m_tdir(td)
 {
-    // We are actually going to access the file, so it's ok
-    // performancewise to check this config variable at every call
-    // even if it can only change when we change directories
-    string usfc;
-    int usfci;
-    if (!cnf->getConfParam("usesystemfilecommand", usfc)) 
-	usfci = 0;
-    else 
-	usfci = atoi(usfc.c_str()) ? 1 : 0;
+    // Identify the file's mime type (uncompressing it to a temporary
+    // file first if needed), then create and initialize the top-level
+    // filter. On any failure the handler stack is left empty, which
+    // internfile() reports as an error. A non-null imime doubles up as
+    // the "for preview" indicator.
+    bool usfci = false;
+    cnf->getConfParam("usesystemfilecommand", &usfci);
    LOGDEB1(("FileInterner::FileInterner: usfci now %d\n", usfci));

-    bool forPreview = imime ? true : false;
-
    // We need to run mime type identification in any case to check
    // for a compressed file.
-    m_mime = mimetype(m_fn, m_cfg, usfci);
+    string l_mime = mimetype(m_fn, m_cfg, usfci);

    // If identification fails, try to use the input parameter. This
    // is then normally not a compressed type (it's the mime type from
    // the db), and is only set when previewing, not for indexing
-    if (m_mime.empty() && imime)
-	m_mime = *imime;
+    if (l_mime.empty() && imime)
+	l_mime = *imime;

-    if (!m_mime.empty()) {
+    if (!l_mime.empty()) {
 	// Has mime: check for a compressed file. If so, create a
 	// temporary uncompressed file, and rerun the mime type
 	// identification, then do the rest with the temp file.
 	list<string>ucmd;
-	if (m_cfg->getUncompressor(m_mime, ucmd)) {
+	if (m_cfg->getUncompressor(l_mime, ucmd)) {
 	    if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) {
 		return;
 	    }
 	    LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n", 
 		    m_tdir.c_str(), m_tfile.c_str()));
 	    m_fn = m_tfile;
-	    m_mime = mimetype(m_fn, m_cfg, usfci);
-	    if (m_mime.empty() && imime)
-		m_mime = *imime;
+	    l_mime = mimetype(m_fn, m_cfg, usfci);
+	    if (l_mime.empty() && imime)
+		l_mime = *imime;
 	}
    }

-    if (m_mime.empty()) {
+    if (l_mime.empty()) {
 	// No mime type. We let it through as config may warrant that
 	// we index all file names
 	LOGDEB(("internfile: (no mime) [%s]\n", m_fn.c_str()));
    }

    // Look for appropriate handler (might still return empty)
-    m_handler = getMimeHandler(m_mime, m_cfg);
+    Dijon::Filter *df = getMimeHandler(l_mime, m_cfg);

-    if (!m_handler) {
+    if (!df) {
 	// No handler for this type, for now :( if indexallfilenames
 	// is set in the config, this normally wont happen (we get mh_unknown)
-	LOGDEB(("FileInterner::FileInterner: %s: no handler\n", 
-		m_mime.c_str()));
+	LOGDEB(("FileInterner:: no handler for %s\n", l_mime.c_str()));
 	return;
    }
-    m_handler->setForPreview(forPreview);
-    LOGDEB(("FileInterner::FileInterner: %s [%s]\n", m_mime.c_str(), 
+    df->set_property(Dijon::Filter::OPERATING_MODE, 
+			    m_forPreview ? "view" : "index");
+
+    string charset = m_cfg->getDefCharset();
+    df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
+    if (!df->set_document_file(m_fn)) {
+	LOGERR(("FileInterner:: error parsing %s\n", m_fn.c_str()));
+	// The filter is not on the stack yet, so the destructor would
+	// never see it: release it here to avoid a leak.
+	delete df;
+	return;
+    }
+    // 20 is the value of MAXHANDLERS, the stack limit used by internfile()
+    m_handlers.reserve(20);
+    m_handlers.push_back(df);
+    LOGDEB(("FileInterner::FileInterner: %s [%s]\n", l_mime.c_str(), 
 	    m_fn.c_str()));
 }

+// Upper bound on the depth of the filter stack, and minimum number of
+// per-level ipath slots kept by internfile().
+static const unsigned int MAXHANDLERS = 20;
+
+// Extract the next text/plain document from the file, stacking filters as
+// needed when a filter yields a document of another mime type.
+// - ipath: if not empty, it is split on '|' into per-level internal paths
+//   and each stacked filter is asked to skip to the element for its level.
+//   It is not modified here.
+// - doc: receives the fields of the extracted document. When not
+//   previewing, the '|'-joined "ipath" values of the stacked filters are
+//   appended to doc.ipath, with trailing separators trimmed.
+// Returns FIError on failure, FIAgain if a filter with documents is left
+// on the stack after extraction, else FIDone.
 FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
 {
-    if (!m_handler) {
-	LOGERR(("FileInterner::internfile: no handler !!\n"));
+    // NOTE(review): after an FIAgain return the stack could hold more than
+    // one filter if a nested filter still has documents; this check would
+    // then reject the next call. Confirm the intended behaviour.
+    if (m_handlers.size() != 1) {
+	LOGERR(("FileInterner::internfile: bad stack size %d !!\n", 
+		(int)m_handlers.size()));
 	return FIError;
    }

-    // Turn file into a document. The document has fields for title, body 
-    // etc.,  all text converted to utf8
-    MimeHandler::Status mhs = 
-	m_handler->mkDoc(m_cfg, m_fn, m_mime, doc, ipath);
-    FileInterner::Status ret = FIError;
-    switch (mhs) {
-    case MimeHandler::MHError: 
-	LOGERR(("FileInterner::internfile: error parsing %s\n", m_fn.c_str()));
-	break;
-    case MimeHandler::MHDone: ret = FIDone;break;
-    case MimeHandler::MHAgain: ret = FIAgain;break;
+    // Note that the vector is big enough for the maximum stack. All values
+    // over the last significant one are ""
+    vector<string> vipath(MAXHANDLERS);
+    if (!ipath.empty()) {
+	list<string> lipath;
+	stringToTokens(ipath, lipath, "|", true);
+	vipath.insert(vipath.begin(), lipath.begin(), lipath.end());
+	if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){
+	    LOGERR(("FileInterner::internfile: can't skip\n"));
+	    return FIError;
+	}
    }

-    doc.mimetype = m_mime;
-    return ret;
+
+    /* Try to get doc from the topmost filter */
+    while (!m_handlers.empty()) {
+	if (!m_handlers.back()->has_documents()) {
+	    // No docs at the current top level. Pop and see if there
+	    // is something at the previous one
+	    delete m_handlers.back();
+	    m_handlers.pop_back();
+	    continue;
+	}
+
+	if (!m_handlers.back()->next_document()) {
+	    LOGERR(("FileInterner::internfile: next_document failed\n"));
+	    return FIError;
+	}
+
+	// Look at what we've got
+	const std::map<std::string, std::string> *docdata = 
+	    &m_handlers.back()->get_meta_data();
+	map<string,string>::const_iterator it;
+	string charset;
+	it = docdata->find("charset");
+	if (it != docdata->end())
+	    charset = it->second;
+	string mimetype;
+	it = docdata->find("mimetype");
+	if (it != docdata->end())
+	    mimetype = it->second;
+
+	LOGDEB(("FileInterner::internfile:next_doc is %s\n",mimetype.c_str()));
+	// If we find a text/plain doc, we're done
+	if (mimetype == "text/plain")
+	    break;
+
+	// Got a non text/plain doc. We need to stack another
+	// filter. Check current size. The test must be >=: pushing when
+	// the stack already holds MAXHANDLERS filters would index vipath
+	// one past its guaranteed size below.
+	if (m_handlers.size() >= MAXHANDLERS) {
+	    // Stack too big. Skip this and go on to check if there is
+	    // something else in the current back()
+	    LOGDEB(("FileInterner::internfile: stack too high\n"));
+	    continue;
+	}
+
+	Dijon::Filter *again = getMimeHandler(mimetype, m_cfg);
+	if (!again) {
+	    // If we can't find a filter, this doc can't be handled
+	    // but there can be other ones so we go on
+	    LOGERR(("FileInterner::internfile: no filter for [%s]\n",
+		    mimetype.c_str()));
+	    continue;
+	}
+	again->set_property(Dijon::Filter::OPERATING_MODE, 
+			    m_forPreview ? "view" : "index");
+	again->set_property(Dijon::Filter::DEFAULT_CHARSET, 
+			    charset);
+	// Feed the nested document's content (or an empty string if the
+	// parent gave none) to the new filter
+	string ns;
+	const string *txt = &ns;
+	it = docdata->find("content");
+	if (it != docdata->end())
+	    txt = &it->second;
+	if (!again->set_document_string(*txt)) {
+	    LOGERR(("FileInterner::internfile: error reparsing for %s\n", 
+		    m_fn.c_str()));
+	    delete again;
+	    continue;
+	}
+	// add filter and go on
+	m_handlers.push_back(again);
+	if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){
+	    LOGERR(("FileInterner::internfile: can't skip\n"));
+	    return FIError;
+	}
+    }
+
+    if (m_handlers.empty()) {
+	LOGERR(("FileInterner::internfile: stack empty\n"));
+	return FIError;
+    }
+    if (!m_forPreview) {
+	// Build the document's ipath from the ipath of each stacked
+	// filter. Named docipath so it does not shadow the parameter.
+	string &docipath = doc.ipath;
+	bool hasipath = false;
+	for (vector<Dijon::Filter*>::const_iterator it = m_handlers.begin();
+	     it != m_handlers.end(); ++it) {
+	    map<string,string>::const_iterator iti = 
+		(*it)->get_meta_data().find("ipath");
+	    if (iti != (*it)->get_meta_data().end()) {
+		if (!iti->second.empty())
+		    hasipath = true;
+		docipath += iti->second + "|";
+	    } else {
+		docipath += "|";
+	    }
+	}
+	if (hasipath) {
+	    LOGDEB(("IPATH [%s]\n", docipath.c_str()));
+	    // Trim the trailing separators
+	    string::size_type sit = docipath.find_last_not_of("|");
+	    if (sit == string::npos)
+		docipath.erase();
+	    else if (sit < docipath.length() -1)
+		docipath.erase(sit+1);
+	} else {
+	    docipath.erase();
+	}
+    }
+
+    dijontorcl(m_handlers.back(), doc);
+
+    // Destack what can be
+    while (!m_handlers.empty() && !m_handlers.back()->has_documents()) {
+	delete m_handlers.back();
+	m_handlers.pop_back();
+    }
+    // Anything left on top of the stack still has documents to offer
+    if (m_handlers.empty())
+	return FIDone;
+    else 
+	return FIAgain;
+}
+
+
+bool FileInterner::dijontorcl(Dijon::Filter *df, Rcl::Doc& doc)
+{
+    // Copy the metadata entries we know about from the filter's
+    // dictionary to the matching Rcl::Doc fields. A field whose entry
+    // is absent is left untouched. Always returns true.
+    const map<string,string>& meta = df->get_meta_data();
+    const map<string,string>::const_iterator none = meta.end();
+    map<string,string>::const_iterator pos;
+
+    if ((pos = meta.find("mimetype")) != none)
+	doc.mimetype = pos->second;
+
+    if ((pos = meta.find("origcharset")) != none)
+	doc.origcharset = pos->second;
+
+    if ((pos = meta.find("content")) != none)
+	doc.text = pos->second;
+
+    if ((pos = meta.find("title")) != none)
+	doc.title = pos->second;
+
+    if ((pos = meta.find("keywords")) != none)
+	doc.keywords = pos->second;
+
+    if ((pos = meta.find("modificationdate")) != none)
+	doc.dmtime = pos->second;
+
+    // The abstract comes from "abstract", or else from "sample"
+    if ((pos = meta.find("abstract")) != none ||
+	(pos = meta.find("sample")) != none)
+	doc.abstract = pos->second;
+
+    return true;
 }

 FileInterner::~FileInterner()
 {
-    delete m_handler; 
-    m_handler = 0;
+    // Release the stacked filters, topmost first, then empty the stack
+    for (vector<Dijon::Filter*>::reverse_iterator rit = m_handlers.rbegin();
+	 rit != m_handlers.rend(); ++rit) {
+	delete *rit;
+    }
+    m_handlers.clear();
    tmpcleanup();
 }

@ -212,6 +378,8 @@ using namespace std;
 #include "debuglog.h"
 #include "rclinit.h"
 #include "internfile.h"
+#include "rclconfig.h"
+#include "rcldoc.h"

 static string thisprog;

--- a/src/internfile/internfile.h
+++ b/src/internfile/internfile.h
@ -16,14 +16,19 @@
 */
 #ifndef _INTERNFILE_H_INCLUDED_
 #define _INTERNFILE_H_INCLUDED_
-/* @(#$Id: internfile.h,v 1.6 2006-01-30 11:15:27 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: internfile.h,v 1.7 2006-12-15 12:40:02 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
+#include <vector>
+using std::string;
+using std::vector;

-#include "rclconfig.h"
-#include "rcldb.h"
+#include "Filter.h"

-class MimeHandler;
+class RclConfig;
+namespace Rcl {
+class Doc;
+}

 /// Turn external file into internal representation, according to mime
 /// type etc
@ -43,8 +48,8 @@ class FileInterner {
     *   mime type for the uncompressed version. This currently doubles up 
     *   to indicate that this object is for previewing (not indexing).
     */
-    FileInterner(const std::string &fn, RclConfig *cnf, const string& td,
-		 const std::string *mtype = 0);
+    FileInterner(const string &fn, RclConfig *cnf, const string& td,
+		 const string *mtype = 0);

    ~FileInterner();

@ -67,15 +72,16 @@ class FileInterner {
    Status internfile(Rcl::Doc& doc, string &ipath);

 private:
-    string m_fn;
-    RclConfig *m_cfg;
-    const string &m_tdir;
-    MimeHandler *m_handler;
-
-    string m_tfile;
-    string m_mime;
+    RclConfig             *m_cfg;
+    string                 m_fn;
+    bool                   m_forPreview;
+    // m_tdir and m_tfile are used only for decompressing input file if needed
+    const string&          m_tdir; 
+    string                 m_tfile;
+    vector<Dijon::Filter*> m_handlers;

    void tmpcleanup();
+    static bool dijontorcl(Dijon::Filter *, Rcl::Doc&);
 };

 #endif /* _INTERNFILE_H_INCLUDED_ */
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.7 2006-12-13 09:13:18 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.8 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -37,15 +37,15 @@ public:

 // Execute an external program to translate a file from its native format
 // to html. Then call the html parser to do the actual indexing
-MimeHandler::Status 
-MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn, 
-		       const string &mtype, Rcl::Doc &docout, string&)
+bool MimeHandlerExec::next_document()
 {
+    if (m_havedoc == false)
+	return false;
+    m_havedoc = false;
    if (params.empty()) {
 	// Hu ho
-	LOGERR(("MimeHandlerExec::mkDoc: empty params for mime %s\n",
-		mtype.c_str()));
-	return MimeHandler::MHError;
+	LOGERR(("MimeHandlerExec::mkDoc: empty params\n"));
+	return false;
    }

    // Command name
@ -54,10 +54,10 @@ MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn,
    // Build parameter list: delete cmd name and add the file name
    list<string>::iterator it = params.begin();
    list<string>myparams(++it, params.end());
-    myparams.push_back(fn);
+    myparams.push_back(m_fn);

    // Execute command and store the result text, which is supposedly html
-    string html;
+    string& html = m_metaData["content"];
    ExecCmd mexec;
    MEAdv adv;
    mexec.setAdvise(&adv);
@ -67,10 +67,12 @@ MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn,
    if (status) {
 	LOGERR(("MimeHandlerExec: command status 0x%x: %s\n", 
 		status, cmd.c_str()));
-	return MimeHandler::MHError;
+	return false;
    }

-    // Process/index  the html
-    MimeHandlerHtml hh;
-    return hh.mkDoc(conf, fn, html, mtype, docout);
+    m_metaData["origcharset"] = m_defcharset;
+    // All recoll filters output utf-8
+    m_metaData["charset"] = "utf-8";
+    m_metaData["mimetype"] = "text/html";
+    return true;
 }
--- a/src/internfile/mh_exec.h
+++ b/src/internfile/mh_exec.h
@ -16,7 +16,7 @@
 */
 #ifndef _MH_EXEC_H_INCLUDED_
 #define _MH_EXEC_H_INCLUDED_
-/* @(#$Id: mh_exec.h,v 1.2 2006-01-30 11:15:27 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: mh_exec.h,v 1.3 2006-12-15 12:40:02 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
 #include <list>
@ -29,14 +29,19 @@
    Turn external document into internal one by executing an external filter.
    The command to execute, and its parameters, come from the mimeconf file
 */
-class MimeHandlerExec : public MimeHandler {
+class MimeHandlerExec : public RecollFilter {
 public:
    std::list<std::string> params; // command name first, then its arguments
+    MimeHandlerExec(const string& mt) : RecollFilter(mt) {}
    virtual ~MimeHandlerExec() {}
-    virtual MimeHandler::Status 
-	mkDoc(RclConfig *conf, const std::string &fn, 
-	      const std::string &mtype, Rcl::Doc &docout, std::string&);
-
+    virtual bool set_document_file(const string &file_path) {
+	m_fn = file_path; // only recorded: the command runs in next_document()
+	m_havedoc = true; // cleared again by the next next_document() call
+	return true;
+    }
+    virtual bool next_document(); // runs the command, stores its output as "content"
+private:
+    string m_fn; // file name appended to the command's parameter list
 };

 #endif /* _MH_EXEC_H_INCLUDED_ */
--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@ -41,36 +41,31 @@ using namespace std;
 #endif /* NO_NAMESPACES */


-MimeHandler::Status 
-MimeHandlerHtml::mkDoc(RclConfig *conf, const string &fn, 
-			const string &mtype, Rcl::Doc &docout, string&)
+bool MimeHandlerHtml::set_document_file(const string &fn)
 {
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
    string otext; // whole file contents, handed to set_document_string()
    if (!file_to_string(fn, otext)) {
 	LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
-	return MimeHandler::MHError;
+	return false; // unreadable file: the filter state is left unchanged
    }
-    return mkDoc(conf, fn, otext, mtype, docout);
+    return set_document_string(otext); // same path as for in-memory input
 }

-MimeHandler::Status 
-MimeHandlerHtml::mkDoc(RclConfig *conf, const string &, 
-			 const string& htext,
-			 const string &mtype, Rcl::Doc &docout)
+bool MimeHandlerHtml::set_document_string(const string& htext) 
 {
-    //LOGDEB(("textHtmlToDoc: htext: %s\n", htext.c_str()));
-    // Character set handling: the initial guessed charset depends on
-    // external factors: possible hint (ie mime charset in a mail
-    // message), charset guessing, or default configured charset.
-    string charset;
-    if (!charsethint.empty()) {
-	charset = charsethint;
-    } else if (conf->getGuessCharset()) {
-	charset = csguess(htext, conf->getDefCharset());
-    } else
-	charset = conf->getDefCharset();
+    m_html = htext;   // keep a copy: parsing is deferred to next_document()
+    m_havedoc = true; // one document is now pending
+    return true;      // nothing is checked at this stage
+}

+bool MimeHandlerHtml::next_document()
+{
+    if (m_havedoc == false)
+	return false;
+    m_havedoc = false;
+    LOGDEB(("textHtmlToDoc: next_document\n"));
+    string charset = m_defcharset;

    // - We first try to convert from the default configured charset
    //   (which may depend of the current directory) to utf-8. If this
@ -80,16 +75,16 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
    //   instead of the configuration one.
    LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));

-    MyHtmlParser result;
+
+    MyHtmlParser p(m_metaData["content"]);
    for (int pass = 0; pass < 2; pass++) {
 	string transcoded;
 	LOGDEB(("Html::mkDoc: pass %d\n", pass));
-	MyHtmlParser p;
 	// Try transcoding. If it fails, use original text.
-	if (!transcode(htext, transcoded, charset, "UTF-8")) {
+	if (!transcode(m_html, transcoded, charset, "UTF-8")) {
 	    LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
 		    charset.c_str()));
-	    transcoded = htext;
+	    transcoded = m_html;
 	    // We don't know the charset, at all
 	    p.ocharset = p.charset = charset = "";
 	} else {
@ -102,31 +97,29 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
 	try {
 	    p.parse_html(transcoded);
 	    // No exception: ok?
-	    result = p;
 	    break;
 	} catch (bool diag) {
-	    result = p;
 	    if (diag == true)
 		break;
 	    LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
-		    charset.c_str(),result.doccharset.c_str()));
-	    if (!result.doccharset.empty() && 
-		!samecharset(result.doccharset, result.ocharset)) {
+		    charset.c_str(), p.doccharset.c_str()));
+	    if (!p.doccharset.empty() && 
+		!samecharset(p.doccharset, p.ocharset)) {
 		LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
-		charset = result.doccharset;
+		charset = p.doccharset;
 	    } else {
 		LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
-		return MimeHandler::MHError;
+		return false;
 	    }
 	}
    }

-    docout.origcharset = charset;
-    docout.text = result.dump;
-    //LOGDEB(("textHtmlToDoc: dump : %s\n", result.dump.c_str()));
-    docout.title = result.title;
-    docout.keywords = result.keywords;
-    docout.abstract = result.sample;
-    docout.dmtime = result.dmtime;
-    return MimeHandler::MHDone;
+    m_metaData["origcharset"] = m_defcharset;
+    m_metaData["charset"] = "utf-8";
+    m_metaData["title"] = p.title;
+    m_metaData["keywords"] = p.keywords;
+    m_metaData["modificationdate"] = p.dmtime;
+    m_metaData["sample"] = p.sample;
+    m_metaData["mimetype"] = "text/plain";
+    return true;
 }
--- a/src/internfile/mh_html.h
+++ b/src/internfile/mh_html.h
@ -16,7 +16,7 @@
 */
 #ifndef _HTML_H_INCLUDED_
 #define _HTML_H_INCLUDED_
-/* @(#$Id: mh_html.h,v 1.7 2006-01-30 11:15:27 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: mh_html.h,v 1.8 2006-12-15 12:40:02 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>

@ -24,26 +24,16 @@

 /**
 Translate html document to internal one. 
-
- There are 2 interfaces, depending if we're working on a file, or
- on a string. The string form is applied to the output of external
- handlers for foreign formats: they return a result in html, which
- has the advantage to be text (easy to use in shell-scripts), and
- semi-structured (can carry titles, abstracts, whatever)
 */
-class MimeHandlerHtml : public MimeHandler {
+class MimeHandlerHtml : public RecollFilter {
 public:
-    std::string charsethint;
-
-    /** Create internal document from html file (standard interface) */
-    virtual MimeHandler::Status 
-	mkDoc(RclConfig *conf, const std::string &fn, 
-	      const std::string &mtype, Rcl::Doc &docout, std::string&);
-
-    /** Create internal doc from html string (postfilter for external ones) */
-    virtual MimeHandler::Status 
-	mkDoc(RclConfig *conf, const std::string &fn, const std::string& htext,
-	      const std::string &mtype, Rcl::Doc &docout);
+    MimeHandlerHtml(const string& mt) : RecollFilter(mt) {}
+    virtual ~MimeHandlerHtml() {}
+    virtual bool set_document_file(const string &file_path); // reads the file, then as for a string
+    virtual bool set_document_string(const string &data); // stores the html text for later parsing
+    virtual bool next_document(); // parses the stored html and fills the metadata
+private:
+    string m_html; // html text set by set_document_string()
 };

 #endif /* _HTML_H_INCLUDED_ */
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.23 2006-12-07 08:06:20 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.24 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -23,192 +23,81 @@ static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.23 2006-12-07 08:06:20 dockes Exp
 #include <errno.h>
 #include <unistd.h>
 #include <time.h>
-#include <regex.h>

 #include <map>
 #include <sstream>

 #include "mimehandler.h"
-#include "debuglog.h"
-#include "csguess.h"
 #include "readfile.h"
 #include "transcode.h"
 #include "mimeparse.h"
-#include "indextext.h"
 #include "mh_mail.h"
 #include "debuglog.h"
 #include "smallut.h"
-#include "mimeparse.h"
 #include "mh_html.h"

 // binc imap mime definitions
 #include "mime.h"

-#ifndef NO_NAMESPACES
 using namespace std;
-#endif /* NO_NAMESPACES */

 static const int maxdepth = 20;

-MimeHandlerMail::~MimeHandlerMail()
+MimeHandlerMail::~MimeHandlerMail() 
 {
-    if (m_vfp) {
-	fclose((FILE *)m_vfp);
-	m_vfp = 0;
-    }
+    delete m_bincdoc; // parsed message created by set_document_file()/set_document_string()
+    if (m_fd >= 0) // a negative value means that no file is open
+	close(m_fd); // descriptor opened by set_document_file()
+    delete m_stream; // string stream created by set_document_string()
 }
-
-// We are called for two different file types: mbox-type folders
-// holding multiple messages, and maildir-type files with one message
-// ipath is non empty only when we are called for retrieving a single message
-// for preview. It is always empty during indexing, and we fill it up with 
-// the message number for the returned doc
-MimeHandler::Status 
-MimeHandlerMail::mkDoc(RclConfig *cnf, const string &fn, 
-			const string &mtype, Rcl::Doc &docout, string& ipath)
+bool MimeHandlerMail::set_document_file(const string &fn)
 {
-    LOGDEB2(("MimeHandlerMail::mkDoc: %s [%s]\n", mtype.c_str(), fn.c_str()));
-    m_conf = cnf;
-
-    if (!stringlowercmp("message/rfc822", mtype)) {
-	ipath = "";
-	int fd;
-	if ((fd = open(fn.c_str(), 0)) < 0) {
-	    LOGERR(("MimeHandlerMail::mkDoc: open(%s) errno %d\n",
-		    fn.c_str(), errno));
-	    return MimeHandler::MHError;
-	}
-	Binc::MimeDocument doc;
-	doc.parseFull(fd);
-	if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
-	    LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n",
-		    fn.c_str()));
-	    return MimeHandler::MHError;
-	}
-	MimeHandler::Status ret = processMsg(docout, doc, 0);
-	close(fd);
-	return ret;
-    } else  if (!stringlowercmp("text/x-mail", mtype)) {
-	return processmbox(fn, docout, ipath);
-    } else // hu ho
-	return MimeHandler::MHError;
-}
-
-static const  char *frompat = "^From .* [1-2][0-9][0-9][0-9][\r]*\n$";
-static regex_t fromregex;
-static bool regcompiled;
-
-MimeHandler::Status 
-MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
-{
-    int mtarg = 0;
-    if (ipath != "") {
-	sscanf(ipath.c_str(), "%d", &mtarg);
+    if (m_fd >= 0) {
+	close(m_fd);
+	m_fd = -1;
    }
-    LOGDEB2(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(),
-	    mtarg));
-
-    FILE *fp;
-    // Open the file on first call, then save/reuse the file pointer
-    if (!m_vfp) {
-	fp = fopen(fn.c_str(), "r");
-	if (fp == 0) {
-	    LOGERR(("MimeHandlerMail::processmbox: error opening %s\n", 
-		    fn.c_str()));
-	    return MimeHandler::MHError;
-	}
-	m_vfp = fp;
-    } else {
-	fp = (FILE *)m_vfp;
+    m_fd = open(fn.c_str(), 0);
+    if (m_fd < 0) {
+	LOGERR(("MimeHandlerMail::set_document_file: open(%s) errno %d\n",
+		fn.c_str(), errno));
+	return false;
    }
-    if (!regcompiled) {
-	regcomp(&fromregex, frompat, REG_NOSUB);
-	regcompiled = true;
-    }
-
-    // If we are called to retrieve a specific message, seek to bof
-    // (then scan up to the message). This is for the case where the
-    // same object is reused to fetch several messages (else the fp is
-    // just opened no need for a seek).  We could also check if the
-    // current message number is lower than the requested one and
-    // avoid rereading the whole thing in this case. But I'm not sure
-    // we're ever used in this way (multiple retrieves on same
-    // object).  So:
-    if (mtarg > 0) {
-	fseek(fp, 0, SEEK_SET);
-	m_msgnum = 0;
-    }
-
-    off_t start, end;
-    bool iseof = false;
-    bool hademptyline = true;
-    string msgtxt;
-    do  {
-	// Look for next 'From ' Line, start of message. Set start to
-	// line after this
-	char line[501];
-	for (;;) {
-	    if (!fgets(line, 500, fp)) {
-		// Eof hit while looking for 'From ' -> file done. We'd need
-		// another return code here
-		return MimeHandler::MHError;
-	    }
-	    if (line[0] == '\n' || line[0] == '\r') {
-		hademptyline = true;
-		continue;
-	    }
-	    if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
-		start = ftello(fp);
-		m_msgnum++;
-		break;
-	    }
-	    hademptyline = false;
-	}
-
-	// Look for next 'From ' line or eof, end of message.
-	for (;;) {
-	    end = ftello(fp);
-	    if (!fgets(line, 500, fp)) {
-		if (ferror(fp) || feof(fp))
-		    iseof = true;
-		break;
-	    }
-	    if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
-		break;
-	    }
-	    if (mtarg <= 0 || m_msgnum == mtarg) {
-		msgtxt += line;
-	    }
-	    if (line[0] == '\n' || line[0] == '\r') {
-		hademptyline = true;
-	    } else {
-		hademptyline = false;
-	    }
-	}
-	fseek(fp, end, SEEK_SET);
-    } while (mtarg > 0 && m_msgnum < mtarg);
-
-    stringstream s(msgtxt);
-    LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
-    Binc::MimeDocument doc;
-    doc.parseFull(s);
-    if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
-	LOGERR(("MimeHandlerMail::processMbox: mime parse error for %s\n",
+    delete m_bincdoc;
+    m_bincdoc = new Binc::MimeDocument;
+    m_bincdoc->parseFull(m_fd);
+    if (!m_bincdoc->isHeaderParsed() && !m_bincdoc->isAllParsed()) {
+	LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n",
 		fn.c_str()));
-	return MimeHandler::MHError;
+	return false;
    }
-
-    MimeHandler::Status ret = processMsg(docout, doc, 0);
-
-    if (ret == MimeHandler::MHError)
-	return ret;
-    char buf[20];
-    sprintf(buf, "%d", m_msgnum);
-    ipath = buf;
-    return iseof ? MimeHandler::MHDone : 
-	(mtarg > 0) ? MimeHandler::MHDone : MimeHandler::MHAgain;
+    m_havedoc = true;
+    return true;
 }

+bool MimeHandlerMail::set_document_string(const string &msgtxt)
+{
+    LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
+
+    // Replace the previous input stream and parsed message with new
+    // ones built from the message text
+    delete m_stream;
+    m_stream = new stringstream(msgtxt);
+    delete m_bincdoc;
+    m_bincdoc = new Binc::MimeDocument;
+    m_bincdoc->parseFull(*m_stream);
+
+    // Having either the headers or the whole message parsed is enough
+    if (m_bincdoc->isHeaderParsed() || m_bincdoc->isAllParsed()) {
+	m_havedoc = true;
+	return true;
+    }
+    LOGERR(("MimeHandlerMail::set_document_string: mime parse error\n"));
+    return false;
+}
+
+bool MimeHandlerMail::next_document()
+{
+    // The pending-document flag is consumed by this call, whatever the
+    // outcome of the message processing
+    const bool pending = m_havedoc;
+    m_havedoc = false;
+    if (!pending)
+	return false;
+    m_metaData["mimetype"] = "text/plain";
+    return processMsg(m_bincdoc, 0);
+}

 // Transform a single message into a document. The subject becomes the
 // title, and any simple body part with a content-type of text or html
@ -217,58 +106,59 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
 // If depth is not zero, we're called recursively for an
 // message/rfc822 part and we must not touch the doc fields except the
 // text
-MimeHandler::Status 
-MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc, 
-			    int depth)
+bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
 {
    LOGDEB2(("MimeHandlerMail::processMsg: depth %d\n", depth));
    if (depth++ >= maxdepth) {
 	// Have to stop somewhere
 	LOGDEB(("MimeHandlerMail::processMsg: maxdepth %d exceeded\n", 
 		maxdepth));
-	return MimeHandler::MHDone;
+	// Return true anyway, better to index partially than not at all
+	return true;
    }
 	
    // Handle some headers. 
+    string& text = m_metaData["content"];
    Binc::HeaderItem hi;
    string transcoded;
-    if (doc.h.getFirstHeader("From", hi)) {
+    if (doc->h.getFirstHeader("From", hi)) {
 	rfc2047_decode(hi.getValue(), transcoded);
-	docout.text += string("From: ") + transcoded + string("\n");
+	text += string("From: ") + transcoded + string("\n");
    }
-    if (doc.h.getFirstHeader("To", hi)) {
+    if (doc->h.getFirstHeader("To", hi)) {
 	rfc2047_decode(hi.getValue(), transcoded);
-	docout.text += string("To: ") + transcoded + string("\n");
+	text += string("To: ") + transcoded + string("\n");
    }
-    if (doc.h.getFirstHeader("Date", hi)) {
+    if (doc->h.getFirstHeader("Date", hi)) {
 	rfc2047_decode(hi.getValue(), transcoded);
 	if (depth == 1) {
 	    time_t t = rfc2822DateToUxTime(transcoded);
 	    if (t != (time_t)-1) {
 		char ascuxtime[100];
 		sprintf(ascuxtime, "%ld", (long)t);
-		docout.dmtime = ascuxtime;
+		m_metaData["modificationdate"] = ascuxtime;
 	    } else {
 		// Leave mtime field alone, ftime will be used instead.
 		LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
 	    }
 	}
-	docout.text += string("Date: ") + transcoded + string("\n");
+	text += string("Date: ") + transcoded + string("\n");
    }
-    if (doc.h.getFirstHeader("Subject", hi)) {
+    if (doc->h.getFirstHeader("Subject", hi)) {
 	rfc2047_decode(hi.getValue(), transcoded);
 	if (depth == 1)
-	    docout.title = transcoded;
-	docout.text += string("Subject: ") + transcoded + string("\n");
+	    m_metaData["title"] = transcoded;
+	text += string("Subject: ") + transcoded + string("\n");
    }
-    docout.text += '\n';
+    text += '\n';

-    LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
-	    doc.isMultipart(), doc.getSubType().c_str()));
-    walkmime(docout, doc, depth);
+    LOGDEB2(("MimeHandlerMail::rocessMsg:ismultipart %d mime subtype '%s'\n",
+	    doc->isMultipart(), doc->getSubType().c_str()));
+    walkmime(doc, depth);

-    LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", docout.text.c_str()));
-    return MimeHandler::MHDone;
+    LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", 
+	    m_metaData["content"].c_str()));
+    return true;
 }

 // Recursively walk the message mime parts and concatenate all the
@ -281,8 +171,7 @@ MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
 // 
 // multipart can be mixed, alternative, parallel, digest.
 // message/rfc822 may also be of interest.
-
-void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
+void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
 {
    LOGDEB2(("MimeHandlerMail::walkmime: depth %d\n", depth));
    if (depth++ >= maxdepth) {
@ -290,28 +179,29 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
 	return;
    }

-    string &out = docout.text;
+    string& out = m_metaData["content"];

-    if (doc.isMultipart()) {
+    if (doc->isMultipart()) {
 	LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n", 
-		doc.isMultipart(), doc.getSubType().c_str()));
+		doc->isMultipart(), doc->getSubType().c_str()));
 	// We only handle alternative, related and mixed (no digests). 
 	std::vector<Binc::MimePart>::iterator it;

-	if (!stringicmp("mixed", doc.getSubType()) || 
-	    !stringicmp("related", doc.getSubType())) {
+	if (!stringicmp("mixed", doc->getSubType()) || 
+	    !stringicmp("related", doc->getSubType())) {
 	    // Multipart mixed and related:  process each part.
-	    for (it = doc.members.begin(); it != doc.members.end();it++) {
-		walkmime(docout, *it, depth);
+	    for (it = doc->members.begin(); it != doc->members.end();it++) {
+		walkmime(&(*it), depth);
 	    }

-	} else if (!stringicmp("alternative", doc.getSubType())) {
+	} else if (!stringicmp("alternative", doc->getSubType())) {
 	    // Multipart/alternative: look for a text/plain part, then html.
 	    // Process if found
 	    std::vector<Binc::MimePart>::iterator ittxt, ithtml;
-	    ittxt = ithtml = doc.members.end();
+	    ittxt = ithtml = doc->members.end();
 	    int i = 1;
-	    for (it = doc.members.begin(); it != doc.members.end();it++, i++) {
+	    for (it = doc->members.begin(); 
+		 it != doc->members.end(); it++, i++) {
 		// Get and parse content-type header
 		Binc::HeaderItem hi;
 		if (!it->h.getFirstHeader("Content-Type", hi)) {
@ -326,12 +216,12 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
 		else if (!stringlowercmp("text/html", content_type.value)) 
 		    ithtml = it;
 	    }
-	    if (ittxt != doc.members.end()) {
+	    if (ittxt != doc->members.end()) {
 		LOGDEB2(("walkmime: alternative: chose text/plain part\n"))
-		walkmime(docout, *ittxt, depth);
-	    } else if (ithtml != doc.members.end()) {
+		    walkmime(&(*ittxt), depth);
+	    } else if (ithtml != doc->members.end()) {
 		LOGDEB2(("walkmime: alternative: chose text/html part\n"))
-		walkmime(docout, *ithtml, depth);
+		    walkmime(&(*ithtml), depth);
 	    }
 	}
 	return;
@ -343,7 +233,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
    // Get and parse content-type header.
    Binc::HeaderItem hi;
    string ctt = "text/plain";
-    if (doc.h.getFirstHeader("Content-Type", hi)) {
+    if (doc->h.getFirstHeader("Content-Type", hi)) {
 	ctt = hi.getValue();
    }
    LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
@ -352,7 +242,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
 	    
    // Get and parse Content-Disposition header
    string ctd = "inline";
-    if (doc.h.getFirstHeader("Content-Disposition", hi)) {
+    if (doc->h.getFirstHeader("Content-Disposition", hi)) {
 	ctd = hi.getValue();
    }
    MimeHeaderValue content_disposition;
@ -371,13 +261,13 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
    if (it != content_disposition.params.end())
 	filename = it->second;

-    if (doc.isMessageRFC822()) {
+    if (doc->isMessageRFC822()) {
 	LOGDEB2(("walkmime: message/RFC822 part\n"));
 	
 	// The first part is the already parsed message.  Call
 	// processMsg instead of walkmime so that mail headers get
 	// printed. The depth will tell it what to do
-	if (doc.members.empty()) {
+	if (doc->members.empty()) {
 	    //??
 	    return;
 	}
@ -388,7 +278,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
 	if (m_forPreview)
 	    out += "]";
 	out += "\n\n";
-	processMsg(docout, doc.members[0], depth);
+	processMsg(&doc->members[0], depth);
 	return;
    }

@ -437,14 +327,14 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)

    // Content transfer encoding
    string cte = "7bit";
-    if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
+    if (doc->h.getFirstHeader("Content-Transfer-Encoding", hi)) {
 	cte = hi.getValue();
    } 

    LOGDEB2(("walkmime: final: body start offset %d, length %d\n", 
-	     doc.getBodyStartOffset(), doc.getBodyLength()));
+	     doc->getBodyStartOffset(), doc->getBodyLength()));
    string body;
-    doc.getBody(body, 0, doc.bodylength);
+    doc->getBody(body, 0, doc->bodylength);

    // Decode according to content transfer encoding
    if (!stringlowercmp("quoted-printable", cte)) {
@ -472,22 +362,30 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)

    // Handle html stripping and transcoding to utf8
    string utf8;
+    const string *putf8 = 0;
    if (!stringlowercmp("text/html", content_type.value)) {
-	MimeHandlerHtml mh;
-	Rcl::Doc hdoc;
-	mh.charsethint = charset;
-	mh.mkDoc(m_conf, "", body, content_type.value,  hdoc);
-	utf8 = hdoc.text;
+	MimeHandlerHtml mh("text/html");
+	mh.set_property(Dijon::Filter::OPERATING_MODE, 
+			m_forPreview ? "view" : "index");
+	mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
+	mh.set_document_string(body);
+	mh.next_document();
+	map<string, string>::const_iterator it = 
+	    mh.get_meta_data().find("content");
+	if (it != mh.get_meta_data().end())
+	    putf8 = &it->second;
    } else {
 	// Transcode to utf-8 
 	if (!transcode(body, utf8, charset, "UTF-8")) {
 	    LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
 		    charset.c_str()));
-	    utf8 = body;
+	    putf8 = &body;
+	} else {
+	    putf8 = &utf8;
 	}
    }
-
-    out += utf8;
+    if (putf8)
+	out += *putf8;
    if (out.length() && out[out.length()-1] != '\n')
 	out += '\n';
    
--- a/src/internfile/mh_mail.h
+++ b/src/internfile/mh_mail.h
@ -16,8 +16,9 @@
 */
 #ifndef _MAIL_H_INCLUDED_
 #define _MAIL_H_INCLUDED_
-/* @(#$Id: mh_mail.h,v 1.8 2006-09-19 14:30:39 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: mh_mail.h,v 1.9 2006-12-15 12:40:02 dockes Exp $  (C) 2004 J.F.Dockes */

+#include <sstream>
 #include "mimehandler.h"

 namespace Binc {
@ -30,26 +31,21 @@ namespace Binc {
 * for maildir files). This has to keep state while parsing a mail folder
 * file. 
 */
-class MimeHandlerMail : public MimeHandler {
+class MimeHandlerMail : public RecollFilter {
 public:
-    MimeHandlerMail() : m_vfp(0), m_msgnum(0), m_conf(0) {}
-
-    virtual MimeHandler::Status 
-	mkDoc(RclConfig *conf, const std::string &fn, 
-	      const std::string &mtype, Rcl::Doc &docout, std::string& ipath);
-
+    MimeHandlerMail(const string &mt) 
+	: RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0) 
+    {}
    virtual ~MimeHandlerMail();
-
+    virtual bool set_document_file(const string &file_path);
+    virtual bool set_document_string(const string &data);
+    virtual bool next_document();
 private:
-    void      *m_vfp;    // File pointer for folder
-    int        m_msgnum; // Current message number in folder. Starts at 1
-    RclConfig *m_conf;   // Keep pointer to rclconfig around
-
-    MimeHandler::Status processmbox(const string &fn, Rcl::Doc &docout, 
-				   string &ipath);
-    MimeHandler::Status processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
-				   int depth);
-    void walkmime(Rcl::Doc &docout, Binc::MimePart& doc, int depth);
+    Binc::MimeDocument *m_bincdoc;
+    bool processMsg(Binc::MimePart *doc, int depth);
+    void walkmime(Binc::MimePart* doc, int depth);
+    int m_fd;
+    std::stringstream *m_stream;
 };

 #endif /* _MAIL_H_INCLUDED_ */
--- a/src/internfile/mh_mbox.cpp
+++ b/src/internfile/mh_mbox.cpp
@ -0,0 +1,166 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.1 2006-12-15 12:40:24 dockes Exp $ (C) 2005 J.F.Dockes";
+#endif
+/*
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <regex.h>
+
+#include <map>
+#include <sstream>
+
+#include "mimehandler.h"
+#include "debuglog.h"
+#include "readfile.h"
+#include "mh_mbox.h"
+#include "smallut.h"
+
+using namespace std;
+
+// Close the folder's stdio stream if one is still open.
+MimeHandlerMbox::~MimeHandlerMbox()
+{
+    FILE *fp = (FILE *)m_vfp;
+    if (fp != 0) {
+	fclose(fp);
+	m_vfp = 0;
+    }
+}
+
+// Open the mail folder file fn for reading, closing any folder previously
+// opened by this object.
+//
+// The message counter restarts at 0 so that the message numbers reported by
+// next_document() are relative to the new file.
+// Returns true and marks documents as available if the file could be
+// opened, false (no documents available) otherwise.
+bool MimeHandlerMbox::set_document_file(const string &fn)
+{
+    LOGDEB(("MimeHandlerMbox::set_document_file(%s)\n", fn.c_str()));
+    m_fn = fn;
+    // Forget everything about a previously opened folder: we have nothing
+    // to return until the new one is open, and numbering starts again.
+    m_havedoc = false;
+    m_msgnum = 0;
+    if (m_vfp) {
+	fclose((FILE *)m_vfp);
+	m_vfp = 0;
+    }
+
+    m_vfp = fopen(fn.c_str(), "r");
+    if (m_vfp == 0) {
+	LOGERR(("MimeHandlerMbox::set_document_file: error opening %s\n", 
+		fn.c_str()));
+	return false;
+    }
+    m_havedoc = true;
+    return true;
+}
+
+// Pattern for the "From " postmark line which starts each message: "From ",
+// arbitrary text, then a space and a 4-digit year (first digit 1 or 2) at
+// the end of the line, optionally followed by CR characters before the
+// newline. The compiled form is shared by all handler objects and built on
+// first use.
+static const  char *frompat = "^From .* [1-2][0-9][0-9][0-9][\r]*\n$";
+static regex_t fromregex;
+static bool regcompiled;
+
+// Extract the next message from the folder opened by set_document_file().
+//
+// The raw message text is stored in m_metaData["content"], the message
+// number in m_metaData["ipath"], and m_metaData["mimetype"] is set to
+// "message/rfc822".
+// If m_ipath is set, it is read as a message number: the folder is rescanned
+// from the beginning and only the text of that message is returned.
+// Returns false if the folder is not open, if no documents are left, when
+// previewing without an ipath, if the postmark pattern can't be compiled,
+// or if no message text could be extracted.
+bool MimeHandlerMbox::next_document()
+{
+    if (m_vfp == 0) {
+	LOGERR(("MimeHandlerMbox::next_document: not open\n"));
+	return false;
+    }
+    if (!m_havedoc) {
+	return false;
+    }
+    FILE *fp = (FILE *)m_vfp;
+    int mtarg = 0;
+    if (m_ipath != "") {
+	sscanf(m_ipath.c_str(), "%d", &mtarg);
+    } else if (m_forPreview) {
+	// Can't preview an mbox
+	return false;
+    }
+    LOGDEB(("MimeHandlerMbox::next_document: fn %s, msgnum %d mtarg %d \n", 
+	    m_fn.c_str(), m_msgnum, mtarg));
+
+    if (!regcompiled) {
+	// Never run regexec() on a pattern which failed to compile
+	if (regcomp(&fromregex, frompat, REG_NOSUB) != 0) {
+	    LOGERR(("MimeHandlerMbox::next_document: regcomp failed\n"));
+	    return false;
+	}
+	regcompiled = true;
+    }
+
+    // If we are called to retrieve a specific message, seek to bof
+    // (then scan up to the message). This is for the case where the
+    // same object is reused to fetch several messages (else the fp is
+    // just opened no need for a seek).  We could also check if the
+    // current message number is lower than the requested one and
+    // avoid rereading the whole thing in this case. But I'm not sure
+    // we're ever used in this way (multiple retrieves on same
+    // object).  So:
+    if (mtarg > 0) {
+	fseek(fp, 0, SEEK_SET);
+	m_msgnum = 0;
+    }
+
+    off_t end;
+    bool iseof = false;
+    // The beginning of the file counts as following an empty line
+    bool hademptyline = true;
+    string& msgtxt = m_metaData["content"];
+    msgtxt.erase();
+    do  {
+	// Look for next 'From ' Line, start of message.
+	char line[501];
+	for (;;) {
+	    if (!fgets(line, 500, fp)) {
+		// Eof hit while looking for 'From ' -> file done. We'd need
+		// another return code here. Nothing more can be read from
+		// this folder, so stop advertising documents.
+		m_havedoc = false;
+		return false;
+	    }
+	    if (line[0] == '\n' || line[0] == '\r') {
+		hademptyline = true;
+		continue;
+	    }
+	    if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
+		m_msgnum++;
+		break;
+	    }
+	    hademptyline = false;
+	}
+
+	// Look for next 'From ' line or eof, end of message.
+	for (;;) {
+	    // Remember where this line starts, so that we can come back
+	    // to it if it turns out to be the next message's postmark
+	    end = ftello(fp);
+	    if (!fgets(line, 500, fp)) {
+		if (ferror(fp) || feof(fp))
+		    iseof = true;
+		break;
+	    }
+	    if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
+		break;
+	    }
+	    // When looking for a specific message, only keep its text
+	    if (mtarg <= 0 || m_msgnum == mtarg) {
+		msgtxt += line;
+	    }
+	    if (line[0] == '\n' || line[0] == '\r') {
+		hademptyline = true;
+	    } else {
+		hademptyline = false;
+	    }
+	}
+	// Reposition on the postmark line. The offset is an off_t from
+	// ftello(): use fseeko() so that it is not truncated to a long.
+	fseeko(fp, end, SEEK_SET);
+    } while (mtarg > 0 && m_msgnum < mtarg);
+
+    LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
+    char buf[20];
+    sprintf(buf, "%d", m_msgnum);
+    m_metaData["ipath"] = buf;
+    m_metaData["mimetype"] = "message/rfc822";
+    if (iseof)
+	m_havedoc = false;
+    return msgtxt.empty() ? false : true;
+}
--- a/src/internfile/mh_mbox.h
+++ b/src/internfile/mh_mbox.h
@ -0,0 +1,51 @@
+/*
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+#ifndef _MBOX_H_INCLUDED_
+#define _MBOX_H_INCLUDED_
+/* @(#$Id: mh_mbox.h,v 1.1 2006-12-15 12:40:24 dockes Exp $  (C) 2004 J.F.Dockes */
+
+#include <string>
+using std::string;
+
+#include "mimehandler.h"
+
+/** 
+ * Translate a mail folder file into internal documents (also works
+ * for maildir files). This has to keep state while parsing a mail folder
+ * file. 
+ */
+// Filter returning the messages found in a mail folder file one at a time.
+// The object keeps an open stdio stream and a running message counter
+// between calls to next_document().
+class MimeHandlerMbox : public RecollFilter {
+ public:
+    // Build a handler with no folder open and the message count at 0. The
+    // mime type is passed on to RecollFilter.
+    MimeHandlerMbox(const string& mime) 
+	: RecollFilter(mime), m_vfp(0), m_msgnum(0) 
+    {}
+    // Closes the folder file if it is still open
+    virtual ~MimeHandlerMbox();
+    // Open the folder file. Returns false if it can't be opened
+    virtual bool set_document_file(const string &file_path);
+    // Extract the next message (or the one designated by the last
+    // skip_to_document() call) into the "content" metadata entry
+    virtual bool next_document();
+    // Only records the requested ipath and always succeeds: the value is
+    // interpreted as a message number by the next next_document() call.
+    virtual bool skip_to_document(const string& ipath) {
+	m_ipath = ipath;
+	return true;
+    }
+
+ private:
+    string     m_fn;     // File name
+    void      *m_vfp;    // File pointer for folder
+    int        m_msgnum; // Current message number in folder. Starts at 1
+    string     m_ipath;  // Requested message number as text, empty if none
+};
+
+#endif /* _MBOX_H_INCLUDED_ */
--- a/src/internfile/mh_text.cpp
+++ b/src/internfile/mh_text.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mh_text.cpp,v 1.5 2006-03-20 15:14:08 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mh_text.cpp,v 1.6 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -31,34 +31,44 @@ using namespace std;
 #include "transcode.h"

 // Process a plain text file
-MimeHandler::Status MimeHandlerText::mkDoc(RclConfig *conf, const string &fn, 
-			     const string &mtype, Rcl::Doc &docout, string&)
+bool MimeHandlerText::set_document_file(const string &fn)
 {
    string otext;
    if (!file_to_string(fn, otext))
-	return MimeHandler::MHError;
-	
-    // Try to guess charset, then convert to utf-8, and fill document
-    // fields The charset guesser really doesnt work well in general
-    // and should be avoided (especially for short documents)
-    string charset;
-    if (conf->getGuessCharset()) {
-	charset = csguess(otext, conf->getDefCharset());
-    } else
-	charset = conf->getDefCharset();
+	return false;
+    return set_document_string(otext);
+}
+    
+// Keep a private copy of the input text and flag a document as available.
+// The transcoding work is left to next_document(). Always returns true.
+bool MimeHandlerText::set_document_string(const string& otext)
+{
+    m_text.assign(otext);
+    m_havedoc = true;
+    return m_havedoc;
+}

+bool MimeHandlerText::next_document()
+{	
+    if (m_havedoc == false)
+	return false;
+    m_havedoc = false;
    LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n", 
-	     charset.c_str()));
+	     m_defcharset.c_str()));

-    string utf8;
-    if (!transcode(otext, utf8, charset, "UTF-8")) {
+    // Avoid unneeded copy. This gets a reference to an empty string which is
+    // the entry for "content"
+    string& utf8 = m_metaData["content"];
+
+    // Note that we transcode always even if defcharset is already utf-8: 
+    // this validates the encoding.
+    if (!transcode(m_text, utf8, m_defcharset, "UTF-8")) {
 	LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
-		"for charset [%s]\n", charset.c_str()));
-	otext.erase();
-	return MimeHandler::MHError;
+		"for charset [%s]\n", m_defcharset.c_str()));
+	utf8.erase();
+	return false;
    }

-    docout.origcharset = charset;
-    docout.text = utf8;
-    return MimeHandler::MHDone;
+    m_metaData["origcharset"] = m_defcharset;
+    m_metaData["charset"] = "utf-8";
+    m_metaData["mimetype"] = "text/plain";
+    return true;
 }
--- a/src/internfile/mh_text.h
+++ b/src/internfile/mh_text.h
@ -16,12 +16,11 @@
 */
 #ifndef _MH_TEXT_H_INCLUDED_
 #define _MH_TEXT_H_INCLUDED_
-/* @(#$Id: mh_text.h,v 1.2 2006-01-30 11:15:27 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: mh_text.h,v 1.3 2006-12-15 12:40:02 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
+using std::string;

-#include "rclconfig.h"
-#include "rcldb.h"
 #include "mimehandler.h"

 /**
@ -29,12 +28,15 @@
 *
 * Maybe try to guess charset, or use default, then transcode to utf8
 */
-class MimeHandlerText : public MimeHandler {
+class MimeHandlerText : public RecollFilter {
 public:
-    MimeHandler::Status mkDoc(RclConfig *conf, const std::string &fn, 
-			      const std::string &mtype, Rcl::Doc &docout, 
-			      std::string&);
-    
+    MimeHandlerText(const string& mt) : RecollFilter(mt) {}
+    virtual ~MimeHandlerText() {}
+    virtual bool set_document_file(const string &file_path);
+    virtual bool set_document_string(const string&);
+    virtual bool next_document();
+private:
+    string m_text;
 };

 #endif /* _MH_TEXT_H_INCLUDED_ */
--- a/src/internfile/mh_unknown.h
+++ b/src/internfile/mh_unknown.h
@ -16,24 +16,33 @@
 */
 #ifndef _MH_UNKNOWN_H_INCLUDED_
 #define _MH_UNKNOWN_H_INCLUDED_
-/* @(#$Id: mh_unknown.h,v 1.1 2006-03-28 09:36:53 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: mh_unknown.h,v 1.2 2006-12-15 12:40:02 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>

-#include "rclconfig.h"
-#include "rcldb.h"
 #include "mimehandler.h"

 /**
 * Handler for files with no content handler: does nothing.
 *
 */
-class MimeHandlerUnknown : public MimeHandler {
+class MimeHandlerUnknown : public RecollFilter {
 public:
-    MimeHandler::Status mkDoc(RclConfig *conf, const std::string &fn, 
-			      const std::string &mtype, Rcl::Doc &docout, 
-			      std::string&) {
-	return MimeHandler::MHDone;
+    MimeHandlerUnknown(const string& mt) : RecollFilter(mt) {}
+    virtual ~MimeHandlerUnknown() {}
+    virtual bool set_document_string(const string&) {
+	return m_havedoc = true;
+    }
+    virtual bool set_document_file(const string&) {
+	return m_havedoc = true;
+    }
+    virtual bool next_document() {
+	if (m_havedoc == false)
+	    return false;
+	m_havedoc = false; 
+	m_metaData["content"] = "";
+	m_metaData["mimetype"] = "text/plain";
+	return true;
    }
 };

--- a/src/internfile/mimehandler.cpp
+++ b/src/internfile/mimehandler.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.19 2006-12-13 09:13:18 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.20 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -20,37 +20,40 @@ static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.19 2006-12-13 09:13:18 dockes

 #include <iostream>
 #include <string>
-#ifndef NO_NAMESPACES
+
 using namespace std;
-#endif /* NO_NAMESPACES */

 #include "mimehandler.h"
 #include "debuglog.h"
+#include "rclconfig.h"
 #include "smallut.h"
+
+#include "mh_exec.h"
 #include "mh_html.h"
 #include "mh_mail.h"
+#include "mh_mbox.h"
 #include "mh_text.h"
-#include "mh_exec.h"
 #include "mh_unknown.h"
  
 /** Create internal handler object appropriate for given mime type */
-static MimeHandler *mhFactory(const string &mime)
+static Dijon::Filter *mhFactory(const string &mime)
 {
    if (!stringlowercmp("text/plain", mime))
-	return new MimeHandlerText;
+	return new MimeHandlerText("text/plain");
    else if (!stringlowercmp("text/html", mime))
-	return new MimeHandlerHtml;
+	return new MimeHandlerHtml("text/html");
    else if (!stringlowercmp("text/x-mail", mime))
-	return new MimeHandlerMail;
+	return new MimeHandlerMbox("text/x-mail");
    else if (!stringlowercmp("message/rfc822", mime))
-	return new MimeHandlerMail;
-    return 0;
+	return new MimeHandlerMail("message/rfc822");
+    else 
+	return new MimeHandlerUnknown("application/octet-stream");
 }

 /**
 * Return handler object for given mime type:
 */
-MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg)
+Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg)
 {
    // Get handler definition for mime type
    string hs;
@ -78,7 +81,7 @@ MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg)
 			mtype.c_str(), hs.c_str()));
 		return 0;
 	    }
-	    MimeHandlerExec *h = new MimeHandlerExec;
+	    MimeHandlerExec *h = new MimeHandlerExec(mtype.c_str());
 	    it++;
 	    h->params.push_back(cfg->findFilter(*it++));
 	    h->params.insert(h->params.end(), it, toks.end());
@ -93,7 +96,8 @@ MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg)
    bool indexunknown = false;
    cfg->getConfParam("indexallfilenames", &indexunknown);
    if (indexunknown) {
-	return new MimeHandlerUnknown;
+	LOGDEB(("getMimeHandler: returning MimeHandlerUnknown\n"));
+	return new MimeHandlerUnknown("application/octet-stream");
    } else {
 	return 0;
    }
--- a/src/internfile/mimehandler.h
+++ b/src/internfile/mimehandler.h
@ -16,60 +16,74 @@
 */
 #ifndef _MIMEHANDLER_H_INCLUDED_
 #define _MIMEHANDLER_H_INCLUDED_
-/* @(#$Id: mimehandler.h,v 1.12 2006-03-29 13:08:08 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: mimehandler.h,v 1.13 2006-12-15 12:40:02 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
 #include <list>
+using std::string;
+using std::list;

-#include "rclconfig.h"
-#include "rcldb.h"
+#include <Filter.h>

+class RclConfig;

-/**
- * Document interner class. 
- */
-class MimeHandler {
- public:
-    MimeHandler() : m_forPreview(false) {}
-    virtual ~MimeHandler() {}
+class RecollFilter : public Dijon::Filter {
+public:
+    RecollFilter(const string& mtype)
+	: Dijon::Filter(mtype), m_forPreview(false), m_havedoc(false)
+    {}
+    virtual ~RecollFilter() {}
+    virtual bool set_property(Properties p, const string &v) {
+	switch (p) {
+	case DEFAULT_CHARSET: 
+	    m_defcharset = v;
+	    break;
+	case OPERATING_MODE: 
+	    if (!v.empty() && v[0] == 'v') 
+		m_forPreview = true; 
+	    else 
+		m_forPreview = false;
+	    break;
+	}
+	return true;
+    }

-    /// Status from mkDoc method.
-    enum Status {MHError, MHDone, MHAgain};
-    /**
-     * Transform external data into internal utf8 document
-     *
-     * @param conf the global configuration
-     * @param filename File from which the data comes from
-     * @param mimetype its mime type (from the mimemap configuration file)
-     * @param outdoc   The output document
-     * @param ipath the access path for the document inside the file. 
-     *              For mono-document file types, this will always be empty. 
-     *              It is used, for example for mbox files which may contain
-     *              multiple emails. If this is not empty in input, then the
-     *              caller is requesting a single document (ie: for display).
-     *              If this is empty (during indexation), it will be filled-up
-     *              by the function, and all the file's documents will be 
-     *              returned by successive calls.
-     * @return The return value indicates if there are more documents to be 
-     *         fetched from the same file.
-     */
-    virtual MimeHandler::Status mkDoc(RclConfig * conf, 
-				      const std::string &filename, 
-				      const std::string &mimetype, 
-				      Rcl::Doc& outdoc,
-				      string& ipath) = 0;
+    // We don't use this for now
+    virtual bool set_document_uri(const std::string &) {return false;}

-    virtual void setForPreview(bool onoff) {m_forPreview = onoff;};
+    // Default implementations
+    virtual bool set_document_string(const std::string &) {return false;}
+    virtual bool set_document_data(const char *cp, unsigned int sz) {
+	return set_document_string(string(cp, sz));
+    }

- protected:
-    bool m_forPreview;
+    virtual bool has_documents() const {return m_havedoc;}
+
+    // Most doc types are single-doc
+    virtual bool skip_to_document(const string& s) {
+	if (s.empty())
+	    return true;
+	return false;
+    }
+
+    virtual DataInput get_required_data_input() const 
+    {return DOCUMENT_FILE_NAME;}
+    virtual string get_error() const {
+	return m_reason;
+    }
+
+protected:
+    bool   m_forPreview;
+    string m_defcharset;
+    string m_reason;
+    bool   m_havedoc;
 };

 /**
 * Return indexing handler object for the given mime type
 * returned pointer should be deleted by caller
 */
-extern MimeHandler *getMimeHandler(const std::string &mtyp, RclConfig *cfg);
+extern Dijon::Filter *getMimeHandler(const std::string &mtyp, RclConfig *cfg);

 /// Can this mime type be interned ?
 extern bool canIntern(const std::string mimetype, RclConfig *cfg);
--- a/src/internfile/myhtmlparse.h
+++ b/src/internfile/myhtmlparse.h
@ -37,11 +37,13 @@ class MyHtmlParser : public HtmlParser {
    bool in_body_tag; 
    bool in_pre_tag;
    bool pending_space;
-    string title, sample, keywords, dump, dmtime;
+    bool indexing_allowed;
+    string title, sample, keywords, dmtime;
+    string localdump;
+    string &dump;
    string ocharset; // This is the charset our user thinks the doc was
    string charset; // This is the charset it was supposedly converted to
    string doccharset; // Set this to value of charset parameter in header
-    bool indexing_allowed;
    void process_text(const string &text);
    void opening_tag(const string &tag, const map<string,string> &p);
    void closing_tag(const string &tag);
@ -52,5 +54,16 @@ class MyHtmlParser : public HtmlParser {
 	in_body_tag(false),
 	in_pre_tag(false),
 	pending_space(false),
-	indexing_allowed(true) { }
+	indexing_allowed(true),
+	dump(localdump)
+    { }
+    MyHtmlParser(string& buf) :
+	in_script_tag(false),
+	in_style_tag(false),
+	in_body_tag(false),
+	in_pre_tag(false),
+	pending_space(false),
+	indexing_allowed(true),
+	dump(buf)
+    { }
 };
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@ -8,8 +8,8 @@ LIBS = librcl.a

 all: $(LIBS)

-OBJS =  rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_text.o mimehandler.o myhtmlparse.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o
-DEPS =  rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp
+OBJS =  rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o
+DEPS =  rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp

 librcl.a : $(DEPS) $(OBJS) unac.o
 	ar ru librcl.a $(OBJS) unac.o
@ -35,6 +35,10 @@ mimetype.o : ../index/mimetype.cpp
 	$(CXX) $(ALL_CXXFLAGS) -c ../index/mimetype.cpp
 htmlparse.o : ../internfile/htmlparse.cpp
 	$(CXX) $(ALL_CXXFLAGS) -c ../internfile/htmlparse.cpp
+myhtmlparse.o : ../internfile/myhtmlparse.cpp
+	$(CXX) $(ALL_CXXFLAGS) -c ../internfile/myhtmlparse.cpp
+mimehandler.o : ../internfile/mimehandler.cpp
+	$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mimehandler.cpp
 internfile.o : ../internfile/internfile.cpp
 	$(CXX) $(ALL_CXXFLAGS) -c ../internfile/internfile.cpp
 mh_exec.o : ../internfile/mh_exec.cpp
@ -43,12 +47,10 @@ mh_html.o : ../internfile/mh_html.cpp
 	$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_html.cpp
 mh_mail.o : ../internfile/mh_mail.cpp
 	$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mail.cpp
+mh_mbox.o : ../internfile/mh_mbox.cpp
+	$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mbox.cpp
 mh_text.o : ../internfile/mh_text.cpp
 	$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_text.cpp
-mimehandler.o : ../internfile/mimehandler.cpp
-	$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mimehandler.cpp
-myhtmlparse.o : ../internfile/myhtmlparse.cpp
-	$(CXX) $(ALL_CXXFLAGS) -c ../internfile/myhtmlparse.cpp
 docseq.o : ../query/docseq.cpp
 	$(CXX) $(ALL_CXXFLAGS) -c ../query/docseq.cpp
 history.o : ../query/history.cpp
@ -124,6 +126,12 @@ mimetype.dep.stamp : ../index/mimetype.cpp
 htmlparse.dep.stamp : ../internfile/htmlparse.cpp
 	$(CXX) -M $(ALL_CXXFLAGS) ../internfile/htmlparse.cpp > htmlparse.dep
 	touch htmlparse.dep.stamp
+myhtmlparse.dep.stamp : ../internfile/myhtmlparse.cpp
+	$(CXX) -M $(ALL_CXXFLAGS) ../internfile/myhtmlparse.cpp > myhtmlparse.dep
+	touch myhtmlparse.dep.stamp
+mimehandler.dep.stamp : ../internfile/mimehandler.cpp
+	$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mimehandler.cpp > mimehandler.dep
+	touch mimehandler.dep.stamp
 internfile.dep.stamp : ../internfile/internfile.cpp
 	$(CXX) -M $(ALL_CXXFLAGS) ../internfile/internfile.cpp > internfile.dep
 	touch internfile.dep.stamp
@ -136,15 +144,12 @@ mh_html.dep.stamp : ../internfile/mh_html.cpp
 mh_mail.dep.stamp : ../internfile/mh_mail.cpp
 	$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_mail.cpp > mh_mail.dep
 	touch mh_mail.dep.stamp
+mh_mbox.dep.stamp : ../internfile/mh_mbox.cpp
+	$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_mbox.cpp > mh_mbox.dep
+	touch mh_mbox.dep.stamp
 mh_text.dep.stamp : ../internfile/mh_text.cpp
 	$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_text.cpp > mh_text.dep
 	touch mh_text.dep.stamp
-mimehandler.dep.stamp : ../internfile/mimehandler.cpp
-	$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mimehandler.cpp > mimehandler.dep
-	touch mimehandler.dep.stamp
-myhtmlparse.dep.stamp : ../internfile/myhtmlparse.cpp
-	$(CXX) -M $(ALL_CXXFLAGS) ../internfile/myhtmlparse.cpp > myhtmlparse.dep
-	touch myhtmlparse.dep.stamp
 docseq.dep.stamp : ../query/docseq.cpp
 	$(CXX) -M $(ALL_CXXFLAGS) ../query/docseq.cpp > docseq.dep
 	touch docseq.dep.stamp
@ -217,13 +222,14 @@ include csguess.dep
 include indexer.dep
 include mimetype.dep
 include htmlparse.dep
+include myhtmlparse.dep
+include mimehandler.dep
 include internfile.dep
 include mh_exec.dep
 include mh_html.dep
 include mh_mail.dep
+include mh_mbox.dep
 include mh_text.dep
-include mimehandler.dep
-include myhtmlparse.dep
 include docseq.dep
 include history.dep
 include sortseq.dep
--- a/src/lib/mkMake
+++ b/src/lib/mkMake
@ -13,13 +13,14 @@ ${depth}/index/csguess.cpp \
 ${depth}/index/indexer.cpp \
 ${depth}/index/mimetype.cpp \
 ${depth}/internfile/htmlparse.cpp \
+${depth}/internfile/myhtmlparse.cpp \
+${depth}/internfile/mimehandler.cpp \
 ${depth}/internfile/internfile.cpp \
 ${depth}/internfile/mh_exec.cpp \
 ${depth}/internfile/mh_html.cpp \
 ${depth}/internfile/mh_mail.cpp \
+${depth}/internfile/mh_mbox.cpp \
 ${depth}/internfile/mh_text.cpp \
-${depth}/internfile/mimehandler.cpp \
-${depth}/internfile/myhtmlparse.cpp \
 ${depth}/query/docseq.cpp \
 ${depth}/query/history.cpp \
 ${depth}/query/sortseq.cpp \
--- a/src/utils/smallut.cpp
+++ b/src/utils/smallut.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: smallut.cpp,v 1.22 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: smallut.cpp,v 1.23 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -260,13 +260,14 @@ bool stringToStrings(const string &s, std::list<string> &tokens)
 }

 void stringToTokens(const string& str, list<string>& tokens,
-		    const string& delims)
+		    const string& delims, bool skipinit)
 {
-    string::size_type startPos, pos;
+    string::size_type startPos = 0, pos;

    for (pos = 0;;) { 
         // When skipinit is set, skip initial delims; break if this eats all.
-        if ((startPos = str.find_first_not_of(delims, pos)) == string::npos)
+        if (skipinit && 
+	    (startPos = str.find_first_not_of(delims, pos)) == string::npos)
 	    break;
        // Find next delimiter or end of string (end of token)
        pos = str.find_first_of(delims, startPos);
--- a/src/utils/smallut.h
+++ b/src/utils/smallut.h
@ -16,7 +16,7 @@
 */
 #ifndef _SMALLUT_H_INCLUDED_
 #define _SMALLUT_H_INCLUDED_
-/* @(#$Id: smallut.h,v 1.22 2006-12-14 13:53:43 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: smallut.h,v 1.23 2006-12-15 12:40:02 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include <list>
 #include <map>
@ -51,7 +51,7 @@ extern bool stringToStrings(const string &s, list<string> &tokens);
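 * Split input string into tokens at any of the delims characters.
 * Delimiters ahead of a token are skipped only when skipinit is true
 * (the default). No handling of quoting.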
 */
 extern void stringToTokens(const string &s, list<string> &tokens, 
-			   const string &delims = " \t");
+			   const string &delims = " \t", bool skipinit=true);

 /** Convert string to boolean */
 extern bool stringToBool(const string &s);