From 33c95ef1ba259dbf001a2ba2cf426fc94dfc5cd2 Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 15 Dec 2006 12:40:24 +0000 Subject: [PATCH] Dijon filters 1st step: mostly working needs check and optim --- src/internfile/Filter.h | 164 +++++++++++++++++ src/internfile/Makefile | 4 +- src/internfile/internfile.cpp | 258 +++++++++++++++++++++----- src/internfile/internfile.h | 32 ++-- src/internfile/mh_exec.cpp | 28 +-- src/internfile/mh_exec.h | 17 +- src/internfile/mh_html.cpp | 71 ++++---- src/internfile/mh_html.h | 28 +-- src/internfile/mh_mail.cpp | 318 +++++++++++---------------------- src/internfile/mh_mail.h | 32 ++-- src/internfile/mh_mbox.cpp | 166 +++++++++++++++++ src/internfile/mh_mbox.h | 51 ++++++ src/internfile/mh_text.cpp | 54 +++--- src/internfile/mh_text.h | 18 +- src/internfile/mh_unknown.h | 25 ++- src/internfile/mimehandler.cpp | 30 ++-- src/internfile/mimehandler.h | 92 ++++++---- src/internfile/myhtmlparse.h | 19 +- src/lib/Makefile | 34 ++-- src/lib/mkMake | 5 +- src/utils/smallut.cpp | 9 +- src/utils/smallut.h | 4 +- 22 files changed, 979 insertions(+), 480 deletions(-) create mode 100644 src/internfile/Filter.h create mode 100644 src/internfile/mh_mbox.cpp create mode 100644 src/internfile/mh_mbox.h diff --git a/src/internfile/Filter.h b/src/internfile/Filter.h new file mode 100644 index 00000000..4bc30602 --- /dev/null +++ b/src/internfile/Filter.h @@ -0,0 +1,164 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef _DIJON_FILTER_H +#define _DIJON_FILTER_H + +#include +#include +#include + +namespace Dijon +{ + class Filter; + + /** Provides the list of MIME types supported by the filter(s). + * The character string is allocated with new[]. + * This function is exported by dynamically loaded filter libraries. + */ + typedef bool (get_filter_types_func)(std::set &); + /** Returns what data should be passed to the filter(s). + * Output is cast from Filter::DataInput to int for convenience. + * This function is exported by dynamically loaded filter libraries. + * The aim is to let the client application know before-hand whether + * it should load documents or not. + */ + typedef int (get_filter_data_input_func)(void); + /** Returns a Filter that handles the given MIME type. + * The Filter object is allocated with new. + * This function is exported by dynamically loaded filter libraries + * and serves as a factory for Filter objects, so that the client + * application doesn't have to know which Filter sub-types handle + * which MIME types. + */ + typedef Filter *(get_filter_func)(const std::string &); + + /// Filter interface. + class Filter + { + public: + /// Builds an empty filter. + Filter(const std::string &mime_type) {} + /// Destroys the filter. + virtual ~Filter() {} + + + // Enumerations. + + /** What data a filter supports as input. + * It can be either the whole document data, its file name, or its URI. + */ + typedef enum { DOCUMENT_DATA=0, DOCUMENT_FILE_NAME, DOCUMENT_URI } DataInput; + + /** Input properties supported by the filter. + * - PREFERRED_CHARSET is the charset preferred by the client application. + * The filter will convert document's content to this charset if possible. + * - OPERATING_MODE can be set to either view or index. + */ + typedef enum { DEFAULT_CHARSET=0, OPERATING_MODE } Properties; + + + // Information. + + /// Returns what data the filter requires as input. + virtual DataInput get_required_data_input(void) const = 0; + + + // Initialization. + + /** Sets a property, prior to calling set_document_XXX(). + * Returns false if the property is not supported. + */ + virtual bool set_property(Properties prop_name, const std::string &prop_value) = 0; + + /** (Re)initializes the filter with the given data. + * Caller should ensure the given pointer is valid until the + * Filter object is destroyed, as some filters may not need to + * do a deep copy of the data. + * Returns false if this input is not supported or an error occured. + */ + virtual bool set_document_data(const char *data_ptr, unsigned int data_length) = 0; + virtual bool set_document_string(const string&) = 0; + + /** (Re)initializes the filter with the given file. + * Returns false if this input is not supported or an error occured. + */ + virtual bool set_document_file(const std::string &file_path) = 0; + + /** (Re)initializes the filter with the given URI. + * Returns false if this input is not supported or an error occured. + */ + virtual bool set_document_uri(const std::string &uri) = 0; + + + // Going from one nested document to the next. + + /** Returns true if there are nested documents left to extract. + * Returns false if the end of the parent document was reached + * or an error occured. + */ + virtual bool has_documents(void) const = 0; + + /** Moves to the next nested document. + * Returns false if there are none left. + */ + virtual bool next_document(void) = 0; + + /** Skips to the nested document with the given ipath. + * Returns false if no such document exists. + */ + virtual bool skip_to_document(const std::string &ipath) = 0; + + + // Accessing documents' contents. + + /// Returns the message for the most recent error that has occured. + virtual std::string get_error(void) const = 0; + + /** Returns a dictionary of metadata extracted from the current document. + * Metadata fields may include one or more of the following : + * content, title, ipath, mimetype, language, charset, author, creator, + * publisher, modificationdate, creationdate, size + * Special considerations apply : + * - content may contain binary data, watch out ! + * - ipath is an internal path to the nested document that can be + * later passed to skip_to_document(). It may be empty if the parent + * document's type doesn't allow embedding, in which case the filter + * should only return one document. + * - mimetype should be text/plain if the document could be handled + * internally, empty if unknown. If any other value, it is expected + * that the client application can pass the nested document's content + * to another filter that supports this particular type. + */ + const std::map &get_meta_data(void) const + { + return m_metaData; + } + + protected: + /// Metadata dictionary. + std::map m_metaData; + + private: + /// Filter objects cannot be copied. + Filter(const Filter &other); + /// Filter objects cannot be copied. + Filter& operator=(const Filter& other); + + }; +} + +#endif // _DIJON_FILTER_H diff --git a/src/internfile/Makefile b/src/internfile/Makefile index 69ffb566..07a950a4 100644 --- a/src/internfile/Makefile +++ b/src/internfile/Makefile @@ -1,9 +1,9 @@ -# @(#$Id: Makefile,v 1.1 2006-11-15 07:27:42 dockes Exp $ (C) 2005 J.F.Dockes +# @(#$Id: Makefile,v 1.2 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes depth = .. include $(depth)/mk/sysconf # Only test executables get build in here -PROGS = internfile unacpp textsplit rclconfig +PROGS = internfile all: $(BIGLIB) $(PROGS) diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 46bf5eb4..82381c78 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.18 2006-12-13 09:13:18 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.19 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -32,12 +32,14 @@ using namespace std; #endif /* NO_NAMESPACES */ #include "internfile.h" +#include "rcldoc.h" #include "mimetype.h" #include "debuglog.h" #include "mimehandler.h" #include "execmd.h" #include "pathut.h" #include "wipedir.h" +#include "rclconfig.h" // Execute the command to uncompress a file into a temporary one. static bool uncompressfile(RclConfig *conf, const string& ifn, @@ -106,98 +108,262 @@ void FileInterner::tmpcleanup() // internfile FileInterner::FileInterner(const std::string &f, RclConfig *cnf, const string& td, const string *imime) - : m_fn(f), m_cfg(cnf), m_tdir(td), m_handler(0) + : m_cfg(cnf), m_fn(f), m_forPreview(imime?true:false), m_tdir(td) { - // We are actually going to access the file, so it's ok - // performancewise to check this config variable at every call - // even if it can only change when we change directories - string usfc; - int usfci; - if (!cnf->getConfParam("usesystemfilecommand", usfc)) - usfci = 0; - else - usfci = atoi(usfc.c_str()) ? 1 : 0; + bool usfci = false; + cnf->getConfParam("usesystemfilecommand", &usfci); LOGDEB1(("FileInterner::FileInterner: usfci now %d\n", usfci)); - bool forPreview = imime ? true : false; - // We need to run mime type identification in any case to check // for a compressed file. - m_mime = mimetype(m_fn, m_cfg, usfci); + string l_mime = mimetype(m_fn, m_cfg, usfci); // If identification fails, try to use the input parameter. This // is then normally not a compressed type (it's the mime type from // the db), and is only set when previewing, not for indexing - if (m_mime.empty() && imime) - m_mime = *imime; + if (l_mime.empty() && imime) + l_mime = *imime; - if (!m_mime.empty()) { + if (!l_mime.empty()) { // Has mime: check for a compressed file. If so, create a // temporary uncompressed file, and rerun the mime type // identification, then do the rest with the temp file. listucmd; - if (m_cfg->getUncompressor(m_mime, ucmd)) { + if (m_cfg->getUncompressor(l_mime, ucmd)) { if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) { return; } LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n", m_tdir.c_str(), m_tfile.c_str())); m_fn = m_tfile; - m_mime = mimetype(m_fn, m_cfg, usfci); - if (m_mime.empty() && imime) - m_mime = *imime; + l_mime = mimetype(m_fn, m_cfg, usfci); + if (l_mime.empty() && imime) + l_mime = *imime; } } - if (m_mime.empty()) { + if (l_mime.empty()) { // No mime type. We let it through as config may warrant that // we index all file names LOGDEB(("internfile: (no mime) [%s]\n", m_fn.c_str())); } // Look for appropriate handler (might still return empty) - m_handler = getMimeHandler(m_mime, m_cfg); + Dijon::Filter *df = getMimeHandler(l_mime, m_cfg); - if (!m_handler) { + if (!df) { // No handler for this type, for now :( if indexallfilenames // is set in the config, this normally wont happen (we get mh_unknown) - LOGDEB(("FileInterner::FileInterner: %s: no handler\n", - m_mime.c_str())); + LOGDEB(("FileInterner:: no handler for %s\n", l_mime.c_str())); return; } - m_handler->setForPreview(forPreview); - LOGDEB(("FileInterner::FileInterner: %s [%s]\n", m_mime.c_str(), + df->set_property(Dijon::Filter::OPERATING_MODE, + m_forPreview ? "view" : "index"); + + string charset = m_cfg->getDefCharset(); + df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset); + if (!df->set_document_file(m_fn)) { + LOGERR(("FileInterner:: error parsing %s\n", m_fn.c_str())); + return; + } + m_handlers.reserve(20); + m_handlers.push_back(df); + LOGDEB(("FileInterner::FileInterner: %s [%s]\n", l_mime.c_str(), m_fn.c_str())); } +static const unsigned int MAXHANDLERS = 20; + FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) { - if (!m_handler) { - LOGERR(("FileInterner::internfile: no handler !!\n")); + if (m_handlers.size() != 1) { + LOGERR(("FileInterner::internfile: bad stack size %d !!\n", + m_handlers.size())); return FIError; } - // Turn file into a document. The document has fields for title, body - // etc., all text converted to utf8 - MimeHandler::Status mhs = - m_handler->mkDoc(m_cfg, m_fn, m_mime, doc, ipath); - FileInterner::Status ret = FIError; - switch (mhs) { - case MimeHandler::MHError: - LOGERR(("FileInterner::internfile: error parsing %s\n", m_fn.c_str())); - break; - case MimeHandler::MHDone: ret = FIDone;break; - case MimeHandler::MHAgain: ret = FIAgain;break; + // Note that the vector is big enough for the maximum stack. All values + // over the last significant one are "" + vector vipath(MAXHANDLERS); + int vipathidx = 0; + if (!ipath.empty()) { + list lipath; + stringToTokens(ipath, lipath, "|", true); + vipath.insert(vipath.begin(), lipath.begin(), lipath.end()); + if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){ + LOGERR(("FileInterner::internfile: can't skip\n")); + return FIError; + } } - doc.mimetype = m_mime; - return ret; + + /* Try to get doc from the topmost filter */ + while (!m_handlers.empty()) { + if (!vipath.empty()) { + + } + if (!m_handlers.back()->has_documents()) { + // No docs at the current top level. Pop and see if there + // is something at the previous one + delete m_handlers.back(); + m_handlers.pop_back(); + continue; + } + + if (!m_handlers.back()->next_document()) { + LOGERR(("FileInterner::internfile: next_document failed\n")); + return FIError; + } + + // Look at what we've got + const std::map *docdata = + &m_handlers.back()->get_meta_data(); + map::const_iterator it; + string charset; + it = docdata->find("charset"); + if (it != docdata->end()) + charset = it->second; + string mimetype; + it = docdata->find("mimetype"); + if (it != docdata->end()) + mimetype = it->second; + + LOGDEB(("FileInterner::internfile:next_doc is %s\n",mimetype.c_str())); + // If we find a text/plain doc, we're done + if (!strcmp(mimetype.c_str(), "text/plain")) + break; + + // Got a non text/plain doc. We need to stack another + // filter. Check current size + if (m_handlers.size() > MAXHANDLERS) { + // Stack too big. Skip this and go on to check if there is + // something else in the current back() + LOGDEB(("FileInterner::internfile: stack too high\n")); + continue; + } + + Dijon::Filter *again = getMimeHandler(mimetype, m_cfg); + if (!again) { + // If we can't find a filter, this doc can't be handled + // but there can be other ones so we go on + LOGERR(("FileInterner::internfile: no filter for [%s]\n", + mimetype.c_str())); + continue; + } + again->set_property(Dijon::Filter::OPERATING_MODE, + m_forPreview ? "view" : "index"); + again->set_property(Dijon::Filter::DEFAULT_CHARSET, + charset); + string ns; + const string *txt = &ns; + it = docdata->find("content"); + if (it != docdata->end()) + txt = &it->second; + if (!again->set_document_string(*txt)) { + LOGERR(("FileInterner::internfile: error reparsing for %s\n", + m_fn.c_str())); + delete again; + continue; + } + // add filter and go on + m_handlers.push_back(again); + if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){ + LOGERR(("FileInterner::internfile: can't skip\n")); + return FIError; + } + } + + if (m_handlers.empty()) { + LOGERR(("FileInterner::internfile: stack empty\n")); + return FIError; + } + if (!m_forPreview) { + string &ipath = doc.ipath; + bool hasipath = false; + for (vector::const_iterator it = m_handlers.begin(); + it != m_handlers.end(); it++) { + map::const_iterator iti = + (*it)->get_meta_data().find("ipath"); + if (iti != (*it)->get_meta_data().end()) { + if (!iti->second.empty()) + hasipath = true; + ipath += iti->second + "|"; + } else { + ipath += "|"; + } + } + if (hasipath) { + LOGDEB(("IPATH [%s]\n", ipath.c_str())); + string::size_type sit = ipath.find_last_not_of("|"); + if (sit == string::npos) + ipath.erase(); + else if (sit < ipath.length() -1) + ipath.erase(sit+1); + } else { + ipath.erase(); + } + } + + dijontorcl(m_handlers.back(), doc); + + // Destack what can be + while (!m_handlers.empty() && !m_handlers.back()->has_documents()) { + delete m_handlers.back(); + m_handlers.pop_back(); + } + if (m_handlers.empty() || !m_handlers.back()->has_documents()) + return FIDone; + else + return FIAgain; +} + + +bool FileInterner::dijontorcl(Dijon::Filter *df, Rcl::Doc& doc) +{ + const std::map *docdata = &df->get_meta_data(); + map::const_iterator it; + + it = docdata->find("mimetype"); + if (it != docdata->end()) + doc.mimetype = it->second; + + it = docdata->find("origcharset"); + if (it != docdata->end()) + doc.origcharset = it->second; + + it = docdata->find("content"); + if (it != docdata->end()) + doc.text = it->second; + + it = docdata->find("title"); + if (it != docdata->end()) + doc.title = it->second; + + it = docdata->find("keywords"); + if (it != docdata->end()) + doc.keywords = it->second; + + it = docdata->find("modificationdate"); + if (it != docdata->end()) + doc.dmtime = it->second; + + it = docdata->find("abstract"); + if (it != docdata->end()) { + doc.abstract = it->second; + } else { + it = docdata->find("sample"); + if (it != docdata->end()) + doc.abstract = it->second; + } + return true; } FileInterner::~FileInterner() { - delete m_handler; - m_handler = 0; + while (!m_handlers.empty()) { + delete m_handlers.back(); + m_handlers.pop_back(); + } tmpcleanup(); } @@ -212,6 +378,8 @@ using namespace std; #include "debuglog.h" #include "rclinit.h" #include "internfile.h" +#include "rclconfig.h" +#include "rcldoc.h" static string thisprog; diff --git a/src/internfile/internfile.h b/src/internfile/internfile.h index 79078ef6..0eac6360 100644 --- a/src/internfile/internfile.h +++ b/src/internfile/internfile.h @@ -16,14 +16,19 @@ */ #ifndef _INTERNFILE_H_INCLUDED_ #define _INTERNFILE_H_INCLUDED_ -/* @(#$Id: internfile.h,v 1.6 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: internfile.h,v 1.7 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ #include +#include +using std::string; +using std::vector; -#include "rclconfig.h" -#include "rcldb.h" +#include "Filter.h" -class MimeHandler; +class RclConfig; +namespace Rcl { +class Doc; +} /// Turn external file into internal representation, according to mime /// type etc @@ -43,8 +48,8 @@ class FileInterner { * mime type for the uncompressed version. This currently doubles up * to indicate that this object is for previewing (not indexing). */ - FileInterner(const std::string &fn, RclConfig *cnf, const string& td, - const std::string *mtype = 0); + FileInterner(const string &fn, RclConfig *cnf, const string& td, + const string *mtype = 0); ~FileInterner(); @@ -67,15 +72,16 @@ class FileInterner { Status internfile(Rcl::Doc& doc, string &ipath); private: - string m_fn; - RclConfig *m_cfg; - const string &m_tdir; - MimeHandler *m_handler; - - string m_tfile; - string m_mime; + RclConfig *m_cfg; + string m_fn; + bool m_forPreview; + // m_tdir and m_tfile are used only for decompressing input file if needed + const string& m_tdir; + string m_tfile; + vector m_handlers; void tmpcleanup(); + static bool dijontorcl(Dijon::Filter *, Rcl::Doc&); }; #endif /* _INTERNFILE_H_INCLUDED_ */ diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index f9192b2c..14c57865 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.7 2006-12-13 09:13:18 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.8 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -37,15 +37,15 @@ public: // Execute an external program to translate a file from its native format // to html. Then call the html parser to do the actual indexing -MimeHandler::Status -MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn, - const string &mtype, Rcl::Doc &docout, string&) +bool MimeHandlerExec::next_document() { + if (m_havedoc == false) + return false; + m_havedoc = false; if (params.empty()) { // Hu ho - LOGERR(("MimeHandlerExec::mkDoc: empty params for mime %s\n", - mtype.c_str())); - return MimeHandler::MHError; + LOGERR(("MimeHandlerExec::mkDoc: empty params\n")); + return false; } // Command name @@ -54,10 +54,10 @@ MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn, // Build parameter list: delete cmd name and add the file name list::iterator it = params.begin(); listmyparams(++it, params.end()); - myparams.push_back(fn); + myparams.push_back(m_fn); // Execute command and store the result text, which is supposedly html - string html; + string& html = m_metaData["content"]; ExecCmd mexec; MEAdv adv; mexec.setAdvise(&adv); @@ -67,10 +67,12 @@ MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn, if (status) { LOGERR(("MimeHandlerExec: command status 0x%x: %s\n", status, cmd.c_str())); - return MimeHandler::MHError; + return false; } - // Process/index the html - MimeHandlerHtml hh; - return hh.mkDoc(conf, fn, html, mtype, docout); + m_metaData["origcharset"] = m_defcharset; + // All recoll filters output utf-8 + m_metaData["charset"] = "utf-8"; + m_metaData["mimetype"] = "text/html"; + return true; } diff --git a/src/internfile/mh_exec.h b/src/internfile/mh_exec.h index 40b88e01..c402106c 100644 --- a/src/internfile/mh_exec.h +++ b/src/internfile/mh_exec.h @@ -16,7 +16,7 @@ */ #ifndef _MH_EXEC_H_INCLUDED_ #define _MH_EXEC_H_INCLUDED_ -/* @(#$Id: mh_exec.h,v 1.2 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_exec.h,v 1.3 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -29,14 +29,19 @@ Turn external document into internal one by executing an external filter. The command to execute, and its parameters, come from the mimeconf file */ -class MimeHandlerExec : public MimeHandler { +class MimeHandlerExec : public RecollFilter { public: std::list params; + MimeHandlerExec(const string& mt) : RecollFilter(mt) {} virtual ~MimeHandlerExec() {} - virtual MimeHandler::Status - mkDoc(RclConfig *conf, const std::string &fn, - const std::string &mtype, Rcl::Doc &docout, std::string&); - + virtual bool set_document_file(const string &file_path) { + m_fn = file_path; + m_havedoc = true; + return true; + } + virtual bool next_document(); +private: + string m_fn; }; #endif /* _MH_EXEC_H_INCLUDED_ */ diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index 112f42e7..8479ceca 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -41,36 +41,31 @@ using namespace std; #endif /* NO_NAMESPACES */ -MimeHandler::Status -MimeHandlerHtml::mkDoc(RclConfig *conf, const string &fn, - const string &mtype, Rcl::Doc &docout, string&) +bool MimeHandlerHtml::set_document_file(const string &fn) { LOGDEB(("textHtmlToDoc: %s\n", fn.c_str())); string otext; if (!file_to_string(fn, otext)) { LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str())); - return MimeHandler::MHError; + return false; } - return mkDoc(conf, fn, otext, mtype, docout); + return set_document_string(otext); } -MimeHandler::Status -MimeHandlerHtml::mkDoc(RclConfig *conf, const string &, - const string& htext, - const string &mtype, Rcl::Doc &docout) +bool MimeHandlerHtml::set_document_string(const string& htext) { - //LOGDEB(("textHtmlToDoc: htext: %s\n", htext.c_str())); - // Character set handling: the initial guessed charset depends on - // external factors: possible hint (ie mime charset in a mail - // message), charset guessing, or default configured charset. - string charset; - if (!charsethint.empty()) { - charset = charsethint; - } else if (conf->getGuessCharset()) { - charset = csguess(htext, conf->getDefCharset()); - } else - charset = conf->getDefCharset(); + m_html = htext; + m_havedoc = true; + return true; +} +bool MimeHandlerHtml::next_document() +{ + if (m_havedoc == false) + return false; + m_havedoc = false; + LOGDEB(("textHtmlToDoc: next_document\n")); + string charset = m_defcharset; // - We first try to convert from the default configured charset // (which may depend of the current directory) to utf-8. If this @@ -80,16 +75,16 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &, // instead of the configuration one. LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str())); - MyHtmlParser result; + + MyHtmlParser p(m_metaData["content"]); for (int pass = 0; pass < 2; pass++) { string transcoded; LOGDEB(("Html::mkDoc: pass %d\n", pass)); - MyHtmlParser p; // Try transcoding. If it fails, use original text. - if (!transcode(htext, transcoded, charset, "UTF-8")) { + if (!transcode(m_html, transcoded, charset, "UTF-8")) { LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n", charset.c_str())); - transcoded = htext; + transcoded = m_html; // We don't know the charset, at all p.ocharset = p.charset = charset = ""; } else { @@ -102,31 +97,29 @@ MimeHandlerHtml::mkDoc(RclConfig *conf, const string &, try { p.parse_html(transcoded); // No exception: ok? - result = p; break; } catch (bool diag) { - result = p; if (diag == true) break; LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n", - charset.c_str(),result.doccharset.c_str())); - if (!result.doccharset.empty() && - !samecharset(result.doccharset, result.ocharset)) { + charset.c_str(), p.doccharset.c_str())); + if (!p.doccharset.empty() && + !samecharset(p.doccharset, p.ocharset)) { LOGDEB(("textHtmlToDoc: reparse for charsets\n")); - charset = result.doccharset; + charset = p.doccharset; } else { LOGERR(("textHtmlToDoc:: error: non charset exception\n")); - return MimeHandler::MHError; + return false; } } } - docout.origcharset = charset; - docout.text = result.dump; - //LOGDEB(("textHtmlToDoc: dump : %s\n", result.dump.c_str())); - docout.title = result.title; - docout.keywords = result.keywords; - docout.abstract = result.sample; - docout.dmtime = result.dmtime; - return MimeHandler::MHDone; + m_metaData["origcharset"] = m_defcharset; + m_metaData["charset"] = "utf-8"; + m_metaData["title"] = p.title; + m_metaData["keywords"] = p.keywords; + m_metaData["modificationdate"] = p.dmtime; + m_metaData["sample"] = p.sample; + m_metaData["mimetype"] = "text/plain"; + return true; } diff --git a/src/internfile/mh_html.h b/src/internfile/mh_html.h index 15a9d5f8..db1cf8da 100644 --- a/src/internfile/mh_html.h +++ b/src/internfile/mh_html.h @@ -16,7 +16,7 @@ */ #ifndef _HTML_H_INCLUDED_ #define _HTML_H_INCLUDED_ -/* @(#$Id: mh_html.h,v 1.7 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_html.h,v 1.8 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -24,26 +24,16 @@ /** Translate html document to internal one. - - There are 2 interfaces, depending if we're working on a file, or - on a string. The string form is applied to the output of external - handlers for foreign formats: they return a result in html, which - has the advantage to be text (easy to use in shell-scripts), and - semi-structured (can carry titles, abstracts, whatever) */ -class MimeHandlerHtml : public MimeHandler { +class MimeHandlerHtml : public RecollFilter { public: - std::string charsethint; - - /** Create internal document from html file (standard interface) */ - virtual MimeHandler::Status - mkDoc(RclConfig *conf, const std::string &fn, - const std::string &mtype, Rcl::Doc &docout, std::string&); - - /** Create internal doc from html string (postfilter for external ones) */ - virtual MimeHandler::Status - mkDoc(RclConfig *conf, const std::string &fn, const std::string& htext, - const std::string &mtype, Rcl::Doc &docout); + MimeHandlerHtml(const string& mt) : RecollFilter(mt) {} + virtual ~MimeHandlerHtml() {} + virtual bool set_document_file(const string &file_path); + virtual bool set_document_string(const string &data); + virtual bool next_document(); +private: + string m_html; }; #endif /* _HTML_H_INCLUDED_ */ diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index 65ff74e2..41611740 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.23 2006-12-07 08:06:20 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.24 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -23,192 +23,81 @@ static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.23 2006-12-07 08:06:20 dockes Exp #include #include #include -#include #include #include #include "mimehandler.h" -#include "debuglog.h" -#include "csguess.h" #include "readfile.h" #include "transcode.h" #include "mimeparse.h" -#include "indextext.h" #include "mh_mail.h" #include "debuglog.h" #include "smallut.h" -#include "mimeparse.h" #include "mh_html.h" // binc imap mime definitions #include "mime.h" -#ifndef NO_NAMESPACES using namespace std; -#endif /* NO_NAMESPACES */ static const int maxdepth = 20; -MimeHandlerMail::~MimeHandlerMail() +MimeHandlerMail::~MimeHandlerMail() { - if (m_vfp) { - fclose((FILE *)m_vfp); - m_vfp = 0; - } + delete m_bincdoc; + if (m_fd >= 0) + close(m_fd); + delete m_stream; } - -// We are called for two different file types: mbox-type folders -// holding multiple messages, and maildir-type files with one message -// ipath is non empty only when we are called for retrieving a single message -// for preview. It is always empty during indexing, and we fill it up with -// the message number for the returned doc -MimeHandler::Status -MimeHandlerMail::mkDoc(RclConfig *cnf, const string &fn, - const string &mtype, Rcl::Doc &docout, string& ipath) +bool MimeHandlerMail::set_document_file(const string &fn) { - LOGDEB2(("MimeHandlerMail::mkDoc: %s [%s]\n", mtype.c_str(), fn.c_str())); - m_conf = cnf; - - if (!stringlowercmp("message/rfc822", mtype)) { - ipath = ""; - int fd; - if ((fd = open(fn.c_str(), 0)) < 0) { - LOGERR(("MimeHandlerMail::mkDoc: open(%s) errno %d\n", - fn.c_str(), errno)); - return MimeHandler::MHError; - } - Binc::MimeDocument doc; - doc.parseFull(fd); - if (!doc.isHeaderParsed() && !doc.isAllParsed()) { - LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n", - fn.c_str())); - return MimeHandler::MHError; - } - MimeHandler::Status ret = processMsg(docout, doc, 0); - close(fd); - return ret; - } else if (!stringlowercmp("text/x-mail", mtype)) { - return processmbox(fn, docout, ipath); - } else // hu ho - return MimeHandler::MHError; -} - -static const char *frompat = "^From .* [1-2][0-9][0-9][0-9][\r]*\n$"; -static regex_t fromregex; -static bool regcompiled; - -MimeHandler::Status -MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath) -{ - int mtarg = 0; - if (ipath != "") { - sscanf(ipath.c_str(), "%d", &mtarg); + if (m_fd >= 0) { + close(m_fd); + m_fd = -1; } - LOGDEB2(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(), - mtarg)); - - FILE *fp; - // Open the file on first call, then save/reuse the file pointer - if (!m_vfp) { - fp = fopen(fn.c_str(), "r"); - if (fp == 0) { - LOGERR(("MimeHandlerMail::processmbox: error opening %s\n", - fn.c_str())); - return MimeHandler::MHError; - } - m_vfp = fp; - } else { - fp = (FILE *)m_vfp; + m_fd = open(fn.c_str(), 0); + if (m_fd < 0) { + LOGERR(("MimeHandlerMail::set_document_file: open(%s) errno %d\n", + fn.c_str(), errno)); + return false; } - if (!regcompiled) { - regcomp(&fromregex, frompat, REG_NOSUB); - regcompiled = true; - } - - // If we are called to retrieve a specific message, seek to bof - // (then scan up to the message). This is for the case where the - // same object is reused to fetch several messages (else the fp is - // just opened no need for a seek). We could also check if the - // current message number is lower than the requested one and - // avoid rereading the whole thing in this case. But I'm not sure - // we're ever used in this way (multiple retrieves on same - // object). So: - if (mtarg > 0) { - fseek(fp, 0, SEEK_SET); - m_msgnum = 0; - } - - off_t start, end; - bool iseof = false; - bool hademptyline = true; - string msgtxt; - do { - // Look for next 'From ' Line, start of message. Set start to - // line after this - char line[501]; - for (;;) { - if (!fgets(line, 500, fp)) { - // Eof hit while looking for 'From ' -> file done. We'd need - // another return code here - return MimeHandler::MHError; - } - if (line[0] == '\n' || line[0] == '\r') { - hademptyline = true; - continue; - } - if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { - start = ftello(fp); - m_msgnum++; - break; - } - hademptyline = false; - } - - // Look for next 'From ' line or eof, end of message. - for (;;) { - end = ftello(fp); - if (!fgets(line, 500, fp)) { - if (ferror(fp) || feof(fp)) - iseof = true; - break; - } - if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { - break; - } - if (mtarg <= 0 || m_msgnum == mtarg) { - msgtxt += line; - } - if (line[0] == '\n' || line[0] == '\r') { - hademptyline = true; - } else { - hademptyline = false; - } - } - fseek(fp, end, SEEK_SET); - } while (mtarg > 0 && m_msgnum < mtarg); - - stringstream s(msgtxt); - LOGDEB2(("Message text: [%s]\n", msgtxt.c_str())); - Binc::MimeDocument doc; - doc.parseFull(s); - if (!doc.isHeaderParsed() && !doc.isAllParsed()) { - LOGERR(("MimeHandlerMail::processMbox: mime parse error for %s\n", + delete m_bincdoc; + m_bincdoc = new Binc::MimeDocument; + m_bincdoc->parseFull(m_fd); + if (!m_bincdoc->isHeaderParsed() && !m_bincdoc->isAllParsed()) { + LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n", fn.c_str())); - return MimeHandler::MHError; + return false; } - - MimeHandler::Status ret = processMsg(docout, doc, 0); - - if (ret == MimeHandler::MHError) - return ret; - char buf[20]; - sprintf(buf, "%d", m_msgnum); - ipath = buf; - return iseof ? MimeHandler::MHDone : - (mtarg > 0) ? MimeHandler::MHDone : MimeHandler::MHAgain; + m_havedoc = true; + return true; } +bool MimeHandlerMail::set_document_string(const string &msgtxt) +{ + LOGDEB2(("Message text: [%s]\n", msgtxt.c_str())); + delete m_stream; + m_stream = new stringstream(msgtxt); + delete m_bincdoc; + m_bincdoc = new Binc::MimeDocument; + m_bincdoc->parseFull(*m_stream); + if (!m_bincdoc->isHeaderParsed() && !m_bincdoc->isAllParsed()) { + LOGERR(("MimeHandlerMail::set_document_string: mime parse error\n")); + return false; + } + m_havedoc = true; + return true; +} + +bool MimeHandlerMail::next_document() +{ + if (!m_havedoc) + return false; + m_havedoc = false; + m_metaData["mimetype"] = "text/plain"; + return processMsg(m_bincdoc, 0); +} // Transform a single message into a document. The subject becomes the // title, and any simple body part with a content-type of text or html @@ -217,58 +106,59 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath) // If depth is not zero, we're called recursively for an // message/rfc822 part and we must not touch the doc fields except the // text -MimeHandler::Status -MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc, - int depth) +bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth) { LOGDEB2(("MimeHandlerMail::processMsg: depth %d\n", depth)); if (depth++ >= maxdepth) { // Have to stop somewhere LOGDEB(("MimeHandlerMail::processMsg: maxdepth %d exceeded\n", maxdepth)); - return MimeHandler::MHDone; + // Return true anyway, better to index partially than not at all + return true; } // Handle some headers. + string& text = m_metaData["content"]; Binc::HeaderItem hi; string transcoded; - if (doc.h.getFirstHeader("From", hi)) { + if (doc->h.getFirstHeader("From", hi)) { rfc2047_decode(hi.getValue(), transcoded); - docout.text += string("From: ") + transcoded + string("\n"); + text += string("From: ") + transcoded + string("\n"); } - if (doc.h.getFirstHeader("To", hi)) { + if (doc->h.getFirstHeader("To", hi)) { rfc2047_decode(hi.getValue(), transcoded); - docout.text += string("To: ") + transcoded + string("\n"); + text += string("To: ") + transcoded + string("\n"); } - if (doc.h.getFirstHeader("Date", hi)) { + if (doc->h.getFirstHeader("Date", hi)) { rfc2047_decode(hi.getValue(), transcoded); if (depth == 1) { time_t t = rfc2822DateToUxTime(transcoded); if (t != (time_t)-1) { char ascuxtime[100]; sprintf(ascuxtime, "%ld", (long)t); - docout.dmtime = ascuxtime; + m_metaData["modificationdate"] = ascuxtime; } else { // Leave mtime field alone, ftime will be used instead. LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str())); } } - docout.text += string("Date: ") + transcoded + string("\n"); + text += string("Date: ") + transcoded + string("\n"); } - if (doc.h.getFirstHeader("Subject", hi)) { + if (doc->h.getFirstHeader("Subject", hi)) { rfc2047_decode(hi.getValue(), transcoded); if (depth == 1) - docout.title = transcoded; - docout.text += string("Subject: ") + transcoded + string("\n"); + m_metaData["title"] = transcoded; + text += string("Subject: ") + transcoded + string("\n"); } - docout.text += '\n'; + text += '\n'; - LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n", - doc.isMultipart(), doc.getSubType().c_str())); - walkmime(docout, doc, depth); + LOGDEB2(("MimeHandlerMail::rocessMsg:ismultipart %d mime subtype '%s'\n", + doc->isMultipart(), doc->getSubType().c_str())); + walkmime(doc, depth); - LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", docout.text.c_str())); - return MimeHandler::MHDone; + LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", + m_metaData["content"].c_str())); + return true; } // Recursively walk the message mime parts and concatenate all the @@ -281,8 +171,7 @@ MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc, // // multipart can be mixed, alternative, parallel, digest. // message/rfc822 may also be of interest. - -void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth) +void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth) { LOGDEB2(("MimeHandlerMail::walkmime: depth %d\n", depth)); if (depth++ >= maxdepth) { @@ -290,28 +179,29 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth) return; } - string &out = docout.text; + string& out = m_metaData["content"]; - if (doc.isMultipart()) { + if (doc->isMultipart()) { LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n", - doc.isMultipart(), doc.getSubType().c_str())); + doc->isMultipart(), doc->getSubType().c_str())); // We only handle alternative, related and mixed (no digests). std::vector::iterator it; - if (!stringicmp("mixed", doc.getSubType()) || - !stringicmp("related", doc.getSubType())) { + if (!stringicmp("mixed", doc->getSubType()) || + !stringicmp("related", doc->getSubType())) { // Multipart mixed and related: process each part. - for (it = doc.members.begin(); it != doc.members.end();it++) { - walkmime(docout, *it, depth); + for (it = doc->members.begin(); it != doc->members.end();it++) { + walkmime(&(*it), depth); } - } else if (!stringicmp("alternative", doc.getSubType())) { + } else if (!stringicmp("alternative", doc->getSubType())) { // Multipart/alternative: look for a text/plain part, then html. // Process if found std::vector::iterator ittxt, ithtml; - ittxt = ithtml = doc.members.end(); + ittxt = ithtml = doc->members.end(); int i = 1; - for (it = doc.members.begin(); it != doc.members.end();it++, i++) { + for (it = doc->members.begin(); + it != doc->members.end(); it++, i++) { // Get and parse content-type header Binc::HeaderItem hi; if (!it->h.getFirstHeader("Content-Type", hi)) { @@ -326,12 +216,12 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth) else if (!stringlowercmp("text/html", content_type.value)) ithtml = it; } - if (ittxt != doc.members.end()) { + if (ittxt != doc->members.end()) { LOGDEB2(("walkmime: alternative: chose text/plain part\n")) - walkmime(docout, *ittxt, depth); - } else if (ithtml != doc.members.end()) { + walkmime(&(*ittxt), depth); + } else if (ithtml != doc->members.end()) { LOGDEB2(("walkmime: alternative: chose text/html part\n")) - walkmime(docout, *ithtml, depth); + walkmime(&(*ithtml), depth); } } return; @@ -343,7 +233,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth) // Get and parse content-type header. Binc::HeaderItem hi; string ctt = "text/plain"; - if (doc.h.getFirstHeader("Content-Type", hi)) { + if (doc->h.getFirstHeader("Content-Type", hi)) { ctt = hi.getValue(); } LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str())); @@ -352,7 +242,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth) // Get and parse Content-Disposition header string ctd = "inline"; - if (doc.h.getFirstHeader("Content-Disposition", hi)) { + if (doc->h.getFirstHeader("Content-Disposition", hi)) { ctd = hi.getValue(); } MimeHeaderValue content_disposition; @@ -371,13 +261,13 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth) if (it != content_disposition.params.end()) filename = it->second; - if (doc.isMessageRFC822()) { + if (doc->isMessageRFC822()) { LOGDEB2(("walkmime: message/RFC822 part\n")); // The first part is the already parsed message. Call // processMsg instead of walkmime so that mail headers get // printed. The depth will tell it what to do - if (doc.members.empty()) { + if (doc->members.empty()) { //?? return; } @@ -388,7 +278,7 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth) if (m_forPreview) out += "]"; out += "\n\n"; - processMsg(docout, doc.members[0], depth); + processMsg(&doc->members[0], depth); return; } @@ -437,14 +327,14 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth) // Content transfer encoding string cte = "7bit"; - if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) { + if (doc->h.getFirstHeader("Content-Transfer-Encoding", hi)) { cte = hi.getValue(); } LOGDEB2(("walkmime: final: body start offset %d, length %d\n", - doc.getBodyStartOffset(), doc.getBodyLength())); + doc->getBodyStartOffset(), doc->getBodyLength())); string body; - doc.getBody(body, 0, doc.bodylength); + doc->getBody(body, 0, doc->bodylength); // Decode according to content transfer encoding if (!stringlowercmp("quoted-printable", cte)) { @@ -472,22 +362,30 @@ void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth) // Handle html stripping and transcoding to utf8 string utf8; + const string *putf8 = 0; if (!stringlowercmp("text/html", content_type.value)) { - MimeHandlerHtml mh; - Rcl::Doc hdoc; - mh.charsethint = charset; - mh.mkDoc(m_conf, "", body, content_type.value, hdoc); - utf8 = hdoc.text; + MimeHandlerHtml mh("text/html"); + mh.set_property(Dijon::Filter::OPERATING_MODE, + m_forPreview ? "view" : "index"); + mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset); + mh.set_document_string(body); + mh.next_document(); + map::const_iterator it = + mh.get_meta_data().find("content"); + if (it != mh.get_meta_data().end()) + putf8 = &it->second; } else { // Transcode to utf-8 if (!transcode(body, utf8, charset, "UTF-8")) { LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n", charset.c_str())); - utf8 = body; + putf8 = &body; + } else { + putf8 = &utf8; } } - - out += utf8; + if (putf8) + out += *putf8; if (out.length() && out[out.length()-1] != '\n') out += '\n'; diff --git a/src/internfile/mh_mail.h b/src/internfile/mh_mail.h index 32ab7680..94abb303 100644 --- a/src/internfile/mh_mail.h +++ b/src/internfile/mh_mail.h @@ -16,8 +16,9 @@ */ #ifndef _MAIL_H_INCLUDED_ #define _MAIL_H_INCLUDED_ -/* @(#$Id: mh_mail.h,v 1.8 2006-09-19 14:30:39 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_mail.h,v 1.9 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ +#include #include "mimehandler.h" namespace Binc { @@ -30,26 +31,21 @@ namespace Binc { * for maildir files). This has to keep state while parsing a mail folder * file. */ -class MimeHandlerMail : public MimeHandler { +class MimeHandlerMail : public RecollFilter { public: - MimeHandlerMail() : m_vfp(0), m_msgnum(0), m_conf(0) {} - - virtual MimeHandler::Status - mkDoc(RclConfig *conf, const std::string &fn, - const std::string &mtype, Rcl::Doc &docout, std::string& ipath); - + MimeHandlerMail(const string &mt) + : RecollFilter(mt), m_bincdoc(0), m_fd(-1), m_stream(0) + {} virtual ~MimeHandlerMail(); - + virtual bool set_document_file(const string &file_path); + virtual bool set_document_string(const string &data); + virtual bool next_document(); private: - void *m_vfp; // File pointer for folder - int m_msgnum; // Current message number in folder. Starts at 1 - RclConfig *m_conf; // Keep pointer to rclconfig around - - MimeHandler::Status processmbox(const string &fn, Rcl::Doc &docout, - string &ipath); - MimeHandler::Status processMsg(Rcl::Doc &docout, Binc::MimePart& doc, - int depth); - void walkmime(Rcl::Doc &docout, Binc::MimePart& doc, int depth); + Binc::MimeDocument *m_bincdoc; + bool processMsg(Binc::MimePart *doc, int depth); + void walkmime(Binc::MimePart* doc, int depth); + int m_fd; + std::stringstream *m_stream; }; #endif /* _MAIL_H_INCLUDED_ */ diff --git a/src/internfile/mh_mbox.cpp b/src/internfile/mh_mbox.cpp new file mode 100644 index 00000000..9607a3f6 --- /dev/null +++ b/src/internfile/mh_mbox.cpp @@ -0,0 +1,166 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.1 2006-12-15 12:40:24 dockes Exp $ (C) 2005 J.F.Dockes"; +#endif +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "mimehandler.h" +#include "debuglog.h" +#include "readfile.h" +#include "mh_mbox.h" +#include "smallut.h" + +using namespace std; + +MimeHandlerMbox::~MimeHandlerMbox() +{ + if (m_vfp) { + fclose((FILE *)m_vfp); + m_vfp = 0; + } +} + +bool MimeHandlerMbox::set_document_file(const string &fn) +{ + LOGDEB(("MimeHandlerMbox::set_document_file(%s)\n", fn.c_str())); + m_fn = fn; + if (m_vfp) { + fclose((FILE *)m_vfp); + m_vfp = 0; + } + + m_vfp = fopen(fn.c_str(), "r"); + if (m_vfp == 0) { + LOGERR(("MimeHandlerMail::set_document_file: error opening %s\n", + fn.c_str())); + return false; + } + m_havedoc = true; + return true; +} + +static const char *frompat = "^From .* [1-2][0-9][0-9][0-9][\r]*\n$"; +static regex_t fromregex; +static bool regcompiled; + +bool MimeHandlerMbox::next_document() +{ + if (m_vfp == 0) { + LOGERR(("MimeHandlerMbox::next_document: not open\n")); + return false; + } + if (!m_havedoc) { + return false; + } + FILE *fp = (FILE *)m_vfp; + int mtarg = 0; + if (m_ipath != "") { + sscanf(m_ipath.c_str(), "%d", &mtarg); + } else if (m_forPreview) { + // Can't preview an mbox + return false; + } + LOGDEB(("MimeHandlerMbox::next_document: fn %s, msgnum %d mtarg %d \n", + m_fn.c_str(), m_msgnum, mtarg)); + + if (!regcompiled) { + regcomp(&fromregex, frompat, REG_NOSUB); + regcompiled = true; + } + + // If we are called to retrieve a specific message, seek to bof + // (then scan up to the message). This is for the case where the + // same object is reused to fetch several messages (else the fp is + // just opened no need for a seek). We could also check if the + // current message number is lower than the requested one and + // avoid rereading the whole thing in this case. But I'm not sure + // we're ever used in this way (multiple retrieves on same + // object). So: + if (mtarg > 0) { + fseek(fp, 0, SEEK_SET); + m_msgnum = 0; + } + + off_t start, end; + bool iseof = false; + bool hademptyline = true; + string& msgtxt = m_metaData["content"]; + msgtxt.erase(); + do { + // Look for next 'From ' Line, start of message. Set start to + // line after this + char line[501]; + for (;;) { + if (!fgets(line, 500, fp)) { + // Eof hit while looking for 'From ' -> file done. We'd need + // another return code here + return false; + } + if (line[0] == '\n' || line[0] == '\r') { + hademptyline = true; + continue; + } + if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { + start = ftello(fp); + m_msgnum++; + break; + } + hademptyline = false; + } + + // Look for next 'From ' line or eof, end of message. + for (;;) { + end = ftello(fp); + if (!fgets(line, 500, fp)) { + if (ferror(fp) || feof(fp)) + iseof = true; + break; + } + if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { + break; + } + if (mtarg <= 0 || m_msgnum == mtarg) { + msgtxt += line; + } + if (line[0] == '\n' || line[0] == '\r') { + hademptyline = true; + } else { + hademptyline = false; + } + } + fseek(fp, end, SEEK_SET); + } while (mtarg > 0 && m_msgnum < mtarg); + + LOGDEB2(("Message text: [%s]\n", msgtxt.c_str())); + char buf[20]; + sprintf(buf, "%d", m_msgnum); + m_metaData["ipath"] = buf; + m_metaData["mimetype"] = "message/rfc822"; + if (iseof) + m_havedoc = false; + return msgtxt.empty() ? false : true; +} diff --git a/src/internfile/mh_mbox.h b/src/internfile/mh_mbox.h new file mode 100644 index 00000000..392dbd1e --- /dev/null +++ b/src/internfile/mh_mbox.h @@ -0,0 +1,51 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#ifndef _MBOX_H_INCLUDED_ +#define _MBOX_H_INCLUDED_ +/* @(#$Id: mh_mbox.h,v 1.1 2006-12-15 12:40:24 dockes Exp $ (C) 2004 J.F.Dockes */ + +#include +using std::string; + +#include "mimehandler.h" + +/** + * Translate a mail folder file into internal documents (also works + * for maildir files). This has to keep state while parsing a mail folder + * file. + */ +class MimeHandlerMbox : public RecollFilter { + public: + MimeHandlerMbox(const string& mime) + : RecollFilter(mime), m_vfp(0), m_msgnum(0) + {} + virtual ~MimeHandlerMbox(); + virtual bool set_document_file(const string &file_path); + virtual bool next_document(); + virtual bool skip_to_document(const string& ipath) { + m_ipath = ipath; + return true; + } + + private: + string m_fn; // File name + void *m_vfp; // File pointer for folder + int m_msgnum; // Current message number in folder. Starts at 1 + string m_ipath; +}; + +#endif /* _MBOX_H_INCLUDED_ */ diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp index c45c8680..5a76981f 100644 --- a/src/internfile/mh_text.cpp +++ b/src/internfile/mh_text.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_text.cpp,v 1.5 2006-03-20 15:14:08 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_text.cpp,v 1.6 2006-12-15 12:40:02 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -31,34 +31,44 @@ using namespace std; #include "transcode.h" // Process a plain text file -MimeHandler::Status MimeHandlerText::mkDoc(RclConfig *conf, const string &fn, - const string &mtype, Rcl::Doc &docout, string&) +bool MimeHandlerText::set_document_file(const string &fn) { string otext; if (!file_to_string(fn, otext)) - return MimeHandler::MHError; - - // Try to guess charset, then convert to utf-8, and fill document - // fields The charset guesser really doesnt work well in general - // and should be avoided (especially for short documents) - string charset; - if (conf->getGuessCharset()) { - charset = csguess(otext, conf->getDefCharset()); - } else - charset = conf->getDefCharset(); + return false; + return set_document_string(otext); +} + +bool MimeHandlerText::set_document_string(const string& otext) +{ + m_text = otext; + m_havedoc = true; + return true; +} +bool MimeHandlerText::next_document() +{ + if (m_havedoc == false) + return false; + m_havedoc = false; LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n", - charset.c_str())); + m_defcharset.c_str())); - string utf8; - if (!transcode(otext, utf8, charset, "UTF-8")) { + // Avoid unneeded copy. This gets a reference to an empty string which is + // the entry for "content" + string& utf8 = m_metaData["content"]; + + // Note that we transcode always even if defcharset is already utf-8: + // this validates the encoding. + if (!transcode(m_text, utf8, m_defcharset, "UTF-8")) { LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed " - "for charset [%s]\n", charset.c_str())); - otext.erase(); - return MimeHandler::MHError; + "for charset [%s]\n", m_defcharset.c_str())); + utf8.erase(); + return false; } - docout.origcharset = charset; - docout.text = utf8; - return MimeHandler::MHDone; + m_metaData["origcharset"] = m_defcharset; + m_metaData["charset"] = "utf-8"; + m_metaData["mimetype"] = "text/plain"; + return true; } diff --git a/src/internfile/mh_text.h b/src/internfile/mh_text.h index a04422ab..baa78c12 100644 --- a/src/internfile/mh_text.h +++ b/src/internfile/mh_text.h @@ -16,12 +16,11 @@ */ #ifndef _MH_TEXT_H_INCLUDED_ #define _MH_TEXT_H_INCLUDED_ -/* @(#$Id: mh_text.h,v 1.2 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_text.h,v 1.3 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ #include +using std::string; -#include "rclconfig.h" -#include "rcldb.h" #include "mimehandler.h" /** @@ -29,12 +28,15 @@ * * Maybe try to guess charset, or use default, then transcode to utf8 */ -class MimeHandlerText : public MimeHandler { +class MimeHandlerText : public RecollFilter { public: - MimeHandler::Status mkDoc(RclConfig *conf, const std::string &fn, - const std::string &mtype, Rcl::Doc &docout, - std::string&); - + MimeHandlerText(const string& mt) : RecollFilter(mt) {} + virtual ~MimeHandlerText() {} + virtual bool set_document_file(const string &file_path); + virtual bool set_document_string(const string&); + virtual bool next_document(); +private: + string m_text; }; #endif /* _MH_TEXT_H_INCLUDED_ */ diff --git a/src/internfile/mh_unknown.h b/src/internfile/mh_unknown.h index c4469c5e..92c33576 100644 --- a/src/internfile/mh_unknown.h +++ b/src/internfile/mh_unknown.h @@ -16,24 +16,33 @@ */ #ifndef _MH_UNKNOWN_H_INCLUDED_ #define _MH_UNKNOWN_H_INCLUDED_ -/* @(#$Id: mh_unknown.h,v 1.1 2006-03-28 09:36:53 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_unknown.h,v 1.2 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ #include -#include "rclconfig.h" -#include "rcldb.h" #include "mimehandler.h" /** * Handler for files with no content handler: does nothing. * */ -class MimeHandlerUnknown : public MimeHandler { +class MimeHandlerUnknown : public RecollFilter { public: - MimeHandler::Status mkDoc(RclConfig *conf, const std::string &fn, - const std::string &mtype, Rcl::Doc &docout, - std::string&) { - return MimeHandler::MHDone; + MimeHandlerUnknown(const string& mt) : RecollFilter(mt) {} + virtual ~MimeHandlerUnknown() {} + virtual bool set_document_string(const string&) { + return m_havedoc = true; + } + virtual bool set_document_file(const string&) { + return m_havedoc = true; + } + virtual bool next_document() { + if (m_havedoc == false) + return false; + m_havedoc = false; + m_metaData["content"] = ""; + m_metaData["mimetype"] = "text/plain"; + return true; } }; diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index 5812eda1..e3ae8e71 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.19 2006-12-13 09:13:18 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.20 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -20,37 +20,40 @@ static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.19 2006-12-13 09:13:18 dockes #include #include -#ifndef NO_NAMESPACES + using namespace std; -#endif /* NO_NAMESPACES */ #include "mimehandler.h" #include "debuglog.h" +#include "rclconfig.h" #include "smallut.h" + +#include "mh_exec.h" #include "mh_html.h" #include "mh_mail.h" +#include "mh_mbox.h" #include "mh_text.h" -#include "mh_exec.h" #include "mh_unknown.h" /** Create internal handler object appropriate for given mime type */ -static MimeHandler *mhFactory(const string &mime) +static Dijon::Filter *mhFactory(const string &mime) { if (!stringlowercmp("text/plain", mime)) - return new MimeHandlerText; + return new MimeHandlerText("text/plain"); else if (!stringlowercmp("text/html", mime)) - return new MimeHandlerHtml; + return new MimeHandlerHtml("text/html"); else if (!stringlowercmp("text/x-mail", mime)) - return new MimeHandlerMail; + return new MimeHandlerMbox("text/x-mail"); else if (!stringlowercmp("message/rfc822", mime)) - return new MimeHandlerMail; - return 0; + return new MimeHandlerMail("message/rfc822"); + else + return new MimeHandlerUnknown("application/octet-stream"); } /** * Return handler object for given mime type: */ -MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg) +Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg) { // Get handler definition for mime type string hs; @@ -78,7 +81,7 @@ MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg) mtype.c_str(), hs.c_str())); return 0; } - MimeHandlerExec *h = new MimeHandlerExec; + MimeHandlerExec *h = new MimeHandlerExec(mtype.c_str()); it++; h->params.push_back(cfg->findFilter(*it++)); h->params.insert(h->params.end(), it, toks.end()); @@ -93,7 +96,8 @@ MimeHandler *getMimeHandler(const string &mtype, RclConfig *cfg) bool indexunknown = false; cfg->getConfParam("indexallfilenames", &indexunknown); if (indexunknown) { - return new MimeHandlerUnknown; + LOGDEB(("getMimeHandler: returning MimeHandlerUnknown\n")); + return new MimeHandlerUnknown("application/octet-stream"); } else { return 0; } diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index 032cd9f1..340d5a21 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -16,60 +16,74 @@ */ #ifndef _MIMEHANDLER_H_INCLUDED_ #define _MIMEHANDLER_H_INCLUDED_ -/* @(#$Id: mimehandler.h,v 1.12 2006-03-29 13:08:08 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mimehandler.h,v 1.13 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include +using std::string; +using std::list; -#include "rclconfig.h" -#include "rcldb.h" +#include +class RclConfig; -/** - * Document interner class. - */ -class MimeHandler { - public: - MimeHandler() : m_forPreview(false) {} - virtual ~MimeHandler() {} +class RecollFilter : public Dijon::Filter { +public: + RecollFilter(const string& mtype) + : Dijon::Filter(mtype), m_forPreview(false), m_havedoc(false) + {} + virtual ~RecollFilter() {} + virtual bool set_property(Properties p, const string &v) { + switch (p) { + case DEFAULT_CHARSET: + m_defcharset = v; + break; + case OPERATING_MODE: + if (!v.empty() && v[0] == 'v') + m_forPreview = true; + else + m_forPreview = false; + break; + } + return true; + } - /// Status from mkDoc method. - enum Status {MHError, MHDone, MHAgain}; - /** - * Transform external data into internal utf8 document - * - * @param conf the global configuration - * @param filename File from which the data comes from - * @param mimetype its mime type (from the mimemap configuration file) - * @param outdoc The output document - * @param ipath the access path for the document inside the file. - * For mono-document file types, this will always be empty. - * It is used, for example for mbox files which may contain - * multiple emails. If this is not empty in input, then the - * caller is requesting a single document (ie: for display). - * If this is empty (during indexation), it will be filled-up - * by the function, and all the file's documents will be - * returned by successive calls. - * @return The return value indicates if there are more documents to be - * fetched from the same file. - */ - virtual MimeHandler::Status mkDoc(RclConfig * conf, - const std::string &filename, - const std::string &mimetype, - Rcl::Doc& outdoc, - string& ipath) = 0; + // We don't use this for now + virtual bool set_document_uri(const std::string &) {return false;} - virtual void setForPreview(bool onoff) {m_forPreview = onoff;}; + // Default implementations + virtual bool set_document_string(const std::string &) {return false;} + virtual bool set_document_data(const char *cp, unsigned int sz) { + return set_document_string(string(cp, sz)); + } - protected: - bool m_forPreview; + virtual bool has_documents() const {return m_havedoc;} + + // Most doc types are single-doc + virtual bool skip_to_document(const string& s) { + if (s.empty()) + return true; + return false; + } + + virtual DataInput get_required_data_input() const + {return DOCUMENT_FILE_NAME;} + virtual string get_error() const { + return m_reason; + } + +protected: + bool m_forPreview; + string m_defcharset; + string m_reason; + bool m_havedoc; }; /** * Return indexing handler object for the given mime type * returned pointer should be deleted by caller */ -extern MimeHandler *getMimeHandler(const std::string &mtyp, RclConfig *cfg); +extern Dijon::Filter *getMimeHandler(const std::string &mtyp, RclConfig *cfg); /// Can this mime type be interned ? extern bool canIntern(const std::string mimetype, RclConfig *cfg); diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h index 3c855d68..ec4b8c94 100644 --- a/src/internfile/myhtmlparse.h +++ b/src/internfile/myhtmlparse.h @@ -37,11 +37,13 @@ class MyHtmlParser : public HtmlParser { bool in_body_tag; bool in_pre_tag; bool pending_space; - string title, sample, keywords, dump, dmtime; + bool indexing_allowed; + string title, sample, keywords, dmtime; + string localdump; + string &dump; string ocharset; // This is the charset our user thinks the doc was string charset; // This is the charset it was supposedly converted to string doccharset; // Set this to value of charset parameter in header - bool indexing_allowed; void process_text(const string &text); void opening_tag(const string &tag, const map &p); void closing_tag(const string &tag); @@ -52,5 +54,16 @@ class MyHtmlParser : public HtmlParser { in_body_tag(false), in_pre_tag(false), pending_space(false), - indexing_allowed(true) { } + indexing_allowed(true), + dump(localdump) + { } + MyHtmlParser(string& buf) : + in_script_tag(false), + in_style_tag(false), + in_body_tag(false), + in_pre_tag(false), + pending_space(false), + indexing_allowed(true), + dump(buf) + { } }; diff --git a/src/lib/Makefile b/src/lib/Makefile index 91691a31..0ff8cf83 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -8,8 +8,8 @@ LIBS = librcl.a all: $(LIBS) -OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_text.o mimehandler.o myhtmlparse.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o -DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp +OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o +DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp librcl.a : $(DEPS) $(OBJS) unac.o ar ru librcl.a $(OBJS) unac.o @@ -35,6 +35,10 @@ mimetype.o : ../index/mimetype.cpp $(CXX) $(ALL_CXXFLAGS) -c ../index/mimetype.cpp htmlparse.o : ../internfile/htmlparse.cpp $(CXX) $(ALL_CXXFLAGS) -c ../internfile/htmlparse.cpp +myhtmlparse.o : ../internfile/myhtmlparse.cpp + $(CXX) $(ALL_CXXFLAGS) -c ../internfile/myhtmlparse.cpp +mimehandler.o : ../internfile/mimehandler.cpp + $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mimehandler.cpp internfile.o : ../internfile/internfile.cpp $(CXX) $(ALL_CXXFLAGS) -c ../internfile/internfile.cpp mh_exec.o : ../internfile/mh_exec.cpp @@ -43,12 +47,10 @@ mh_html.o : ../internfile/mh_html.cpp $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_html.cpp mh_mail.o : ../internfile/mh_mail.cpp $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mail.cpp +mh_mbox.o : ../internfile/mh_mbox.cpp + $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mbox.cpp mh_text.o : ../internfile/mh_text.cpp $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_text.cpp -mimehandler.o : ../internfile/mimehandler.cpp - $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mimehandler.cpp -myhtmlparse.o : ../internfile/myhtmlparse.cpp - $(CXX) $(ALL_CXXFLAGS) -c ../internfile/myhtmlparse.cpp docseq.o : ../query/docseq.cpp $(CXX) $(ALL_CXXFLAGS) -c ../query/docseq.cpp history.o : ../query/history.cpp @@ -124,6 +126,12 @@ mimetype.dep.stamp : ../index/mimetype.cpp htmlparse.dep.stamp : ../internfile/htmlparse.cpp $(CXX) -M $(ALL_CXXFLAGS) ../internfile/htmlparse.cpp > htmlparse.dep touch htmlparse.dep.stamp +myhtmlparse.dep.stamp : ../internfile/myhtmlparse.cpp + $(CXX) -M $(ALL_CXXFLAGS) ../internfile/myhtmlparse.cpp > myhtmlparse.dep + touch myhtmlparse.dep.stamp +mimehandler.dep.stamp : ../internfile/mimehandler.cpp + $(CXX) -M $(ALL_CXXFLAGS) ../internfile/mimehandler.cpp > mimehandler.dep + touch mimehandler.dep.stamp internfile.dep.stamp : ../internfile/internfile.cpp $(CXX) -M $(ALL_CXXFLAGS) ../internfile/internfile.cpp > internfile.dep touch internfile.dep.stamp @@ -136,15 +144,12 @@ mh_html.dep.stamp : ../internfile/mh_html.cpp mh_mail.dep.stamp : ../internfile/mh_mail.cpp $(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_mail.cpp > mh_mail.dep touch mh_mail.dep.stamp +mh_mbox.dep.stamp : ../internfile/mh_mbox.cpp + $(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_mbox.cpp > mh_mbox.dep + touch mh_mbox.dep.stamp mh_text.dep.stamp : ../internfile/mh_text.cpp $(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_text.cpp > mh_text.dep touch mh_text.dep.stamp -mimehandler.dep.stamp : ../internfile/mimehandler.cpp - $(CXX) -M $(ALL_CXXFLAGS) ../internfile/mimehandler.cpp > mimehandler.dep - touch mimehandler.dep.stamp -myhtmlparse.dep.stamp : ../internfile/myhtmlparse.cpp - $(CXX) -M $(ALL_CXXFLAGS) ../internfile/myhtmlparse.cpp > myhtmlparse.dep - touch myhtmlparse.dep.stamp docseq.dep.stamp : ../query/docseq.cpp $(CXX) -M $(ALL_CXXFLAGS) ../query/docseq.cpp > docseq.dep touch docseq.dep.stamp @@ -217,13 +222,14 @@ include csguess.dep include indexer.dep include mimetype.dep include htmlparse.dep +include myhtmlparse.dep +include mimehandler.dep include internfile.dep include mh_exec.dep include mh_html.dep include mh_mail.dep +include mh_mbox.dep include mh_text.dep -include mimehandler.dep -include myhtmlparse.dep include docseq.dep include history.dep include sortseq.dep diff --git a/src/lib/mkMake b/src/lib/mkMake index 2cd96407..a117d0a6 100755 --- a/src/lib/mkMake +++ b/src/lib/mkMake @@ -13,13 +13,14 @@ ${depth}/index/csguess.cpp \ ${depth}/index/indexer.cpp \ ${depth}/index/mimetype.cpp \ ${depth}/internfile/htmlparse.cpp \ +${depth}/internfile/myhtmlparse.cpp \ +${depth}/internfile/mimehandler.cpp \ ${depth}/internfile/internfile.cpp \ ${depth}/internfile/mh_exec.cpp \ ${depth}/internfile/mh_html.cpp \ ${depth}/internfile/mh_mail.cpp \ +${depth}/internfile/mh_mbox.cpp \ ${depth}/internfile/mh_text.cpp \ -${depth}/internfile/mimehandler.cpp \ -${depth}/internfile/myhtmlparse.cpp \ ${depth}/query/docseq.cpp \ ${depth}/query/history.cpp \ ${depth}/query/sortseq.cpp \ diff --git a/src/utils/smallut.cpp b/src/utils/smallut.cpp index e6067150..ed19f89b 100644 --- a/src/utils/smallut.cpp +++ b/src/utils/smallut.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: smallut.cpp,v 1.22 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: smallut.cpp,v 1.23 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -260,13 +260,14 @@ bool stringToStrings(const string &s, std::list &tokens) } void stringToTokens(const string& str, list& tokens, - const string& delims) + const string& delims, bool skipinit) { - string::size_type startPos, pos; + string::size_type startPos = 0, pos; for (pos = 0;;) { // Skip initial delims, break if this eats all. - if ((startPos = str.find_first_not_of(delims, pos)) == string::npos) + if (skipinit && + (startPos = str.find_first_not_of(delims, pos)) == string::npos) break; // Find next delimiter or end of string (end of token) pos = str.find_first_of(delims, startPos); diff --git a/src/utils/smallut.h b/src/utils/smallut.h index f71a7e45..3636d0b1 100644 --- a/src/utils/smallut.h +++ b/src/utils/smallut.h @@ -16,7 +16,7 @@ */ #ifndef _SMALLUT_H_INCLUDED_ #define _SMALLUT_H_INCLUDED_ -/* @(#$Id: smallut.h,v 1.22 2006-12-14 13:53:43 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: smallut.h,v 1.23 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include #include @@ -51,7 +51,7 @@ extern bool stringToStrings(const string &s, list &tokens); * Split input string. No handling of quoting */ extern void stringToTokens(const string &s, list &tokens, - const string &delims = " \t"); + const string &delims = " \t", bool skipinit=true); /** Convert string to boolean */ extern bool stringToBool(const string &s);