diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp new file mode 100644 index 00000000..c068128d --- /dev/null +++ b/src/internfile/mh_exec.cpp @@ -0,0 +1,45 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.1 2005-11-18 13:23:46 dockes Exp $ (C) 2005 J.F.Dockes"; +#endif + +#include "execmd.h" +#include "mh_exec.h" +#include "mh_html.h" +#include "debuglog.h" + +using namespace std; + +// Run the external filter command held in params on file fn; the command output is supposedly html. +// That html is then passed to MimeHandlerHtml::mkDoc to fill docout. Returns MHError if params is empty or the command status is non-zero +MimeHandler::Status +MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn, + const string &mtype, Rcl::Doc &docout, string&) +{ + if (params.empty()) { + // No filter command configured: nothing to execute + LOGERR(("MimeHandlerExec::mkDoc: empty params for mime %s\n", + mtype.c_str())); + return MimeHandler::MHError; + } + // Command name: first element of params, resolved through find_filter + string cmd = find_filter(conf, params.front()); + + // Build argument list: the params after the command name, then the file name + list<string>::iterator it = params.begin(); + list<string> myparams(++it, params.end()); + myparams.push_back(fn); + + // Execute command and store the result text, which is supposedly html + string html; + ExecCmd exec; + int status = exec.doexec(cmd, myparams, 0, &html); + if (status) { + LOGERR(("MimeHandlerExec: command status 0x%x: %s\n", + status, cmd.c_str())); + return MimeHandler::MHError; + } + + // Process/index the html + MimeHandlerHtml hh; + return hh.mkDoc(conf, fn, html, mtype, docout); +} diff --git a/src/internfile/mh_exec.h b/src/internfile/mh_exec.h new file mode 100644 index 00000000..de1a5f72 --- /dev/null +++ b/src/internfile/mh_exec.h @@ -0,0 +1,26 @@ +#ifndef _MH_EXEC_H_INCLUDED_ +#define _MH_EXEC_H_INCLUDED_ +/* @(#$Id: mh_exec.h,v 1.1 2005-11-18 13:23:46 dockes Exp $ (C) 2004 J.F.Dockes */ + +#include <list> +#include <string> + +#include "rclconfig.h" +#include "rcldb.h" +#include "mimehandler.h" + +/** + Turn external document into internal one by executing an external filter. 
+ The command to execute, and its parameters, come from the mimeconf file +*/ +class MimeHandlerExec : public MimeHandler { + public: + std::list params; + virtual ~MimeHandlerExec() {} + virtual MimeHandler::Status + mkDoc(RclConfig *conf, const std::string &fn, + const std::string &mtype, Rcl::Doc &docout, std::string&); + +}; + +#endif /* _MH_EXEC_H_INCLUDED_ */ diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index 119055ea..2cb94195 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -32,7 +32,7 @@ #include "mimeparse.h" #include "myhtmlparse.h" #include "indextext.h" -#include "html.h" +#include "mh_html.h" #include using namespace std; diff --git a/src/internfile/mh_html.h b/src/internfile/mh_html.h index 704a8d2b..0b69a82f 100644 --- a/src/internfile/mh_html.h +++ b/src/internfile/mh_html.h @@ -1,25 +1,33 @@ #ifndef _HTML_H_INCLUDED_ #define _HTML_H_INCLUDED_ -/* @(#$Id: mh_html.h,v 1.5 2005-11-08 21:02:55 dockes Exp $ (C) 2004 J.F.Dockes */ -#include "mimehandler.h" +/* @(#$Id: mh_html.h,v 1.6 2005-11-18 13:23:46 dockes Exp $ (C) 2004 J.F.Dockes */ + #include -/// Translate html document to an internal one. -/// -/// There are 2 interfaces, depending if we're working on a file, or -/// on a string. The string form is applied to the output of external -/// handlers for foreign formats: they return a result in html, which -/// has the advantage to be text (easy to use in shell-scripts), and -/// semi-structured (can carry titles, abstracts, whatever) +#include "mimehandler.h" + +/** + Translate html document to internal one. + + There are 2 interfaces, depending if we're working on a file, or + on a string. 
The string form is applied to the output of external + handlers for foreign formats: they return a result in html, which + has the advantage of being text (easy to use in shell-scripts), and + semi-structured (can carry titles, abstracts, whatever) +*/ class MimeHandlerHtml : public MimeHandler { public: std::string charsethint; - /// Create internal document from html file (standard interface) - virtual MimeHandler::Status mkDoc(RclConfig *conf, const string &fn, - const string &mtype, Rcl::Doc &docout, string&); - /// Create internal doc from html string (postfilter for external ones) - virtual MimeHandler::Status mkDoc(RclConfig *conf, const string &fn, - const string& htext, - const string &mtype, Rcl::Doc &docout); + + /** Create internal document from html file (standard MimeHandler interface) */ + virtual MimeHandler::Status + mkDoc(RclConfig *conf, const std::string &fn, + const std::string &mtype, Rcl::Doc &docout, std::string&); + + /** Create internal doc from html text passed in htext (postfilter for external handlers) */ + virtual MimeHandler::Status + mkDoc(RclConfig *conf, const std::string &fn, const std::string& htext, + const std::string &mtype, Rcl::Doc &docout); }; + #endif /* _HTML_H_INCLUDED_ */ diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index 6aae9d3e..fda908dc 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.9 2005-11-08 21:02:55 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.10 2005-11-18 13:23:46 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include @@ -19,11 +19,11 @@ using std::map; #include "transcode.h" #include "mimeparse.h" #include "indextext.h" -#include "mail.h" +#include "mh_mail.h" #include "debuglog.h" #include "smallut.h" #include "mimeparse.h" -#include "html.h" +#include "mh_html.h" // binc imap mime definitions #include "mime.h" diff --git a/src/internfile/mh_mail.h b/src/internfile/mh_mail.h index 
1aaf795c..e03276c5 100644 --- a/src/internfile/mh_mail.h +++ b/src/internfile/mh_mail.h @@ -1,14 +1,29 @@ #ifndef _MAIL_H_INCLUDED_ #define _MAIL_H_INCLUDED_ -/* @(#$Id: mh_mail.h,v 1.3 2005-11-08 21:02:55 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_mail.h,v 1.4 2005-11-18 13:23:46 dockes Exp $ (C) 2004 J.F.Dockes */ + #include "mimehandler.h" + namespace Binc { class MimeDocument; } -/// Translate a mail folder file into internal documents (also works -/// for maildir files) +/** + Translate a mail folder file into internal documents (also works + for maildir files). This has to keep state while parsing a mail folder + file. +*/ class MimeHandlerMail : public MimeHandler { + public: + MimeHandlerMail() : vfp(0), msgnum(0), conf(0) {} + + virtual MimeHandler::Status + mkDoc(RclConfig *conf, const std::string &fn, + const std::string &mtype, Rcl::Doc &docout, std::string& ipath); + + virtual ~MimeHandlerMail(); + + private: void *vfp; int msgnum; RclConfig *conf; @@ -16,12 +31,6 @@ class MimeHandlerMail : public MimeHandler { Rcl::Doc &docout); MimeHandler::Status processmbox(const string &fn, Rcl::Doc &docout, string &ipath); - public: - MimeHandlerMail() : vfp(0), msgnum(0), conf(0) {} - virtual ~MimeHandlerMail(); - virtual MimeHandler::Status - mkDoc(RclConfig *conf, const string &fn, - const string &mtype, Rcl::Doc &docout, string& ipath); }; #endif /* _MAIL_H_INCLUDED_ */ diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp new file mode 100644 index 00000000..b172acbd --- /dev/null +++ b/src/internfile/mh_text.cpp @@ -0,0 +1,46 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: mh_text.cpp,v 1.1 2005-11-18 13:23:46 dockes Exp $ (C) 2005 J.F.Dockes"; +#endif + +#include <iostream> +#include <string> +using namespace std; + +#include "mh_text.h" +#include "csguess.h" +#include "debuglog.h" +#include "readfile.h" +#include "transcode.h" + +// Process a plain text file: read fn, transcode its content to UTF-8 and store the result in docout. Returns MHError if the read or the transcoding fails +MimeHandler::Status MimeHandlerText::mkDoc(RclConfig *conf, const string &fn, + const string 
&mtype, Rcl::Doc &docout, string&) +{ + string otext; + if (!file_to_string(fn, otext)) + return MimeHandler::MHError; + + // Choose the source charset (guessed only if the configuration asks for it), + // then convert to utf-8 and fill the document fields. The charset guesser really doesn't work + // well in general and should be avoided (especially for short documents) + string charset; + if (conf->getGuessCharset()) { + charset = csguess(otext, conf->getDefCharset()); + } else + charset = conf->getDefCharset(); + string utf8; + LOGDEB1(("textPlainToDoc: transcod from %s to %s\n", charset.c_str(), "UTF-8")); // %s needs a C string, not a std::string object + + if (!transcode(otext, utf8, charset, "UTF-8")) { + cerr << "textPlainToDoc: transcode failed: charset '" << charset + << "' to UTF-8: "<< utf8 << endl; + otext.erase(); + return MimeHandler::MHError; + } + + Rcl::Doc out; + out.origcharset = charset; + out.text = utf8; + docout = out; + return MimeHandler::MHDone; +} diff --git a/src/internfile/mh_text.h b/src/internfile/mh_text.h new file mode 100644 index 00000000..695fc630 --- /dev/null +++ b/src/internfile/mh_text.h @@ -0,0 +1,24 @@ +#ifndef _MH_TEXT_H_INCLUDED_ +#define _MH_TEXT_H_INCLUDED_ +/* @(#$Id: mh_text.h,v 1.1 2005-11-18 13:23:46 dockes Exp $ (C) 2004 J.F.Dockes */ + +#include <string> + +#include "rclconfig.h" +#include "rcldb.h" +#include "mimehandler.h" + +/** + * Handler for text/plain files. 
+ * + * Guesses the charset if the configuration asks for it, else uses the configured default, then transcodes the text to UTF-8 + */ +class MimeHandlerText : public MimeHandler { + public: + MimeHandler::Status mkDoc(RclConfig *conf, const std::string &fn, + const std::string &mtype, Rcl::Doc &docout, + std::string&); + +}; + +#endif /* _MH_TEXT_H_INCLUDED_ */ diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index 023c22cc..bb928f83 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.11 2005-11-16 15:07:20 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.12 2005-11-18 13:23:46 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -7,103 +7,15 @@ static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.11 2005-11-16 15:07:20 dockes using namespace std; #include "mimehandler.h" -#include "readfile.h" -#include "csguess.h" -#include "transcode.h" #include "debuglog.h" #include "smallut.h" -#include "html.h" -#include "mail.h" -#include "execmd.h" -#include "pathut.h" - -class MimeHandlerText : public MimeHandler { - public: - MimeHandler::Status mkDoc(RclConfig *conf, const string &fn, - const string &mtype, Rcl::Doc &docout, string&); - -}; - -// Process a plain text file -MimeHandler::Status MimeHandlerText::mkDoc(RclConfig *conf, const string &fn, - const string &mtype, Rcl::Doc &docout, string&) -{ - string otext; - if (!file_to_string(fn, otext)) - return MimeHandler::MHError; - - // Try to guess charset, then convert to utf-8, and fill document - // fields The charset guesser really doesnt work well in general - // and should be avoided (especially for short documents) - string charset; - if (conf->getGuessCharset()) { - charset = csguess(otext, conf->getDefCharset()); - } else - charset = conf->getDefCharset(); - string utf8; - LOGDEB1(("textPlainToDoc: transcod from %s to %s\n", charset, "UTF-8")); - - if (!transcode(otext, utf8, charset, 
"UTF-8")) { - cerr << "textPlainToDoc: transcode failed: charset '" << charset - << "' to UTF-8: "<< utf8 << endl; - otext.erase(); - return MimeHandler::MHError; - } - - Rcl::Doc out; - out.origcharset = charset; - out.text = utf8; - docout = out; - return MimeHandler::MHDone; -} - -class MimeHandlerExec : public MimeHandler { - public: - list params; - virtual ~MimeHandlerExec() {} - virtual MimeHandler::Status mkDoc(RclConfig *conf, const string &fn, - const string &mtype, Rcl::Doc &docout, - string&); - -}; - - -// Execute an external program to translate a file from its native format -// to html. Then call the html parser to do the actual indexing -MimeHandler::Status -MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn, - const string &mtype, Rcl::Doc &docout, string&) -{ - if (params.empty()) { - // Hu ho - LOGERR(("MimeHandlerExec::mkDoc: empty params for mime %s\n", - mtype.c_str())); - return MimeHandler::MHError; - } - // Command name - string cmd = find_filter(conf, params.front()); - - // Build parameter list: delete cmd name and add the file name - list::iterator it = params.begin(); - listmyparams(++it, params.end()); - myparams.push_back(fn); - - // Execute command and store the result text, which is supposedly html - string html; - ExecCmd exec; - int status = exec.doexec(cmd, myparams, 0, &html); - if (status) { - LOGERR(("MimeHandlerExec: command status 0x%x: %s\n", - status, cmd.c_str())); - return MimeHandler::MHError; - } - - // Process/index the html - MimeHandlerHtml hh; - return hh.mkDoc(conf, fn, html, mtype, docout); -} - -static MimeHandler *mhfact(const string &mime) +#include "mh_html.h" +#include "mh_mail.h" +#include "mh_text.h" +#include "mh_exec.h" + +/** Create internal handler object appropriate for given mime type */ +static MimeHandler *mhFactory(const string &mime) { if (!stringlowercmp("text/plain", mime)) return new MimeHandlerText; @@ -117,9 +29,9 @@ static MimeHandler *mhfact(const string &mime) } /** - * Return 
handler function for given mime type + * Return handler object for given mime type: */ -MimeHandler *getMimeHandler(const std::string &mtype, ConfTree *mhandlers) +MimeHandler *getMimeHandler(const string &mtype, ConfTree *mhandlers) { // Return handler definition for mime type string hs; @@ -138,7 +50,7 @@ MimeHandler *getMimeHandler(const std::string &mtype, ConfTree *mhandlers) // Retrieve handler function according to type if (!stringlowercmp("internal", toks.front())) { - return mhfact(mtype); + return mhFactory(mtype); } else if (!stringlowercmp("dll", toks.front())) { return 0; } else if (!stringlowercmp("exec", toks.front())) { @@ -160,7 +72,7 @@ MimeHandler *getMimeHandler(const std::string &mtype, ConfTree *mhandlers) /** * Return external viewer exec string for given mime type */ -string getMimeViewer(const std::string &mtype, ConfTree *mhandlers) +string getMimeViewer(const string &mtype, ConfTree *mhandlers) { string hs; mhandlers->get(mtype, hs, "view"); @@ -170,7 +82,7 @@ string getMimeViewer(const std::string &mtype, ConfTree *mhandlers) /** * Return icon name */ -string getMimeIconName(const std::string &mtype, ConfTree *mhandlers) +string getMimeIconName(const string &mtype, ConfTree *mhandlers) { string hs; mhandlers->get(mtype, hs, "icons"); @@ -180,7 +92,7 @@ string getMimeIconName(const std::string &mtype, ConfTree *mhandlers) /** * Return decompression command line for given mime type */ -bool getUncompressor(const std::string &mtype, ConfTree *mhandlers, +bool getUncompressor(const string &mtype, ConfTree *mhandlers, list& cmd) { string hs; diff --git a/src/lib/Makefile b/src/lib/Makefile index 85da0170..f3b92b35 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -7,17 +7,17 @@ all: depend $(LIBS) OBJS = base64.o conftree.o csguess.o debuglog.o \ execmd.o wipedir.o \ - fstreewalk.o html.o mail.o htmlparse.o idfile.o indexer.o \ - internfile.o md5.o \ + fstreewalk.o mh_html.o mh_mail.o mh_exec.o mh_text.o htmlparse.o \ + idfile.o 
indexer.o internfile.o md5.o \ mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o \ rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \ textsplit.o transcode.o \ unacpp.o unac.o SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \ ../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \ - ../utils/wipedir.cpp \ - ../utils/fstreewalk.cpp ../common/html.cpp ../common/mail.cpp \ - ../common/htmlparse.cpp \ + ../utils/wipedir.cpp ../utils/fstreewalk.cpp \ + ../common/mh_html.cpp ../common/mh_mail.cpp ../common/mh_exec.cpp \ + ../common/mh_text.cpp ../common/htmlparse.cpp \ ../index/indexer.cpp ../common/internfile.cpp \ ../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \ ../common/myhtmlparse.cpp ../common/pathhash.cpp ../utils/pathut.cpp \ @@ -46,7 +46,11 @@ wipedir.o : ../utils/wipedir.cpp $(CXX) $(CXXFLAGS) -c $< fstreewalk.o : ../utils/fstreewalk.cpp $(CXX) $(CXXFLAGS) -c $< -html.o : ../common/html.cpp +mh_exec.o : ../common/mh_exec.cpp + $(CXX) $(CXXFLAGS) -c $< +mh_text.o : ../common/mh_text.cpp + $(CXX) $(CXXFLAGS) -c $< +mh_html.o : ../common/mh_html.cpp $(CXX) $(CXXFLAGS) -c $< htmlparse.o : ../common/htmlparse.cpp $(CXX) $(CXXFLAGS) -c $< @@ -56,7 +60,7 @@ indexer.o : ../index/indexer.cpp $(CXX) $(CXXFLAGS) -c $< internfile.o : ../common/internfile.cpp $(CXX) $(CXXFLAGS) -c $< -mail.o : ../common/mail.cpp +mh_mail.o : ../common/mh_mail.cpp $(CXX) $(CXXFLAGS) -c $< mimehandler.o : ../common/mimehandler.cpp $(CXX) $(CXXFLAGS) -c $<