restructuring on mimehandler files

This commit is contained in:
dockes 2005-11-18 13:23:46 +00:00
parent e2053c1d1b
commit 6cba3b65c1
10 changed files with 214 additions and 138 deletions

View File

@ -0,0 +1,45 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.1 2005-11-18 13:23:46 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include "execmd.h"
#include "mh_exec.h"
#include "mh_html.h"
#include "debuglog.h"
using namespace std;
/**
 * Translate a file to an internal document by running an external filter.
 *
 * The first element of `params` names the filter (resolved through
 * find_filter()), the remaining elements are passed to it as arguments,
 * followed by the name of the file to translate. The text produced by
 * the command is expected to be html and is handed to MimeHandlerHtml,
 * which fills in `docout`.
 *
 * @param conf   configuration, used to locate the filter and passed on
 *               to the html handler
 * @param fn     name of the file to translate
 * @param mtype  mime type, used in the error message and passed on to
 *               the html handler
 * @param docout receives the resulting document
 * @return MimeHandler::MHError if `params` is empty or the command
 *         returns a non-zero status, else the html handler's status.
 *
 * The trailing unnamed string reference is not used here.
 */
MimeHandler::Status
MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn,
                       const string &mtype, Rcl::Doc &docout, string&)
{
    // Without at least a command name there is nothing we can run
    if (params.empty()) {
        LOGERR(("MimeHandlerExec::mkDoc: empty params for mime %s\n",
                mtype.c_str()));
        return MimeHandler::MHError;
    }

    // The first parameter designates the filter command
    string filter = find_filter(conf, params.front());

    // Arguments: everything after the command name, then the file name
    list<string> args;
    list<string>::iterator pos = params.begin();
    for (++pos; pos != params.end(); ++pos) {
        args.push_back(*pos);
    }
    args.push_back(fn);

    // Run the filter, collecting its output (supposedly html)
    string output;
    ExecCmd runner;
    int rc = runner.doexec(filter, args, 0, &output);
    if (rc != 0) {
        LOGERR(("MimeHandlerExec: command status 0x%x: %s\n",
                rc, filter.c_str()));
        return MimeHandler::MHError;
    }

    // Let the html handler turn the filter output into the document
    MimeHandlerHtml htmlHandler;
    return htmlHandler.mkDoc(conf, fn, output, mtype, docout);
}

26
src/internfile/mh_exec.h Normal file
View File

@ -0,0 +1,26 @@
#ifndef _MH_EXEC_H_INCLUDED_
#define _MH_EXEC_H_INCLUDED_
/* @(#$Id: mh_exec.h,v 1.1 2005-11-18 13:23:46 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
#include "rclconfig.h"
#include "rcldb.h"
#include "mimehandler.h"
/**
Turn an external document into an internal one by executing an external
filter.
The command to execute, and its parameters, come from the mimeconf file.
The text produced by the command is then processed by the html handler
(see mkDoc()).
*/
class MimeHandlerExec : public MimeHandler {
public:
/** Command line: the first element is the command name, the following
ones are its arguments. mkDoc() appends the file name to the arguments
and fails if this list is empty. */
std::list<std::string> params;
virtual ~MimeHandlerExec() {}
/** Run the filter on file fn and build docout from its html output.
Returns MimeHandler::MHError if params is empty or the command status is
non-zero. The last (unnamed) string parameter is not used by this
handler. */
virtual MimeHandler::Status
mkDoc(RclConfig *conf, const std::string &fn,
const std::string &mtype, Rcl::Doc &docout, std::string&);
};
#endif /* _MH_EXEC_H_INCLUDED_ */

View File

@ -32,7 +32,7 @@
#include "mimeparse.h"
#include "myhtmlparse.h"
#include "indextext.h"
#include "html.h"
#include "mh_html.h"
#include <iostream>
using namespace std;

View File

@ -1,25 +1,33 @@
#ifndef _HTML_H_INCLUDED_
#define _HTML_H_INCLUDED_
/* @(#$Id: mh_html.h,v 1.5 2005-11-08 21:02:55 dockes Exp $ (C) 2004 J.F.Dockes */
#include "mimehandler.h"
/* @(#$Id: mh_html.h,v 1.6 2005-11-18 13:23:46 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
/// Translate html document to an internal one.
///
/// There are 2 interfaces, depending if we're working on a file, or
/// on a string. The string form is applied to the output of external
/// handlers for foreign formats: they return a result in html, which
/// has the advantage to be text (easy to use in shell-scripts), and
/// semi-structured (can carry titles, abstracts, whatever)
#include "mimehandler.h"
/**
Translate html document to internal one.
There are 2 interfaces, depending if we're working on a file, or
on a string. The string form is applied to the output of external
handlers for foreign formats: they return a result in html, which
has the advantage to be text (easy to use in shell-scripts), and
semi-structured (can carry titles, abstracts, whatever)
*/
class MimeHandlerHtml : public MimeHandler {
public:
std::string charsethint;
/// Create internal document from html file (standard interface)
virtual MimeHandler::Status mkDoc(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string&);
/// Create internal doc from html string (postfilter for external ones)
virtual MimeHandler::Status mkDoc(RclConfig *conf, const string &fn,
const string& htext,
const string &mtype, Rcl::Doc &docout);
/** Create internal document from html file (standard interface) */
virtual MimeHandler::Status
mkDoc(RclConfig *conf, const std::string &fn,
const std::string &mtype, Rcl::Doc &docout, std::string&);
/** Create internal doc from html string (postfilter for external ones) */
virtual MimeHandler::Status
mkDoc(RclConfig *conf, const std::string &fn, const std::string& htext,
const std::string &mtype, Rcl::Doc &docout);
};
#endif /* _HTML_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.9 2005-11-08 21:02:55 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.10 2005-11-18 13:23:46 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <stdio.h>
@ -19,11 +19,11 @@ using std::map;
#include "transcode.h"
#include "mimeparse.h"
#include "indextext.h"
#include "mail.h"
#include "mh_mail.h"
#include "debuglog.h"
#include "smallut.h"
#include "mimeparse.h"
#include "html.h"
#include "mh_html.h"
// binc imap mime definitions
#include "mime.h"

View File

@ -1,14 +1,29 @@
#ifndef _MAIL_H_INCLUDED_
#define _MAIL_H_INCLUDED_
/* @(#$Id: mh_mail.h,v 1.3 2005-11-08 21:02:55 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mh_mail.h,v 1.4 2005-11-18 13:23:46 dockes Exp $ (C) 2004 J.F.Dockes */
#include "mimehandler.h"
namespace Binc {
class MimeDocument;
}
/// Translate a mail folder file into internal documents (also works
/// for maildir files)
/**
Translate a mail folder file into internal documents (also works
for maildir files). This has to keep state while parsing a mail folder
file.
*/
class MimeHandlerMail : public MimeHandler {
public:
MimeHandlerMail() : vfp(0), msgnum(0), conf(0) {}
virtual MimeHandler::Status
mkDoc(RclConfig *conf, const std::string &fn,
const std::string &mtype, Rcl::Doc &docout, std::string& ipath);
virtual ~MimeHandlerMail();
private:
void *vfp;
int msgnum;
RclConfig *conf;
@ -16,12 +31,6 @@ class MimeHandlerMail : public MimeHandler {
Rcl::Doc &docout);
MimeHandler::Status processmbox(const string &fn, Rcl::Doc &docout,
string &ipath);
public:
MimeHandlerMail() : vfp(0), msgnum(0), conf(0) {}
virtual ~MimeHandlerMail();
virtual MimeHandler::Status
mkDoc(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string& ipath);
};
#endif /* _MAIL_H_INCLUDED_ */

View File

@ -0,0 +1,46 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_text.cpp,v 1.1 2005-11-18 13:23:46 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
#include <iostream>
#include <string>
using namespace std;
#include "mh_text.h"
#include "csguess.h"
#include "debuglog.h"
#include "readfile.h"
#include "transcode.h"
/**
 * Process a plain text file.
 *
 * Reads the whole file, chooses a source character set, transcodes the
 * text to UTF-8 and stores the result in `docout`.
 *
 * @param conf   configuration: tells whether charset guessing is enabled
 *               and supplies the default charset
 * @param fn     name of the file to read
 * @param mtype  mime type (not used by this handler)
 * @param docout replaced by a fresh document holding the original
 *               charset name and the UTF-8 text
 * @return MimeHandler::MHError if the file cannot be read or the
 *         transcoding fails, MimeHandler::MHDone otherwise.
 *
 * The trailing unnamed string reference is not used here.
 */
MimeHandler::Status MimeHandlerText::mkDoc(RclConfig *conf, const string &fn,
                                           const string &mtype,
                                           Rcl::Doc &docout, string&)
{
    string otext;
    if (!file_to_string(fn, otext))
        return MimeHandler::MHError;

    // Try to guess the charset, then convert to utf-8, and fill the
    // document fields. The charset guesser really doesn't work well in
    // general and should be avoided (especially for short documents).
    string charset;
    if (conf->getGuessCharset()) {
        charset = csguess(otext, conf->getDefCharset());
    } else
        charset = conf->getDefCharset();

    string utf8;
    // %s needs a C string: pass c_str(), never the std::string object
    LOGDEB1(("textPlainToDoc: transcod from %s to %s\n",
             charset.c_str(), "UTF-8"));
    if (!transcode(otext, utf8, charset, "UTF-8")) {
        cerr << "textPlainToDoc: transcode failed: charset '" << charset
             << "' to UTF-8: "<< utf8 << endl;
        return MimeHandler::MHError;
    }

    // Start from an empty document so that no stale field survives
    Rcl::Doc out;
    out.origcharset = charset;
    out.text = utf8;
    docout = out;
    return MimeHandler::MHDone;
}

24
src/internfile/mh_text.h Normal file
View File

@ -0,0 +1,24 @@
#ifndef _MH_TEXT_H_INCLUDED_
#define _MH_TEXT_H_INCLUDED_
/* @(#$Id: mh_text.h,v 1.1 2005-11-18 13:23:46 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include "rclconfig.h"
#include "rcldb.h"
#include "mimehandler.h"
/**
* Handler for text/plain files.
*
* The file is read in full. The source charset is either guessed from
* the text or taken from the configuration default, depending on the
* configuration, and the text is then transcoded to UTF-8.
*/
class MimeHandlerText : public MimeHandler {
public:
/** Build docout (original charset name and UTF-8 text) from file fn.
* Returns MimeHandler::MHError if the file cannot be read or the
* transcoding fails, MimeHandler::MHDone otherwise. The mtype and
* trailing string parameters are not used by this handler. */
MimeHandler::Status mkDoc(RclConfig *conf, const std::string &fn,
const std::string &mtype, Rcl::Doc &docout,
std::string&);
};
#endif /* _MH_TEXT_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.11 2005-11-16 15:07:20 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.12 2005-11-18 13:23:46 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <iostream>
@ -7,103 +7,15 @@ static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.11 2005-11-16 15:07:20 dockes
using namespace std;
#include "mimehandler.h"
#include "readfile.h"
#include "csguess.h"
#include "transcode.h"
#include "debuglog.h"
#include "smallut.h"
#include "html.h"
#include "mail.h"
#include "execmd.h"
#include "pathut.h"
// Handler for plain text files: reads the file, picks a charset (guessed
// or configuration default) and transcodes the text to UTF-8.
class MimeHandlerText : public MimeHandler {
public:
// Build docout from file fn. Returns MimeHandler::MHError if the file
// cannot be read or the transcoding fails, else MimeHandler::MHDone.
// The mtype and trailing string parameters are not used by this handler.
MimeHandler::Status mkDoc(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout, string&);
};
/**
 * Process a plain text file.
 *
 * Reads the whole file, chooses a source character set, transcodes the
 * text to UTF-8 and stores the result in `docout`.
 *
 * @param conf   configuration: tells whether charset guessing is enabled
 *               and supplies the default charset
 * @param fn     name of the file to read
 * @param mtype  mime type (not used by this handler)
 * @param docout replaced by a fresh document holding the original
 *               charset name and the UTF-8 text
 * @return MimeHandler::MHError if the file cannot be read or the
 *         transcoding fails, MimeHandler::MHDone otherwise.
 *
 * The trailing unnamed string reference is not used here.
 */
MimeHandler::Status MimeHandlerText::mkDoc(RclConfig *conf, const string &fn,
                                           const string &mtype,
                                           Rcl::Doc &docout, string&)
{
    string otext;
    if (!file_to_string(fn, otext))
        return MimeHandler::MHError;

    // Try to guess the charset, then convert to utf-8, and fill the
    // document fields. The charset guesser really doesn't work well in
    // general and should be avoided (especially for short documents).
    string charset;
    if (conf->getGuessCharset()) {
        charset = csguess(otext, conf->getDefCharset());
    } else
        charset = conf->getDefCharset();

    string utf8;
    // %s needs a C string: pass c_str(), never the std::string object
    LOGDEB1(("textPlainToDoc: transcod from %s to %s\n",
             charset.c_str(), "UTF-8"));
    if (!transcode(otext, utf8, charset, "UTF-8")) {
        cerr << "textPlainToDoc: transcode failed: charset '" << charset
             << "' to UTF-8: "<< utf8 << endl;
        return MimeHandler::MHError;
    }

    // Start from an empty document so that no stale field survives
    Rcl::Doc out;
    out.origcharset = charset;
    out.text = utf8;
    docout = out;
    return MimeHandler::MHDone;
}
// Handler which translates a document by executing an external command
// and processing the command's output with the html handler.
class MimeHandlerExec : public MimeHandler {
public:
// Command line: the first element is the command name, the following
// ones are its arguments. mkDoc() appends the file name to the arguments
// and fails if this list is empty.
list<string> params;
virtual ~MimeHandlerExec() {}
// Run the command on file fn and build docout from its html output.
// Returns MimeHandler::MHError if params is empty or the command status
// is non-zero. The last (unnamed) string parameter is not used.
virtual MimeHandler::Status mkDoc(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout,
string&);
};
/**
 * Translate a file to an internal document by running an external filter.
 *
 * The first element of `params` names the filter (resolved through
 * find_filter()), the remaining elements are passed to it as arguments,
 * followed by the name of the file to translate. The text produced by
 * the command is expected to be html and is handed to MimeHandlerHtml,
 * which fills in `docout`.
 *
 * @param conf   configuration, used to locate the filter and passed on
 *               to the html handler
 * @param fn     name of the file to translate
 * @param mtype  mime type, used in the error message and passed on to
 *               the html handler
 * @param docout receives the resulting document
 * @return MimeHandler::MHError if `params` is empty or the command
 *         returns a non-zero status, else the html handler's status.
 *
 * The trailing unnamed string reference is not used here.
 */
MimeHandler::Status
MimeHandlerExec::mkDoc(RclConfig *conf, const string &fn,
                       const string &mtype, Rcl::Doc &docout, string&)
{
    // Without at least a command name there is nothing we can run
    if (params.empty()) {
        LOGERR(("MimeHandlerExec::mkDoc: empty params for mime %s\n",
                mtype.c_str()));
        return MimeHandler::MHError;
    }

    // The first parameter designates the filter command
    string filter = find_filter(conf, params.front());

    // Arguments: everything after the command name, then the file name
    list<string> args;
    list<string>::iterator pos = params.begin();
    for (++pos; pos != params.end(); ++pos) {
        args.push_back(*pos);
    }
    args.push_back(fn);

    // Run the filter, collecting its output (supposedly html)
    string output;
    ExecCmd runner;
    int rc = runner.doexec(filter, args, 0, &output);
    if (rc != 0) {
        LOGERR(("MimeHandlerExec: command status 0x%x: %s\n",
                rc, filter.c_str()));
        return MimeHandler::MHError;
    }

    // Let the html handler turn the filter output into the document
    MimeHandlerHtml htmlHandler;
    return htmlHandler.mkDoc(conf, fn, output, mtype, docout);
}
static MimeHandler *mhfact(const string &mime)
#include "mh_html.h"
#include "mh_mail.h"
#include "mh_text.h"
#include "mh_exec.h"
/** Create internal handler object appropriate for given mime type */
static MimeHandler *mhFactory(const string &mime)
{
if (!stringlowercmp("text/plain", mime))
return new MimeHandlerText;
@ -117,9 +29,9 @@ static MimeHandler *mhfact(const string &mime)
}
/**
* Return handler function for given mime type
* Return handler object for given mime type:
*/
MimeHandler *getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
MimeHandler *getMimeHandler(const string &mtype, ConfTree *mhandlers)
{
// Return handler definition for mime type
string hs;
@ -138,7 +50,7 @@ MimeHandler *getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
// Retrieve handler function according to type
if (!stringlowercmp("internal", toks.front())) {
return mhfact(mtype);
return mhFactory(mtype);
} else if (!stringlowercmp("dll", toks.front())) {
return 0;
} else if (!stringlowercmp("exec", toks.front())) {
@ -160,7 +72,7 @@ MimeHandler *getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
/**
* Return external viewer exec string for given mime type
*/
string getMimeViewer(const std::string &mtype, ConfTree *mhandlers)
string getMimeViewer(const string &mtype, ConfTree *mhandlers)
{
string hs;
mhandlers->get(mtype, hs, "view");
@ -170,7 +82,7 @@ string getMimeViewer(const std::string &mtype, ConfTree *mhandlers)
/**
* Return icon name
*/
string getMimeIconName(const std::string &mtype, ConfTree *mhandlers)
string getMimeIconName(const string &mtype, ConfTree *mhandlers)
{
string hs;
mhandlers->get(mtype, hs, "icons");
@ -180,7 +92,7 @@ string getMimeIconName(const std::string &mtype, ConfTree *mhandlers)
/**
* Return decompression command line for given mime type
*/
bool getUncompressor(const std::string &mtype, ConfTree *mhandlers,
bool getUncompressor(const string &mtype, ConfTree *mhandlers,
list<string>& cmd)
{
string hs;

View File

@ -7,17 +7,17 @@ all: depend $(LIBS)
OBJS = base64.o conftree.o csguess.o debuglog.o \
execmd.o wipedir.o \
fstreewalk.o html.o mail.o htmlparse.o idfile.o indexer.o \
internfile.o md5.o \
fstreewalk.o mh_html.o mh_mail.o mh_exec.o mh_text.o htmlparse.o \
idfile.o indexer.o internfile.o md5.o \
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathhash.o pathut.o \
rclconfig.o rcldb.o rclinit.o readfile.o smallut.o \
textsplit.o transcode.o \
unacpp.o unac.o
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
../utils/execmd.cpp ../utils/idfile.cpp ../utils/md5.cpp \
../utils/wipedir.cpp \
../utils/fstreewalk.cpp ../common/html.cpp ../common/mail.cpp \
../common/htmlparse.cpp \
../utils/wipedir.cpp ../utils/fstreewalk.cpp \
../common/mh_html.cpp ../common/mh_mail.cpp ../common/mh_exec.cpp \
../common/mh_text.cpp ../common/htmlparse.cpp \
../index/indexer.cpp ../common/internfile.cpp \
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
../common/myhtmlparse.cpp ../common/pathhash.cpp ../utils/pathut.cpp \
@ -46,7 +46,13 @@ wipedir.o : ../utils/wipedir.cpp
$(CXX) $(CXXFLAGS) -c $<
fstreewalk.o : ../utils/fstreewalk.cpp
$(CXX) $(CXXFLAGS) -c $<
html.o : ../common/html.cpp
mh_html.o : ../common/mh_html.cpp
$(CXX) $(CXXFLAGS) -c $<
mh_exec.o : ../common/mh_exec.cpp
$(CXX) $(CXXFLAGS) -c $<
mh_text.o : ../common/mh_text.cpp
$(CXX) $(CXXFLAGS) -c $<
mh_html.o : ../common/mh_html.cpp
$(CXX) $(CXXFLAGS) -c $<
htmlparse.o : ../common/htmlparse.cpp
$(CXX) $(CXXFLAGS) -c $<
@ -56,7 +62,7 @@ indexer.o : ../index/indexer.cpp
$(CXX) $(CXXFLAGS) -c $<
internfile.o : ../common/internfile.cpp
$(CXX) $(CXXFLAGS) -c $<
mail.o : ../common/mail.cpp
mh_mail.o : ../common/mh_mail.cpp
$(CXX) $(CXXFLAGS) -c $<
mimehandler.o : ../common/mimehandler.cpp
$(CXX) $(CXXFLAGS) -c $<