allow specifying format and charset for ext filters. Cache and reuse filters

This commit is contained in:
dockes 2008-10-04 14:26:59 +00:00
parent d05694fb82
commit 9082f3bf65
15 changed files with 208 additions and 61 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.68 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.69 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -455,20 +455,17 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
// The not so nice point was that the file name was not
// indexed.
//
// We now index at least the file name. We use a dirty
// hack to ensure that the indexing will be retried each
// time: the stored number as decimal ascii mtime is
// prefixed with a '+', which doesnt change its value for
// atoll() but is tested by rcldb::needUpdate()
// Reset the date as set by the handler if any
// We now index at least the file name and the mod time.
// We change the signature to ensure that the indexing will
// be retried every time. This can make indexing passes much
// slower if there are many files of types with no helper
doc.fmtime.erase();
// Go through:
}
if (doc.fmtime.empty()) {
// Set the date if this was not done in the document handler
doc.fmtime = (fis == FileInterner::FIError) ? plus + ascdate :
ascdate;
doc.fmtime = ascdate;
}
// Internal access path for multi-document files
@ -492,6 +489,14 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
// need for reversible formatting
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
doc.sig = cbuf;
// If there was an error, ensure indexing will be
// retried. This is for the once missing, later installed
// filter case. It can make indexing much slower (if there are
// myriads of such files, the ext script is executed for them
// and fails every time)
if (fis == FileInterner::FIError) {
doc.sig += plus;
}
// Add document to database. If there is an ipath, add it as a child
// of the file document.

View File

@ -157,11 +157,13 @@ namespace Dijon
* that the client application can pass the nested document's content
* to another filter that supports this particular type.
*/
const std::map<std::string, std::string> &get_meta_data(void) const
virtual const std::map<std::string, std::string> &get_meta_data(void) const
{
return m_metaData;
}
// Reset the filter: drop all the metadata entries currently held in m_metaData
virtual void clear() {m_metaData.clear();}
protected:
/// The MIME type handled by the filter.
std::string m_mimeType;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.43 2008-10-03 06:23:23 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.44 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -203,8 +203,9 @@ FileInterner::~FileInterner()
{
tmpcleanup();
for (vector<Dijon::Filter*>::iterator it = m_handlers.begin();
it != m_handlers.end(); it++)
delete *it;
it != m_handlers.end(); it++) {
returnMimeHandler(*it);
}
// m_tempfiles will take care of itself
}
@ -283,8 +284,10 @@ static inline bool getKeyValue(const map<string, string>& docdata,
it = docdata.find(key);
if (it != docdata.end()) {
value = it->second;
LOGDEB2(("getKeyValue: [%s]->[%s]\n", key.c_str(), value.c_str()));
return true;
}
LOGDEB2(("getKeyValue: no value for [%s]\n", key.c_str()));
return false;
}
@ -314,7 +317,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
} else if (it->first == Rcl::Doc::keyoc) {
doc.origcharset = it->second;
} else if (it->first == keymt || it->first == keycs) {
// don't need these.
// don't need/want these.
} else {
doc.meta[it->first] = it->second;
}
@ -338,7 +341,6 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc, string& ipath) const
// If there is no ipath stack, the mimetype is the one from the file
doc.mimetype = m_mimetype;
LOGDEB2(("INITIAL mimetype: %s\n", doc.mimetype.c_str()));
string ipathel;
for (vector<Dijon::Filter*>::const_iterator hit = m_handlers.begin();
@ -382,7 +384,7 @@ void FileInterner::popHandler()
m_tempfiles.pop_back();
m_tmpflgs[i] = false;
}
delete m_handlers.back();
returnMimeHandler(m_handlers.back());
m_handlers.pop_back();
}
@ -430,8 +432,8 @@ int FileInterner::addHandler()
m_forPreview ? "view" : "index");
newflt->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
// Get content: we don't use getkeyvalue() here to avoid copying
// the text, which may be big.
// Get current content: we don't use getkeyvalue() here to avoid
// copying the text, which may be big.
string ns;
const string *txt = &ns;
{
@ -469,9 +471,8 @@ int FileInterner::addHandler()
}
// Information and debug after a next_document error
void FileInterner::processNextDocError()
void FileInterner::processNextDocError(Rcl::Doc &doc, string& ipath)
{
Rcl::Doc doc; string ipath;
collectIpathAndMT(doc, ipath);
m_reason = m_handlers.back()->get_error();
checkExternalMissing(m_reason);
@ -530,7 +531,7 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
// might be ie an error while decoding an attachment, but we
// still want to process the rest of the mbox! For preview: fatal.
if (!m_handlers.back()->next_document()) {
processNextDocError(); // Debug etc.
processNextDocError(doc, ipath);
if (m_forPreview)
return FIError;
popHandler();

View File

@ -16,7 +16,7 @@
*/
#ifndef _INTERNFILE_H_INCLUDED_
#define _INTERNFILE_H_INCLUDED_
/* @(#$Id: internfile.h,v 1.19 2008-10-03 06:23:23 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: internfile.h,v 1.20 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <vector>
@ -144,7 +144,7 @@ class FileInterner {
void popHandler();
int addHandler();
void checkExternalMissing(const string& msg);
void processNextDocError();
void processNextDocError(Rcl::Doc &doc, string& ipath);
};

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.11 2008-10-02 13:30:32 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.12 2008-10-04 14:26:59 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -35,8 +35,8 @@ public:
}
};
// Execute an external program to translate a file from its native format
// to html. Then call the html parser to do the actual indexing
// Execute an external program to translate a file from its native
// format to text or html.
bool MimeHandlerExec::next_document()
{
if (m_havedoc == false)
@ -59,29 +59,28 @@ bool MimeHandlerExec::next_document()
if (!m_ipath.empty())
myparams.push_back(m_ipath);
// Execute command and store the result text, which is supposedly html
string& html = m_metaData["content"];
html.erase();
// Execute command and store the result text
string& output = m_metaData["content"];
output.erase();
ExecCmd mexec;
MEAdv adv;
mexec.setAdvise(&adv);
mexec.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" :
"RECOLL_FILTER_FORPREVIEW=no");
int status = mexec.doexec(cmd, myparams, 0, &html);
int status = mexec.doexec(cmd, myparams, 0, &output);
if (status) {
LOGERR(("MimeHandlerExec: command status 0x%x: %s\n",
status, cmd.c_str()));
// If the output string begins with RECFILTERROR, then it's
// interpretable error information
if (html.find("RECFILTERROR") == 0)
m_reason = html;
if (output.find("RECFILTERROR") == 0)
m_reason = output;
return false;
}
m_metaData["origcharset"] = m_defcharset;
// Default charset: all recoll filters output utf-8, but this
// could still be overridden by the content-type meta tag.
m_metaData["charset"] = "utf-8";
m_metaData["mimetype"] = "text/html";
m_metaData["charset"] = cfgCharset.empty() ? "utf-8" : cfgCharset;
m_metaData["mimetype"] = cfgMtype.empty() ? "text/html" : cfgMtype;
return true;
}

View File

@ -16,7 +16,7 @@
*/
#ifndef _MH_EXEC_H_INCLUDED_
#define _MH_EXEC_H_INCLUDED_
/* @(#$Id: mh_exec.h,v 1.6 2008-10-02 13:30:32 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mh_exec.h,v 1.7 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -33,7 +33,16 @@ using std::string;
*/
class MimeHandlerExec : public RecollFilter {
public:
// params, cfgMtype and cfgCharset do not get reset by
// clear(). They define what I am
list<string> params;
// The default for external filters is to output html, unless defined
// otherwise in the config.
string cfgMtype;
// For ext programs which don't output html, the output charset
// has to be known: ie they have a --charset utf-8 like option.
string cfgCharset;
MimeHandlerExec(const string& mt) : RecollFilter(mt) {}
virtual ~MimeHandlerExec() {}
virtual bool set_document_file(const string &file_path) {
@ -46,6 +55,12 @@ class MimeHandlerExec : public RecollFilter {
m_ipath = ipath;
return true;
}
// Forget the per-document state (input file name and ipath), then let
// the base class reset its own part. params, cfgMtype and cfgCharset
// are not touched here.
virtual void clear()
{
    m_ipath.clear();
    m_fn.clear();
    RecollFilter::clear();
}
private:
string m_fn;
string m_ipath;

View File

@ -16,7 +16,7 @@
*/
#ifndef _HTML_H_INCLUDED_
#define _HTML_H_INCLUDED_
/* @(#$Id: mh_html.h,v 1.11 2008-10-03 06:17:46 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mh_html.h,v 1.12 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
@ -41,7 +41,11 @@ class MimeHandlerHtml : public RecollFilter {
{
return m_html;
}
// Drop the current file name and html text, then let the base class
// reset its own part.
virtual void clear()
{
    m_html.clear();
    m_filename.clear();
    RecollFilter::clear();
}
private:
string m_filename;
string m_html;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.34 2008-09-16 08:13:45 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.35 2008-10-04 14:26:59 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -54,14 +54,24 @@ static const string cstr_title = "title";
// Destructor: all resource release is delegated to clear().
MimeHandlerMail::~MimeHandlerMail()
{
    // clear() must run unconditionally, and must be the only place that
    // deletes m_bincdoc: deleting it here first would have clear() delete
    // it a second time, and guarding the call on m_fd would skip the
    // cleanup whenever no file descriptor is open.
    clear();
}
// Release everything held for the current document and reset the
// handler: parsed message, file descriptor, stream, message index,
// subject and attachment list, then the base class state.
void MimeHandlerMail::clear()
{
    delete m_bincdoc; m_bincdoc = 0;
    if (m_fd >= 0) {
        close(m_fd);
        m_fd = -1;
    }
    // m_stream is deleted exactly once here, whether or not a file
    // descriptor was open, and the pointer is reset so that a later
    // clear() is harmless.
    delete m_stream; m_stream = 0;
    m_idx = -1;
    m_subject.erase();
    // The attachment objects are deleted before the vector of pointers
    // is emptied.
    for (vector<MHMailAttach*>::iterator it = m_attachments.begin();
         it != m_attachments.end(); it++) {
        delete *it;
    }
    m_attachments.clear();
    RecollFilter::clear();
}
bool MimeHandlerMail::set_document_file(const string &fn)

View File

@ -16,7 +16,7 @@
*/
#ifndef _MAIL_H_INCLUDED_
#define _MAIL_H_INCLUDED_
/* @(#$Id: mh_mail.h,v 1.12 2007-10-17 11:40:35 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mh_mail.h,v 1.13 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <sstream>
#include <vector>
@ -51,6 +51,7 @@ class MimeHandlerMail : public RecollFilter {
}
virtual bool next_document();
virtual bool skip_to_document(const string& ipath);
virtual void clear();
private:
bool processMsg(Binc::MimePart *doc, int depth);

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.4 2008-08-29 13:05:12 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.5 2008-10-04 14:26:59 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -39,10 +39,19 @@ using namespace std;
// Destructor: the cleanup code is shared with clear()
MimeHandlerMbox::~MimeHandlerMbox()
{
clear();
}
// Reset the handler: forget the file name and ipath, zero the message
// and line counters, close the folder file if one is open, then reset
// the base class state.
void MimeHandlerMbox::clear()
{
    m_fn.erase();
    m_ipath.erase();
    m_lineno = 0;
    m_msgnum = 0;
    if (m_vfp != 0) {
        // m_vfp is stored as a void *, it is a FILE * in fact
        fclose(static_cast<FILE *>(m_vfp));
        m_vfp = 0;
    }
    RecollFilter::clear();
}
bool MimeHandlerMbox::set_document_file(const string &fn)

View File

@ -16,7 +16,7 @@
*/
#ifndef _MBOX_H_INCLUDED_
#define _MBOX_H_INCLUDED_
/* @(#$Id: mh_mbox.h,v 1.2 2007-10-03 14:53:37 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mh_mbox.h,v 1.3 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
using std::string;
@ -40,7 +40,7 @@ class MimeHandlerMbox : public RecollFilter {
m_ipath = ipath;
return true;
}
virtual void clear();
private:
string m_fn; // File name
void *m_vfp; // File pointer for folder

View File

@ -16,7 +16,7 @@
*/
#ifndef _MH_TEXT_H_INCLUDED_
#define _MH_TEXT_H_INCLUDED_
/* @(#$Id: mh_text.h,v 1.4 2006-12-16 15:39:54 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mh_text.h,v 1.5 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
using std::string;
@ -40,6 +40,11 @@ class MimeHandlerText : public RecollFilter {
return false;
}
virtual bool next_document();
// Drop the stored text, then let the base class reset its own part.
virtual void clear() {
    m_text.clear();
    RecollFilter::clear();
}
private:
string m_text;
};

View File

@ -16,7 +16,7 @@
*/
#ifndef _MH_UNKNOWN_H_INCLUDED_
#define _MH_UNKNOWN_H_INCLUDED_
/* @(#$Id: mh_unknown.h,v 1.2 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mh_unknown.h,v 1.3 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
@ -44,6 +44,9 @@ class MimeHandlerUnknown : public RecollFilter {
m_metaData["mimetype"] = "text/plain";
return true;
}
// Nothing specific is reset in this method: it only forwards to the
// base class
virtual void clear() {
RecollFilter::clear();
}
};
#endif /* _MH_UNKNOWN_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.22 2007-11-16 14:28:52 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.23 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -34,7 +34,10 @@ using namespace std;
#include "mh_mbox.h"
#include "mh_text.h"
#include "mh_unknown.h"
// Pool of already known and created handlers
static map<string, Dijon::Filter*> o_handlers;
/** Create internal handler object appropriate for given mime type */
static Dijon::Filter *mhFactory(const string &mime)
{
@ -52,16 +55,103 @@ static Dijon::Filter *mhFactory(const string &mime)
return new MimeHandlerUnknown(lmime);
}
/*
* Return handler object for given mime type:
/**
* Create a filter that executes an external program or script
* A filter def can look like.
* exec someprog -v -t " h i j";charset= xx; mimetype=yy
* We don't support ';' inside a quoted string for now. Can't see a use
* for it
*/
// Build a MimeHandlerExec from a handler definition string.
//  cfg:   configuration object, used to locate the filter program
//         through findFilter()
//  mtype: mime type the new handler is created for
//  hs:    handler definition: a command line, optionally followed by
//         ';'-separated name = value attributes ("charset", "mimetype")
// Returns the new handler, or 0 if the definition is not usable.
MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs)
{
    // Split on ';': the first element is the command line, the
    // following ones are the optional attributes.
    list<string> semicolist;
    stringToTokens(hs, semicolist, ";");
    // The token list is what must not be empty: a definition may be
    // non-empty and still yield no element (ie: only separators), and
    // taking the first element below would then be invalid.
    if (hs.empty() || semicolist.empty()) {
        LOGERR(("mhExecFactory: bad filter def: [%s]\n", hs.c_str()));
        return 0;
    }
    string& cmd = semicolist.front();

    // We need at least 2 words in the command part (in the example
    // definition: "exec someprog")
    list<string> toks;
    stringToStrings(cmd, toks);
    if (toks.size() < 2) {
        LOGERR(("mhExecFactory: bad config line for [%s]: [%s]\n",
                mtype.c_str(), hs.c_str()));
        return 0;
    }

    MimeHandlerExec *h = new MimeHandlerExec(mtype.c_str());

    // toks size is at least 2, this was checked just above. The first
    // word is skipped, the second one goes through findFilter(), the
    // remaining ones are stored unchanged.
    list<string>::iterator it = toks.begin();
    it++;
    h->params.push_back(cfg->findFilter(*it++));
    h->params.insert(h->params.end(), it, toks.end());

    // Handle additional parameters: every element after the command
    it = semicolist.begin();
    it++;
    for (; it != semicolist.end(); it++) {
        string &line = *it;
        string::size_type eqpos = line.find('=');
        // Elements with no '=' are ignored
        if (eqpos == string::npos)
            continue;
        // Compute name and value, trim white space
        string nm = line.substr(0, eqpos);
        trimstring(nm);
        string val = line.substr(eqpos + 1, string::npos);
        trimstring(val);
        // Names other than these two are ignored
        if (nm == "charset") {
            h->cfgCharset = val;
        } else if (nm == "mimetype") {
            h->cfgMtype = val;
        }
    }

#if 0
    string sparams;
    for (it = h->params.begin(); it != h->params.end(); it++) {
        sparams += string("[") + *it + "] ";
    }
    LOGDEB(("mhExecFactory:mt [%s] cfgmt [%s] cfgcs [%s] params: [%s]\n",
            mtype.c_str(), h->cfgMtype.c_str(), h->cfgCharset.c_str(),
            sparams.c_str()));
#endif

    return h;
}
/* Return mime handler to pool.
 * The handler is reset with clear() and stored under its mime type. The
 * pool keeps a single handler per mime type: if one is already stored
 * for this type, the returned handler is deleted instead of replacing
 * it (replacing would lose the only pointer to the stored one).
 * A null pointer is ignored. */
void returnMimeHandler(Dijon::Filter *handler)
{
    if (handler == 0)
        return;
    handler->clear();
    const string mtype = handler->get_mime_type();
    map<string, Dijon::Filter *>::iterator it = o_handlers.find(mtype);
    if (it == o_handlers.end()) {
        o_handlers[mtype] = handler;
    } else if (it->second != handler) {
        // Slot already taken by another object: free this one. The
        // pointer comparison avoids deleting a handler which would be
        // returned twice.
        delete handler;
    }
}
/* Get handler/filter object for given mime type: */
Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
bool filtertypes)
{
if (mtype.empty())
return false;
// Do we already have one ?
map<string, Dijon::Filter *>::iterator it = o_handlers.find(mtype);
if (it != o_handlers.end()) {
Dijon::Filter *h = it->second;
o_handlers.erase(it);
LOGDEB2(("getMimeHandler: found in cache\n"));
return h;
}
// Get handler definition for mime type
string hs;
if (!mtype.empty())
hs = cfg->getMimeHandlerDef(mtype, filtertypes);
hs = cfg->getMimeHandlerDef(mtype, filtertypes);
if (!hs.empty()) {
// Break definition into type and name
@ -84,11 +174,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
mtype.c_str(), hs.c_str()));
return 0;
}
MimeHandlerExec *h = new MimeHandlerExec(mtype.c_str());
it++;
h->params.push_back(cfg->findFilter(*it++));
h->params.insert(h->params.end(), it, toks.end());
return h;
return mhExecFactory(cfg, mtype, hs);
}
}

View File

@ -16,7 +16,7 @@
*/
#ifndef _MIMEHANDLER_H_INCLUDED_
#define _MIMEHANDLER_H_INCLUDED_
/* @(#$Id: mimehandler.h,v 1.15 2007-11-16 14:28:52 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mimehandler.h,v 1.16 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -76,6 +76,11 @@ public:
return m_reason;
}
// Reset the preview and have-document flags, then let the Dijon base
// class reset its own part.
virtual void clear()
{
    m_havedoc = false;
    m_forPreview = false;
    Dijon::Filter::clear();
}
protected:
bool m_forPreview;
string m_defcharset;
@ -92,9 +97,11 @@ protected:
* indexedmimetypes (if this is set at all).
*/
extern Dijon::Filter *getMimeHandler(const std::string &mtyp, RclConfig *cfg,
bool filtertypes=false);
/// Free up filter for reuse (you can also delete it)
extern void returnMimeHandler(Dijon::Filter *);
/// Can this mime type be interned ?
extern bool canIntern(const std::string mimetype, RclConfig *cfg);
#endif /* _MIMEHANDLER_H_INCLUDED_ */