Changed the mime handler cache key (it was previously the mime type), to avoid having multiple copies of the same filter when it is applied to different mime types. This greatly reduces the number of processes during indexing, with no impact on performance.

This commit is contained in:
Jean-Francois Dockes 2013-04-25 18:18:48 +02:00
parent 62ca9549a3
commit a7728ceb91
20 changed files with 244 additions and 167 deletions

View File

@ -52,9 +52,10 @@ namespace Dijon
class Filter
{
public:
/// Builds an empty filter.
Filter(const std::string &mime_type) : m_mimeType(mime_type) {}
/// Destroys the filter.
Filter()
{
}
virtual ~Filter() {}
virtual void setConfig(RclConfig *) = 0;
@ -63,7 +64,8 @@ namespace Dijon
/** What data a filter supports as input.
* It can be either the whole document data, its file name, or its URI.
*/
typedef enum { DOCUMENT_DATA=0, DOCUMENT_STRING, DOCUMENT_FILE_NAME, DOCUMENT_URI } DataInput;
typedef enum { DOCUMENT_DATA=0, DOCUMENT_STRING, DOCUMENT_FILE_NAME,
DOCUMENT_URI } DataInput;
/** Input properties supported by the filter.
*
@ -94,7 +96,8 @@ namespace Dijon
/** Sets a property, prior to calling set_document_XXX().
* Returns false if the property is not supported.
*/
virtual bool set_property(Properties prop_name, const std::string &prop_value) = 0;
virtual bool set_property(Properties prop_name,
const std::string &prop_value) = 0;
/** (Re)initializes the filter with the given data.
* Caller should ensure the given pointer is valid until the
@ -103,25 +106,30 @@ namespace Dijon
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
virtual bool set_document_data(const char *data_ptr, unsigned int data_length) = 0;
virtual bool set_document_data(const std::string& mtype,
const char *data_ptr,
unsigned int data_length) = 0;
/** (Re)initializes the filter with the given data.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
virtual bool set_document_string(const std::string &data_str) = 0;
virtual bool set_document_string(const std::string& mtype,
const std::string &data_str) = 0;
/** (Re)initializes the filter with the given file.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
virtual bool set_document_file(const std::string &file_path) = 0;
virtual bool set_document_file(const std::string& mtype,
const std::string &file_path) = 0;
/** (Re)initializes the filter with the given URI.
* Call next_document() to position the filter onto the first document.
* Returns false if this input is not supported or an error occurred.
*/
virtual bool set_document_uri(const std::string &uri) = 0;
virtual bool set_document_uri(const std::string& mtype,
const std::string &uri) = 0;
/** Set the document size meta_data element. This is the size
of the immediate containing file (ie, a .doc, a .odt), not

View File

@ -263,7 +263,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
// Look for appropriate handler (might still return empty)
m_mimetype = l_mime;
Dijon::Filter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
if (!df or df->is_unknown()) {
// No real handler for this type, for now :(
@ -284,7 +284,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
#endif //RCL_USE_XATTR
df->set_docsize(docsize);
if (!df->set_document_file(m_fn)) {
if (!df->set_document_file(l_mime, m_fn)) {
delete df;
LOGERR(("FileInterner:: error converting %s\n", m_fn.c_str()));
return;
@ -315,7 +315,7 @@ void FileInterner::init(const string &data, RclConfig *cnf,
m_mimetype = imime;
// Look for appropriate handler (might still return empty)
Dijon::Filter *df = getMimeHandler(m_mimetype, m_cfg, !m_forPreview);
RecollFilter *df = getMimeHandler(m_mimetype, m_cfg, !m_forPreview);
if (!df) {
// No handler for this type, for now :( if indexallfilenames
@ -329,13 +329,13 @@ void FileInterner::init(const string &data, RclConfig *cnf,
bool result = false;
df->set_docsize(data.length());
if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
result = df->set_document_string(data);
result = df->set_document_string(m_mimetype, data);
} else if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {
result = df->set_document_data(data.c_str(), data.length());
result = df->set_document_data(m_mimetype, data.c_str(), data.length());
} else if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME)) {
TempFile temp = dataToTempFile(data, m_mimetype);
if (temp.isNotNull() &&
(result = df->set_document_file(temp->filename()))) {
(result = df->set_document_file(m_mimetype, temp->filename()))) {
m_tmpflgs[m_handlers.size()] = true;
m_tempfiles.push_back(temp);
}
@ -406,7 +406,7 @@ bool FileInterner::makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig)
FileInterner::~FileInterner()
{
for (vector<Dijon::Filter*>::iterator it = m_handlers.begin();
for (vector<RecollFilter*>::iterator it = m_handlers.begin();
it != m_handlers.end(); it++) {
returnMimeHandler(*it);
}
@ -548,7 +548,7 @@ static inline bool getKeyValue(const map<string, string>& docdata,
bool FileInterner::dijontorcl(Rcl::Doc& doc)
{
Dijon::Filter *df = m_handlers.back();
RecollFilter *df = m_handlers.back();
if (df == 0) {
//??
LOGERR(("FileInterner::dijontorcl: null top handler ??\n"));
@ -632,7 +632,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
doc.mimetype = m_mimetype;
string ipathel;
for (vector<Dijon::Filter*>::const_iterator hit = m_handlers.begin();
for (vector<RecollFilter*>::const_iterator hit = m_handlers.begin();
hit != m_handlers.end(); hit++) {
const map<string, string>& docdata = (*hit)->get_meta_data();
if (getKeyValue(docdata, cstr_dj_keyipath, ipathel)) {
@ -714,7 +714,7 @@ int FileInterner::addHandler()
return ADD_CONTINUE;
}
Dijon::Filter *newflt = getMimeHandler(mimetype, m_cfg);
RecollFilter *newflt = getMimeHandler(mimetype, m_cfg);
if (!newflt) {
// If we can't find a handler, this doc can't be handled
// but there can be other ones so we go on
@ -740,13 +740,13 @@ int FileInterner::addHandler()
bool setres = false;
newflt->set_docsize(txt->length());
if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
setres = newflt->set_document_string(*txt);
setres = newflt->set_document_string(mimetype, *txt);
} else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {
setres = newflt->set_document_data(txt->c_str(), txt->length());
setres = newflt->set_document_data(mimetype,txt->c_str(),txt->length());
} else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME)) {
TempFile temp = dataToTempFile(*txt, mimetype);
if (temp.isNotNull() &&
(setres = newflt->set_document_file(temp->filename()))) {
(setres = newflt->set_document_file(mimetype, temp->filename()))) {
m_tmpflgs[m_handlers.size()] = true;
m_tempfiles.push_back(temp);
// Hack here, but really helps perfs: if we happen to

View File

@ -28,7 +28,7 @@ using std::vector;
using std::map;
using std::set;
#include "Filter.h"
#include "mimehandler.h"
#include "uncomp.h"
#include "pathut.h"
@ -262,7 +262,7 @@ class FileInterner {
// Filter stack, path to the current document from which we're
// fetching subdocs
vector<Dijon::Filter*> m_handlers;
vector<RecollFilter*> m_handlers;
// Temporary files used for decoding the current stack
bool m_tmpflgs[MAXHANDLERS];
vector<TempFile> m_tempfiles;

View File

@ -14,6 +14,14 @@
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <sys/types.h>
#include <sys/wait.h>
#include <list>
using namespace std;
#include "cstr.h"
#include "execmd.h"
#include "mh_exec.h"
@ -24,13 +32,6 @@
#include "md5.h"
#include "rclconfig.h"
#include <sys/types.h>
#include <sys/wait.h>
#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */
// This is called periodically by ExeCmd when it is waiting for data,
// or when it does receive some. We may choose to interrupt the
// command.

View File

@ -56,11 +56,11 @@ class MimeHandlerExec : public RecollFilter {
bool missingHelper;
////////////////
MimeHandlerExec(RclConfig *cnf, const string& mt)
: RecollFilter(cnf, mt), missingHelper(false)
MimeHandlerExec(RclConfig *cnf, const string& id)
: RecollFilter(cnf, id), missingHelper(false)
{}
virtual bool set_document_file(const string &file_path) {
RecollFilter::set_document_file(file_path);
virtual bool set_document_file(const string& mt, const string &file_path) {
RecollFilter::set_document_file(mt, file_path);
m_fn = file_path;
m_havedoc = true;
return true;

View File

@ -102,14 +102,14 @@ class MimeHandlerExecMultiple : public MimeHandlerExec {
/////// End un-cleared stuff.
public:
MimeHandlerExecMultiple(RclConfig *cnf, const string& mt)
: MimeHandlerExec(cnf, mt)
MimeHandlerExecMultiple(RclConfig *cnf, const string& id)
: MimeHandlerExec(cnf, id)
{}
// No resources to clean up, the ExecCmd destructor does it.
virtual ~MimeHandlerExecMultiple() {}
virtual bool set_document_file(const string &file_path) {
virtual bool set_document_file(const string& mt, const string &file_path) {
m_filefirst = true;
return MimeHandlerExec::set_document_file(file_path);
return MimeHandlerExec::set_document_file(mt, file_path);
}
virtual bool next_document();

View File

@ -34,21 +34,23 @@ using namespace std;
#endif /* NO_NAMESPACES */
bool MimeHandlerHtml::set_document_file(const string &fn)
bool MimeHandlerHtml::set_document_file(const string& mt, const string &fn)
{
LOGDEB0(("textHtmlToDoc: %s\n", fn.c_str()));
RecollFilter::set_document_file(fn);
RecollFilter::set_document_file(mt, fn);
string otext;
if (!file_to_string(fn, otext)) {
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
return false;
}
m_filename = fn;
return set_document_string(otext);
return set_document_string(mt, otext);
}
bool MimeHandlerHtml::set_document_string(const string& htext)
bool MimeHandlerHtml::set_document_string(const string& mt,
const string& htext)
{
RecollFilter::set_document_string(mt, htext);
m_html = htext;
m_havedoc = true;

View File

@ -26,11 +26,15 @@
*/
class MimeHandlerHtml : public RecollFilter {
public:
MimeHandlerHtml(RclConfig *cnf, const string& mt)
: RecollFilter(cnf, mt) {}
virtual ~MimeHandlerHtml() {}
virtual bool set_document_file(const string &file_path);
virtual bool set_document_string(const string &data);
MimeHandlerHtml(RclConfig *cnf, const string& id)
: RecollFilter(cnf, id)
{
}
virtual ~MimeHandlerHtml()
{
}
virtual bool set_document_file(const string& mt, const string &file_path);
virtual bool set_document_string(const string& mt, const string &data);
virtual bool is_data_input_ok(DataInput input) const {
if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING)
return true;

View File

@ -46,8 +46,8 @@ using namespace std;
static const int maxdepth = 20;
static const string cstr_mail_charset("charset");
MimeHandlerMail::MimeHandlerMail(RclConfig *cnf, const string &mt)
: RecollFilter(cnf, mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
MimeHandlerMail::MimeHandlerMail(RclConfig *cnf, const string &id)
: RecollFilter(cnf, id), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
{
// Look for additional headers to be processed as per config:
@ -85,10 +85,10 @@ void MimeHandlerMail::clear()
RecollFilter::clear();
}
bool MimeHandlerMail::set_document_file(const string &fn)
bool MimeHandlerMail::set_document_file(const string& mt, const string &fn)
{
LOGDEB(("MimeHandlerMail::set_document_file(%s)\n", fn.c_str()));
RecollFilter::set_document_file(fn);
RecollFilter::set_document_file(mt, fn);
if (m_fd >= 0) {
close(m_fd);
m_fd = -1;
@ -123,10 +123,12 @@ bool MimeHandlerMail::set_document_file(const string &fn)
return true;
}
bool MimeHandlerMail::set_document_string(const string &msgtxt)
bool MimeHandlerMail::set_document_string(const string& mt,
const string &msgtxt)
{
LOGDEB1(("MimeHandlerMail::set_document_string\n"));
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
RecollFilter::set_document_string(mt, msgtxt);
delete m_stream;
if (!m_forPreview) {
@ -614,11 +616,11 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
// Handle html stripping and transcoding to utf8
if (!stringlowercmp("text/html", content_type.value)) {
MimeHandlerHtml mh(m_config, "text/html");
MimeHandlerHtml mh(m_config, "1234");
mh.set_property(Dijon::Filter::OPERATING_MODE,
m_forPreview ? "view" : "index");
mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
mh.set_document_string(body);
mh.set_document_string("text/html", body);
mh.next_document();
map<string, string>::const_iterator it =
mh.get_meta_data().find(cstr_dj_keycontent);

View File

@ -39,10 +39,10 @@ class MHMailAttach;
*/
class MimeHandlerMail : public RecollFilter {
public:
MimeHandlerMail(RclConfig *cnf, const string &mt);
MimeHandlerMail(RclConfig *cnf, const string &id);
virtual ~MimeHandlerMail();
virtual bool set_document_file(const string& file_path);
virtual bool set_document_string(const string& data);
virtual bool set_document_file(const string& mt, const string& file_path);
virtual bool set_document_string(const string& mt, const string& data);
virtual bool is_data_input_ok(DataInput input) const {
if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING)
return true;

View File

@ -251,10 +251,10 @@ void MimeHandlerMbox::clear()
RecollFilter::clear();
}
bool MimeHandlerMbox::set_document_file(const string &fn)
bool MimeHandlerMbox::set_document_file(const string& mt, const string &fn)
{
LOGDEB(("MimeHandlerMbox::set_document_file(%s)\n", fn.c_str()));
RecollFilter::set_document_file(fn);
RecollFilter::set_document_file(mt, fn);
m_fn = fn;
if (m_vfp) {
fclose((FILE *)m_vfp);
@ -598,8 +598,8 @@ int main(int argc, char **argv)
exit(1);
}
config->setKeyDir(path_getfather(filename));
MimeHandlerMbox mh(config, "text/x-mail");
if (!mh.set_document_file(filename)) {
MimeHandlerMbox mh(config, "some_id");
if (!mh.set_document_file("text/x-mail", filename)) {
cerr << "set_document_file failed" << endl;
exit(1);
}

View File

@ -31,12 +31,12 @@ using std::vector;
*/
class MimeHandlerMbox : public RecollFilter {
public:
MimeHandlerMbox(RclConfig *cnf, const string& mime)
: RecollFilter(cnf, mime), m_vfp(0), m_msgnum(0),
MimeHandlerMbox(RclConfig *cnf, const string& id)
: RecollFilter(cnf, id), m_vfp(0), m_msgnum(0),
m_lineno(0), m_fsize(0)
{}
virtual ~MimeHandlerMbox();
virtual bool set_document_file(const string &file_path);
virtual bool set_document_file(const string& mt, const string &file_path);
virtual bool next_document();
virtual bool skip_to_document(const string& ipath) {
m_ipath = ipath;

View File

@ -35,12 +35,16 @@
*/
class MimeHandlerSymlink : public RecollFilter {
public:
MimeHandlerSymlink(RclConfig *cnf, const std::string& mt)
: RecollFilter(cnf, mt) {}
virtual ~MimeHandlerSymlink() {}
virtual bool set_document_file(const string& fn)
MimeHandlerSymlink(RclConfig *cnf, const std::string& id)
: RecollFilter(cnf, id)
{
RecollFilter::set_document_file(fn);
}
virtual ~MimeHandlerSymlink()
{
}
virtual bool set_document_file(const string& mt, const string& fn)
{
RecollFilter::set_document_file(mt, fn);
m_fn = fn;
return m_havedoc = true;
}

View File

@ -39,11 +39,11 @@ const int MB = 1024*1024;
const int KB = 1024;
// Process a plain text file
bool MimeHandlerText::set_document_file(const string &fn)
bool MimeHandlerText::set_document_file(const string& mt, const string &fn)
{
LOGDEB(("MimeHandlerText::set_document_file: [%s]\n", fn.c_str()));
RecollFilter::set_document_file(fn);
RecollFilter::set_document_file(mt, fn);
m_fn = fn;
// file size for oversize check
@ -91,8 +91,9 @@ bool MimeHandlerText::set_document_file(const string &fn)
return true;
}
bool MimeHandlerText::set_document_string(const string& otext)
bool MimeHandlerText::set_document_string(const string& mt, const string& otext)
{
RecollFilter::set_document_string(mt, otext);
m_text = otext;
if (!m_forPreview) {
string md5, xmd5;

View File

@ -30,11 +30,15 @@ using std::string;
*/
class MimeHandlerText : public RecollFilter {
public:
MimeHandlerText(RclConfig *cnf, const string& mt)
: RecollFilter(cnf, mt), m_paging(false), m_offs(0) {}
virtual ~MimeHandlerText() {}
virtual bool set_document_file(const string &file_path);
virtual bool set_document_string(const string&);
MimeHandlerText(RclConfig *cnf, const string& id)
: RecollFilter(cnf, id), m_paging(false), m_offs(0)
{
}
virtual ~MimeHandlerText()
{
}
virtual bool set_document_file(const string& mt, const string &file_path);
virtual bool set_document_string(const string&, const string&);
virtual bool is_data_input_ok(DataInput input) const {
if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING)
return true;

View File

@ -28,14 +28,20 @@
*/
class MimeHandlerUnknown : public RecollFilter {
public:
MimeHandlerUnknown(RclConfig *cnf, const string& mt)
: RecollFilter(cnf, mt) {}
virtual ~MimeHandlerUnknown() {}
virtual bool set_document_file(const string& fn) {
RecollFilter::set_document_file(fn);
MimeHandlerUnknown(RclConfig *cnf, const string& id)
: RecollFilter(cnf, id)
{
}
virtual ~MimeHandlerUnknown()
{
}
virtual bool set_document_file(const string& mt, const string& fn)
{
RecollFilter::set_document_file(mt, fn);
return m_havedoc = true;
}
virtual bool set_document_string(const string&) {
virtual bool set_document_string(const string& mt, const string& s) {
RecollFilter::set_document_string(mt, s);
return m_havedoc = true;
}
virtual bool next_document() {

View File

@ -30,6 +30,7 @@ using namespace std;
#include "debuglog.h"
#include "rclconfig.h"
#include "smallut.h"
#include "md5.h"
#include "mh_exec.h"
#include "mh_execm.h"
@ -45,24 +46,26 @@ using namespace std;
// handlers. There can be several instances for a given mime type
// (think email attachment in email message: 2 rfc822 handlers are
// needed simultaneously)
static multimap<string, Dijon::Filter*> o_handlers;
static list<multimap<string, Dijon::Filter*>::iterator> o_hlru;
typedef list<multimap<string, Dijon::Filter*>::iterator>::iterator hlruit_tp;
static multimap<string, RecollFilter*> o_handlers;
static list<multimap<string, RecollFilter*>::iterator> o_hlru;
typedef list<multimap<string, RecollFilter*>::iterator>::iterator hlruit_tp;
static PTMutexInit o_handlers_mutex;
static const unsigned int max_handlers_cache_size = 100;
/* Look for mime handler in pool */
static Dijon::Filter *getMimeHandlerFromCache(const string& key)
static RecollFilter *getMimeHandlerFromCache(const string& key)
{
PTMutexLocker locker(o_handlers_mutex);
string xdigest;
MD5HexPrint(key, xdigest);
LOGDEB(("getMimeHandlerFromCache: %s cache size %u\n",
key.c_str(), o_handlers.size()));
xdigest.c_str(), o_handlers.size()));
multimap<string, Dijon::Filter *>::iterator it = o_handlers.find(key);
multimap<string, RecollFilter *>::iterator it = o_handlers.find(key);
if (it != o_handlers.end()) {
Dijon::Filter *h = it->second;
RecollFilter *h = it->second;
hlruit_tp it1 = find(o_hlru.begin(), o_hlru.end(), it);
if (it1 != o_hlru.end()) {
o_hlru.erase(it1);
@ -71,20 +74,22 @@ static Dijon::Filter *getMimeHandlerFromCache(const string& key)
}
o_handlers.erase(it);
LOGDEB(("getMimeHandlerFromCache: %s found size %u\n",
key.c_str(), o_handlers.size()));
xdigest.c_str(), o_handlers.size()));
return h;
}
LOGDEB(("getMimeHandlerFromCache: %s not found\n", key.c_str()));
LOGDEB(("getMimeHandlerFromCache: %s not found\n", xdigest.c_str()));
return 0;
}
/* Return mime handler to pool */
void returnMimeHandler(Dijon::Filter *handler)
void returnMimeHandler(RecollFilter *handler)
{
typedef multimap<string, Dijon::Filter*>::value_type value_type;
typedef multimap<string, RecollFilter*>::value_type value_type;
if (handler==0)
if (handler == 0) {
LOGERR(("returnMimeHandler: bad parameter\n"));
return;
}
handler->clear();
PTMutexLocker locker(o_handlers_mutex);
@ -97,7 +102,7 @@ void returnMimeHandler(Dijon::Filter *handler)
// at the same time either because it occurs several times in a
// stack (ie mail attachment to mail), or because several threads
// are processing the same mime type at the same time.
multimap<string, Dijon::Filter *>::iterator it;
multimap<string, RecollFilter *>::iterator it;
if (o_handlers.size() >= max_handlers_cache_size) {
static int once = 1;
if (once) {
@ -114,15 +119,15 @@ void returnMimeHandler(Dijon::Filter *handler)
o_handlers.erase(it);
}
}
it = o_handlers.insert(value_type(handler->get_mime_type(), handler));
it = o_handlers.insert(value_type(handler->get_id(), handler));
o_hlru.push_front(it);
}
void clearMimeHandlerCache()
{
LOGDEB(("clearMimeHandlerCache()\n"));
typedef multimap<string, Dijon::Filter*>::value_type value_type;
map<string, Dijon::Filter *>::iterator it;
typedef multimap<string, RecollFilter*>::value_type value_type;
map<string, RecollFilter *>::iterator it;
PTMutexLocker locker(o_handlers_mutex);
for (it = o_handlers.begin(); it != o_handlers.end(); it++) {
delete it->second;
@ -132,26 +137,32 @@ void clearMimeHandlerCache()
/** For mime types set as "internal" in mimeconf:
* create appropriate handler object. */
static Dijon::Filter *mhFactory(RclConfig *config, const string &mime)
static RecollFilter *mhFactory(RclConfig *config, const string &mime,
bool nobuild, string& id)
{
LOGDEB2(("mhFactory(%s)\n", mime.c_str()));
string lmime(mime);
stringtolower(lmime);
if (cstr_textplain == lmime) {
LOGDEB2(("mhFactory(%s): returning MimeHandlerText\n", mime.c_str()));
return new MimeHandlerText(config, lmime);
MD5String("MimeHandlerText", id);
return nobuild ? 0 : new MimeHandlerText(config, id);
} else if ("text/html" == lmime) {
LOGDEB2(("mhFactory(%s): returning MimeHandlerHtml\n", mime.c_str()));
return new MimeHandlerHtml(config, lmime);
MD5String("MimeHandlerHtml", id);
return nobuild ? 0 : new MimeHandlerHtml(config, id);
} else if ("text/x-mail" == lmime) {
LOGDEB2(("mhFactory(%s): returning MimeHandlerMbox\n", mime.c_str()));
return new MimeHandlerMbox(config, lmime);
MD5String("MimeHandlerMbox", id);
return nobuild ? 0 : new MimeHandlerMbox(config, id);
} else if ("message/rfc822" == lmime) {
LOGDEB2(("mhFactory(%s): returning MimeHandlerMail\n", mime.c_str()));
return new MimeHandlerMail(config, lmime);
MD5String("MimeHandlerMail", id);
return nobuild ? 0 : new MimeHandlerMail(config, id);
} else if ("inode/symlink" == lmime) {
LOGDEB2(("mhFactory(%s): ret MimeHandlerSymlink\n", mime.c_str()));
return new MimeHandlerSymlink(config, lmime);
MD5String("MimeHandlerSymlink", id);
return nobuild ? 0 : new MimeHandlerSymlink(config, id);
} else if (lmime.find("text/") == 0) {
// Try to handle unknown text/xx as text/plain. This
// only happen if the text/xx was defined as "internal" in
@ -159,14 +170,16 @@ static Dijon::Filter *mhFactory(RclConfig *config, const string &mime)
// allows indexing and previewing as text/plain (no filter
// exec) but still opening with a specific editor.
LOGDEB2(("mhFactory(%s): returning MimeHandlerText(x)\n",mime.c_str()));
return new MimeHandlerText(config, lmime);
MD5String("MimeHandlerText", id);
return nobuild ? 0 : new MimeHandlerText(config, id);
} else {
// We should not get there. It means that "internal" was set
// as a handler in mimeconf for a mime type we actually can't
// handle.
LOGERR(("mhFactory: mime type [%s] set as internal but unknown\n",
lmime.c_str()));
return new MimeHandlerUnknown(config, lmime);
MD5String("MimeHandlerUnknown", id);
return nobuild ? 0 : new MimeHandlerUnknown(config, id);
}
}
@ -181,10 +194,11 @@ static const string cstr_mh_charset("charset");
* a ';' inside a quoted string for now. Can't see a use for it.
*/
MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
bool multiple)
bool multiple, const string& id)
{
ConfSimple attrs;
string cmdstr;
if (!cfg->valueSplitAttributes(hs, cmdstr, attrs)) {
LOGERR(("mhExecFactory: bad config line for [%s]: [%s]\n",
mtype.c_str(), hs.c_str()));
@ -200,8 +214,8 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
return 0;
}
MimeHandlerExec *h = multiple ?
new MimeHandlerExecMultiple(cfg, mtype.c_str()) :
new MimeHandlerExec(cfg, mtype.c_str());
new MimeHandlerExecMultiple(cfg, id) :
new MimeHandlerExec(cfg, id);
list<string>::iterator it = cmdtoks.begin();
h->params.push_back(cfg->findFilter(*it++));
h->params.insert(h->params.end(), it, cmdtoks.end());
@ -228,32 +242,27 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
}
/* Get handler/filter object for given mime type: */
Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
bool filtertypes)
{
LOGDEB(("getMimeHandler: mtype [%s] filtertypes %d\n",
mtype.c_str(), filtertypes));
Dijon::Filter *h = 0;
RecollFilter *h = 0;
// Get handler definition for mime type. We do this even if an
// appropriate handler object may be in the cache (indexed by mime
// type). This is fast, and necessary to conform to the
// appropriate handler object may be in the cache.
// This is fast, and necessary to conform to the
// configuration, (ie: text/html might be filtered out by
// indexedmimetypes but an html handler could still be in the
// cache because it was needed by some other interning stack).
string hs;
hs = cfg->getMimeHandlerDef(mtype, filtertypes);
string id;
if (!hs.empty()) { // Got a handler definition line
// Do we already have a handler object in the cache ?
h = getMimeHandlerFromCache(mtype);
if (h != 0)
goto out;
LOGDEB2(("getMimeHandler: %s not in cache\n", mtype.c_str()));
// Not in cache. Break definition into type and name/command
// string and instanciate handler object
if (!hs.empty()) {
// Got a handler definition line
// Break definition into type (internal/exec/execm)
// and name/command string
string::size_type b1 = hs.find_first_of(" \t");
string handlertype = hs.substr(0, b1);
string cmdstr;
@ -261,7 +270,30 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
cmdstr = hs.substr(b1);
trimstring(cmdstr);
}
if (!stringlowercmp("internal", handlertype)) {
bool internal = !stringlowercmp("internal", handlertype);
if (internal) {
// For internal types let the factory compute the id
mhFactory(cfg, cmdstr.empty() ? mtype : cmdstr, true, id);
} else {
// exec/execm: use the md5 of the def line
MD5String(hs, id);
}
#if 0
{ // string xdigest; LOGDEB2(("getMimeHandler: [%s] hs [%s] id [%s]\n",
//mtype.c_str(), hs.c_str(), MD5HexPrint(id, xdigest).c_str()));
}
#endif
// Do we already have a handler object in the cache ?
h = getMimeHandlerFromCache(id);
if (h != 0)
goto out;
LOGDEB2(("getMimeHandler: %s not in cache\n", mtype.c_str()));
// Not in cache.
if (internal) {
// If there is a parameter after "internal" it's the mime
// type to use. This is so that we can have bogus mime
// types like text/x-purple-html-log (for ie: specific
@ -270,14 +302,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
// better and the latter will probably go away at some
// point in the future.
LOGDEB2(("handlertype internal, cmdstr [%s]\n", cmdstr.c_str()));
if (!cmdstr.empty()) {
// Have to redo the cache thing. Maybe we should
// rather just recurse instead ?
if ((h = getMimeHandlerFromCache(cmdstr)) == 0)
h = mhFactory(cfg, cmdstr);
} else {
h = mhFactory(cfg, mtype);
}
h = mhFactory(cfg, cmdstr.empty() ? mtype : cmdstr, false, id);
goto out;
} else if (!stringlowercmp("dll", handlertype)) {
} else {
@ -287,10 +312,10 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
goto out;
}
if (!stringlowercmp("exec", handlertype)) {
h = mhExecFactory(cfg, mtype, cmdstr, false);
h = mhExecFactory(cfg, mtype, cmdstr, false, id);
goto out;
} else if (!stringlowercmp("execm", handlertype)) {
h = mhExecFactory(cfg, mtype, cmdstr, true);
h = mhExecFactory(cfg, mtype, cmdstr, true, id);
goto out;
} else {
LOGERR(("getMimeHandler: bad line for %s: %s\n",
@ -305,20 +330,20 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
// Finally, unhandled files are either ignored or their name and
// generic metadata is indexed, depending on configuration
{bool indexunknown = false;
{
bool indexunknown = false;
cfg->getConfParam("indexallfilenames", &indexunknown);
if (indexunknown) {
if ((h = getMimeHandlerFromCache("application/octet-stream")) == 0)
h = new MimeHandlerUnknown(cfg, "application/octet-stream");
goto out;
} else {
goto out;
MD5String("MimeHandlerUnknown", id);
if ((h = getMimeHandlerFromCache(id)) == 0)
h = new MimeHandlerUnknown(cfg, id);
}
goto out;
}
out:
if (h) {
h->set_property(Dijon::Filter::DEFAULT_CHARSET, cfg->getDefCharset());
h->set_property(RecollFilter::DEFAULT_CHARSET, cfg->getDefCharset());
// In multithread context, and in case this handler is out
// from the cache, it may have a config pointer belonging to
// another thread. Fix it.

View File

@ -21,26 +21,23 @@
#include <stdio.h>
#include <string>
#include <list>
using std::string;
using std::list;
#include <Filter.h>
#include "Filter.h"
#include "cstr.h"
class RclConfig;
class RecollFilter : public Dijon::Filter {
public:
RecollFilter(RclConfig *config, const string& mtype)
: Dijon::Filter(mtype), m_config(config),
m_forPreview(false), m_havedoc(false)
RecollFilter(RclConfig *config, const std::string& id)
: m_config(config), m_forPreview(false), m_havedoc(false), m_id(id)
{}
virtual ~RecollFilter() {}
virtual void setConfig(RclConfig *config)
{
m_config = config;
}
virtual bool set_property(Properties p, const string &v) {
virtual bool set_property(Properties p, const std::string &v) {
switch (p) {
case DJF_UDI:
m_udi = v;
@ -59,7 +56,12 @@ public:
}
// We don't use this for now
virtual bool set_document_uri(const std::string &) {return false;}
virtual bool set_document_uri(const std::string& mtype,
const std::string &)
{
m_mimeType = mtype;
return false;
}
// This does nothing right now but should be called from the
// subclass method in case we need some common processing one day
@ -69,12 +71,24 @@ public:
// having a pure virtual called from here and implemented in the
// subclass) would have to be repeated in each derived class. It's
// just simpler this way.
virtual bool set_document_file(const string & /*file_path*/) {return true;}
virtual bool set_document_file(const std::string& mtype,
const std::string & /*file_path*/)
{
m_mimeType = mtype;
return true;
}
// Default implementations
virtual bool set_document_string(const std::string &) {return false;}
virtual bool set_document_data(const char *cp, unsigned int sz) {
return set_document_string(string(cp, sz));
virtual bool set_document_string(const std::string& mtype,
const std::string &)
{
m_mimeType = mtype;
return false;
}
virtual bool set_document_data(const std::string& mtype,
const char *cp, unsigned int sz)
{
return set_document_string(mtype, std::string(cp, sz));
}
virtual void set_docsize(size_t size)
@ -87,7 +101,7 @@ public:
virtual bool has_documents() const {return m_havedoc;}
// Most doc types are single-doc
virtual bool skip_to_document(const string& s) {
virtual bool skip_to_document(const std::string& s) {
if (s.empty())
return true;
return false;
@ -99,10 +113,15 @@ public:
return false;
}
virtual string get_error() const {
virtual std::string get_error() const {
return m_reason;
}
virtual const std::string& get_id() const
{
return m_id;
}
// "Call super" anti-pattern again. Must be called from derived
// classes which reimplement clear()
virtual void clear() {
@ -114,17 +133,20 @@ public:
// This only makes sense if the contents are currently txt/plain
// It converts from keyorigcharset to UTF-8 and sets keycharset.
bool txtdcode(const string& who);
bool txtdcode(const std::string& who);
protected:
bool preview() {return m_forPreview;}
RclConfig *m_config;
bool m_forPreview;
string m_dfltInputCharset;
string m_reason;
std::string m_dfltInputCharset;
std::string m_reason;
bool m_havedoc;
string m_udi; // May be set by creator as a hint
std::string m_udi; // May be set by creator as a hint
// m_id is an md5 of the filter definition line (from mimeconf) and
// is used when fetching/returning filters to / from the cache.
std::string m_id;
};
/**
@ -135,11 +157,11 @@ protected:
* @param filtertypes decide if we should restrict to types in
* indexedmimetypes (if this is set at all).
*/
extern Dijon::Filter *getMimeHandler(const std::string &mtyp, RclConfig *cfg,
extern RecollFilter *getMimeHandler(const std::string &mtyp, RclConfig *cfg,
bool filtertypes=false);
/// Free up filter for reuse (you can also delete it)
extern void returnMimeHandler(Dijon::Filter *);
extern void returnMimeHandler(RecollFilter *);
/// Clean up cache at the end of an indexing pass. For people who use
/// the GUI to index: avoid all those filter processes forever hanging

View File

@ -62,10 +62,6 @@ using std::pair;
#include "docseqhist.h"
#include "rclhelp.h"
#ifndef MIN
#define MIN(A,B) ((A)<(B)?(A):(B))
#endif
// Subclass plainToRich to add <termtag>s and anchors to the preview text
class PlainToRichQtPreview : public PlainToRich {
public:

View File

@ -24,8 +24,10 @@
#include <time.h>
#include <sstream>
#include <list>
using std::ostringstream;
using std::endl;
using std::list;
#include "cstr.h"
#include "reslistpager.h"