Changed the mime handler cache key (it was previously the mime type) to avoid keeping multiple copies of the same filter when it is applied to different mime types. This greatly reduces the number of processes during indexing, with no impact on performance.
This commit is contained in:
parent
62ca9549a3
commit
a7728ceb91
@ -52,9 +52,10 @@ namespace Dijon
|
||||
class Filter
|
||||
{
|
||||
public:
|
||||
/// Builds an empty filter.
|
||||
Filter(const std::string &mime_type) : m_mimeType(mime_type) {}
|
||||
/// Destroys the filter.
|
||||
Filter()
|
||||
{
|
||||
}
|
||||
virtual ~Filter() {}
|
||||
virtual void setConfig(RclConfig *) = 0;
|
||||
|
||||
@ -63,7 +64,8 @@ namespace Dijon
|
||||
/** What data a filter supports as input.
|
||||
* It can be either the whole document data, its file name, or its URI.
|
||||
*/
|
||||
typedef enum { DOCUMENT_DATA=0, DOCUMENT_STRING, DOCUMENT_FILE_NAME, DOCUMENT_URI } DataInput;
|
||||
typedef enum { DOCUMENT_DATA=0, DOCUMENT_STRING, DOCUMENT_FILE_NAME,
|
||||
DOCUMENT_URI } DataInput;
|
||||
|
||||
/** Input properties supported by the filter.
|
||||
*
|
||||
@ -94,7 +96,8 @@ namespace Dijon
|
||||
/** Sets a property, prior to calling set_document_XXX().
|
||||
* Returns false if the property is not supported.
|
||||
*/
|
||||
virtual bool set_property(Properties prop_name, const std::string &prop_value) = 0;
|
||||
virtual bool set_property(Properties prop_name,
|
||||
const std::string &prop_value) = 0;
|
||||
|
||||
/** (Re)initializes the filter with the given data.
|
||||
* Caller should ensure the given pointer is valid until the
|
||||
@ -103,25 +106,30 @@ namespace Dijon
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
virtual bool set_document_data(const char *data_ptr, unsigned int data_length) = 0;
|
||||
virtual bool set_document_data(const std::string& mtype,
|
||||
const char *data_ptr,
|
||||
unsigned int data_length) = 0;
|
||||
|
||||
/** (Re)initializes the filter with the given data.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
virtual bool set_document_string(const std::string &data_str) = 0;
|
||||
virtual bool set_document_string(const std::string& mtype,
|
||||
const std::string &data_str) = 0;
|
||||
|
||||
/** (Re)initializes the filter with the given file.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
virtual bool set_document_file(const std::string &file_path) = 0;
|
||||
virtual bool set_document_file(const std::string& mtype,
|
||||
const std::string &file_path) = 0;
|
||||
|
||||
/** (Re)initializes the filter with the given URI.
|
||||
* Call next_document() to position the filter onto the first document.
|
||||
* Returns false if this input is not supported or an error occurred.
|
||||
*/
|
||||
virtual bool set_document_uri(const std::string &uri) = 0;
|
||||
virtual bool set_document_uri(const std::string& mtype,
|
||||
const std::string &uri) = 0;
|
||||
|
||||
/** Set the document size meta_data element. This is the size
|
||||
of the immediate containing file (ie, a .doc, a .odt), not
|
||||
|
||||
@ -263,7 +263,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
|
||||
|
||||
// Look for appropriate handler (might still return empty)
|
||||
m_mimetype = l_mime;
|
||||
Dijon::Filter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
|
||||
RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
|
||||
|
||||
if (!df or df->is_unknown()) {
|
||||
// No real handler for this type, for now :(
|
||||
@ -284,7 +284,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
|
||||
#endif //RCL_USE_XATTR
|
||||
|
||||
df->set_docsize(docsize);
|
||||
if (!df->set_document_file(m_fn)) {
|
||||
if (!df->set_document_file(l_mime, m_fn)) {
|
||||
delete df;
|
||||
LOGERR(("FileInterner:: error converting %s\n", m_fn.c_str()));
|
||||
return;
|
||||
@ -315,7 +315,7 @@ void FileInterner::init(const string &data, RclConfig *cnf,
|
||||
m_mimetype = imime;
|
||||
|
||||
// Look for appropriate handler (might still return empty)
|
||||
Dijon::Filter *df = getMimeHandler(m_mimetype, m_cfg, !m_forPreview);
|
||||
RecollFilter *df = getMimeHandler(m_mimetype, m_cfg, !m_forPreview);
|
||||
|
||||
if (!df) {
|
||||
// No handler for this type, for now :( if indexallfilenames
|
||||
@ -329,13 +329,13 @@ void FileInterner::init(const string &data, RclConfig *cnf,
|
||||
bool result = false;
|
||||
df->set_docsize(data.length());
|
||||
if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
|
||||
result = df->set_document_string(data);
|
||||
result = df->set_document_string(m_mimetype, data);
|
||||
} else if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {
|
||||
result = df->set_document_data(data.c_str(), data.length());
|
||||
result = df->set_document_data(m_mimetype, data.c_str(), data.length());
|
||||
} else if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME)) {
|
||||
TempFile temp = dataToTempFile(data, m_mimetype);
|
||||
if (temp.isNotNull() &&
|
||||
(result = df->set_document_file(temp->filename()))) {
|
||||
(result = df->set_document_file(m_mimetype, temp->filename()))) {
|
||||
m_tmpflgs[m_handlers.size()] = true;
|
||||
m_tempfiles.push_back(temp);
|
||||
}
|
||||
@ -406,7 +406,7 @@ bool FileInterner::makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig)
|
||||
|
||||
FileInterner::~FileInterner()
|
||||
{
|
||||
for (vector<Dijon::Filter*>::iterator it = m_handlers.begin();
|
||||
for (vector<RecollFilter*>::iterator it = m_handlers.begin();
|
||||
it != m_handlers.end(); it++) {
|
||||
returnMimeHandler(*it);
|
||||
}
|
||||
@ -548,7 +548,7 @@ static inline bool getKeyValue(const map<string, string>& docdata,
|
||||
|
||||
bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
||||
{
|
||||
Dijon::Filter *df = m_handlers.back();
|
||||
RecollFilter *df = m_handlers.back();
|
||||
if (df == 0) {
|
||||
//??
|
||||
LOGERR(("FileInterner::dijontorcl: null top handler ??\n"));
|
||||
@ -632,7 +632,7 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
|
||||
doc.mimetype = m_mimetype;
|
||||
|
||||
string ipathel;
|
||||
for (vector<Dijon::Filter*>::const_iterator hit = m_handlers.begin();
|
||||
for (vector<RecollFilter*>::const_iterator hit = m_handlers.begin();
|
||||
hit != m_handlers.end(); hit++) {
|
||||
const map<string, string>& docdata = (*hit)->get_meta_data();
|
||||
if (getKeyValue(docdata, cstr_dj_keyipath, ipathel)) {
|
||||
@ -714,7 +714,7 @@ int FileInterner::addHandler()
|
||||
return ADD_CONTINUE;
|
||||
}
|
||||
|
||||
Dijon::Filter *newflt = getMimeHandler(mimetype, m_cfg);
|
||||
RecollFilter *newflt = getMimeHandler(mimetype, m_cfg);
|
||||
if (!newflt) {
|
||||
// If we can't find a handler, this doc can't be handled
|
||||
// but there can be other ones so we go on
|
||||
@ -740,13 +740,13 @@ int FileInterner::addHandler()
|
||||
bool setres = false;
|
||||
newflt->set_docsize(txt->length());
|
||||
if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
|
||||
setres = newflt->set_document_string(*txt);
|
||||
setres = newflt->set_document_string(mimetype, *txt);
|
||||
} else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {
|
||||
setres = newflt->set_document_data(txt->c_str(), txt->length());
|
||||
setres = newflt->set_document_data(mimetype,txt->c_str(),txt->length());
|
||||
} else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME)) {
|
||||
TempFile temp = dataToTempFile(*txt, mimetype);
|
||||
if (temp.isNotNull() &&
|
||||
(setres = newflt->set_document_file(temp->filename()))) {
|
||||
(setres = newflt->set_document_file(mimetype, temp->filename()))) {
|
||||
m_tmpflgs[m_handlers.size()] = true;
|
||||
m_tempfiles.push_back(temp);
|
||||
// Hack here, but really helps perfs: if we happen to
|
||||
|
||||
@ -28,7 +28,7 @@ using std::vector;
|
||||
using std::map;
|
||||
using std::set;
|
||||
|
||||
#include "Filter.h"
|
||||
#include "mimehandler.h"
|
||||
#include "uncomp.h"
|
||||
#include "pathut.h"
|
||||
|
||||
@ -262,7 +262,7 @@ class FileInterner {
|
||||
|
||||
// Filter stack, path to the current document from which we're
|
||||
// fetching subdocs
|
||||
vector<Dijon::Filter*> m_handlers;
|
||||
vector<RecollFilter*> m_handlers;
|
||||
// Temporary files used for decoding the current stack
|
||||
bool m_tmpflgs[MAXHANDLERS];
|
||||
vector<TempFile> m_tempfiles;
|
||||
|
||||
@ -14,6 +14,14 @@
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
|
||||
#include <list>
|
||||
using namespace std;
|
||||
|
||||
#include "cstr.h"
|
||||
#include "execmd.h"
|
||||
#include "mh_exec.h"
|
||||
@ -24,13 +32,6 @@
|
||||
#include "md5.h"
|
||||
#include "rclconfig.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
// This is called periodically by ExeCmd when it is waiting for data,
|
||||
// or when it does receive some. We may choose to interrupt the
|
||||
// command.
|
||||
|
||||
@ -56,11 +56,11 @@ class MimeHandlerExec : public RecollFilter {
|
||||
bool missingHelper;
|
||||
////////////////
|
||||
|
||||
MimeHandlerExec(RclConfig *cnf, const string& mt)
|
||||
: RecollFilter(cnf, mt), missingHelper(false)
|
||||
MimeHandlerExec(RclConfig *cnf, const string& id)
|
||||
: RecollFilter(cnf, id), missingHelper(false)
|
||||
{}
|
||||
virtual bool set_document_file(const string &file_path) {
|
||||
RecollFilter::set_document_file(file_path);
|
||||
virtual bool set_document_file(const string& mt, const string &file_path) {
|
||||
RecollFilter::set_document_file(mt, file_path);
|
||||
m_fn = file_path;
|
||||
m_havedoc = true;
|
||||
return true;
|
||||
|
||||
@ -102,14 +102,14 @@ class MimeHandlerExecMultiple : public MimeHandlerExec {
|
||||
/////// End un-cleared stuff.
|
||||
|
||||
public:
|
||||
MimeHandlerExecMultiple(RclConfig *cnf, const string& mt)
|
||||
: MimeHandlerExec(cnf, mt)
|
||||
MimeHandlerExecMultiple(RclConfig *cnf, const string& id)
|
||||
: MimeHandlerExec(cnf, id)
|
||||
{}
|
||||
// No resources to clean up, the ExecCmd destructor does it.
|
||||
virtual ~MimeHandlerExecMultiple() {}
|
||||
virtual bool set_document_file(const string &file_path) {
|
||||
virtual bool set_document_file(const string& mt, const string &file_path) {
|
||||
m_filefirst = true;
|
||||
return MimeHandlerExec::set_document_file(file_path);
|
||||
return MimeHandlerExec::set_document_file(mt, file_path);
|
||||
}
|
||||
virtual bool next_document();
|
||||
|
||||
|
||||
@ -34,21 +34,23 @@ using namespace std;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
|
||||
bool MimeHandlerHtml::set_document_file(const string &fn)
|
||||
bool MimeHandlerHtml::set_document_file(const string& mt, const string &fn)
|
||||
{
|
||||
LOGDEB0(("textHtmlToDoc: %s\n", fn.c_str()));
|
||||
RecollFilter::set_document_file(fn);
|
||||
RecollFilter::set_document_file(mt, fn);
|
||||
string otext;
|
||||
if (!file_to_string(fn, otext)) {
|
||||
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
|
||||
return false;
|
||||
}
|
||||
m_filename = fn;
|
||||
return set_document_string(otext);
|
||||
return set_document_string(mt, otext);
|
||||
}
|
||||
|
||||
bool MimeHandlerHtml::set_document_string(const string& htext)
|
||||
bool MimeHandlerHtml::set_document_string(const string& mt,
|
||||
const string& htext)
|
||||
{
|
||||
RecollFilter::set_document_string(mt, htext);
|
||||
m_html = htext;
|
||||
m_havedoc = true;
|
||||
|
||||
|
||||
@ -26,11 +26,15 @@
|
||||
*/
|
||||
class MimeHandlerHtml : public RecollFilter {
|
||||
public:
|
||||
MimeHandlerHtml(RclConfig *cnf, const string& mt)
|
||||
: RecollFilter(cnf, mt) {}
|
||||
virtual ~MimeHandlerHtml() {}
|
||||
virtual bool set_document_file(const string &file_path);
|
||||
virtual bool set_document_string(const string &data);
|
||||
MimeHandlerHtml(RclConfig *cnf, const string& id)
|
||||
: RecollFilter(cnf, id)
|
||||
{
|
||||
}
|
||||
virtual ~MimeHandlerHtml()
|
||||
{
|
||||
}
|
||||
virtual bool set_document_file(const string& mt, const string &file_path);
|
||||
virtual bool set_document_string(const string& mt, const string &data);
|
||||
virtual bool is_data_input_ok(DataInput input) const {
|
||||
if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING)
|
||||
return true;
|
||||
|
||||
@ -46,8 +46,8 @@ using namespace std;
|
||||
static const int maxdepth = 20;
|
||||
static const string cstr_mail_charset("charset");
|
||||
|
||||
MimeHandlerMail::MimeHandlerMail(RclConfig *cnf, const string &mt)
|
||||
: RecollFilter(cnf, mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
|
||||
MimeHandlerMail::MimeHandlerMail(RclConfig *cnf, const string &id)
|
||||
: RecollFilter(cnf, id), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
|
||||
{
|
||||
|
||||
// Look for additional headers to be processed as per config:
|
||||
@ -85,10 +85,10 @@ void MimeHandlerMail::clear()
|
||||
RecollFilter::clear();
|
||||
}
|
||||
|
||||
bool MimeHandlerMail::set_document_file(const string &fn)
|
||||
bool MimeHandlerMail::set_document_file(const string& mt, const string &fn)
|
||||
{
|
||||
LOGDEB(("MimeHandlerMail::set_document_file(%s)\n", fn.c_str()));
|
||||
RecollFilter::set_document_file(fn);
|
||||
RecollFilter::set_document_file(mt, fn);
|
||||
if (m_fd >= 0) {
|
||||
close(m_fd);
|
||||
m_fd = -1;
|
||||
@ -123,10 +123,12 @@ bool MimeHandlerMail::set_document_file(const string &fn)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool MimeHandlerMail::set_document_string(const string &msgtxt)
|
||||
bool MimeHandlerMail::set_document_string(const string& mt,
|
||||
const string &msgtxt)
|
||||
{
|
||||
LOGDEB1(("MimeHandlerMail::set_document_string\n"));
|
||||
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
|
||||
RecollFilter::set_document_string(mt, msgtxt);
|
||||
delete m_stream;
|
||||
|
||||
if (!m_forPreview) {
|
||||
@ -614,11 +616,11 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
|
||||
|
||||
// Handle html stripping and transcoding to utf8
|
||||
if (!stringlowercmp("text/html", content_type.value)) {
|
||||
MimeHandlerHtml mh(m_config, "text/html");
|
||||
MimeHandlerHtml mh(m_config, "1234");
|
||||
mh.set_property(Dijon::Filter::OPERATING_MODE,
|
||||
m_forPreview ? "view" : "index");
|
||||
mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
||||
mh.set_document_string(body);
|
||||
mh.set_document_string("text/html", body);
|
||||
mh.next_document();
|
||||
map<string, string>::const_iterator it =
|
||||
mh.get_meta_data().find(cstr_dj_keycontent);
|
||||
|
||||
@ -39,10 +39,10 @@ class MHMailAttach;
|
||||
*/
|
||||
class MimeHandlerMail : public RecollFilter {
|
||||
public:
|
||||
MimeHandlerMail(RclConfig *cnf, const string &mt);
|
||||
MimeHandlerMail(RclConfig *cnf, const string &id);
|
||||
virtual ~MimeHandlerMail();
|
||||
virtual bool set_document_file(const string& file_path);
|
||||
virtual bool set_document_string(const string& data);
|
||||
virtual bool set_document_file(const string& mt, const string& file_path);
|
||||
virtual bool set_document_string(const string& mt, const string& data);
|
||||
virtual bool is_data_input_ok(DataInput input) const {
|
||||
if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING)
|
||||
return true;
|
||||
|
||||
@ -251,10 +251,10 @@ void MimeHandlerMbox::clear()
|
||||
RecollFilter::clear();
|
||||
}
|
||||
|
||||
bool MimeHandlerMbox::set_document_file(const string &fn)
|
||||
bool MimeHandlerMbox::set_document_file(const string& mt, const string &fn)
|
||||
{
|
||||
LOGDEB(("MimeHandlerMbox::set_document_file(%s)\n", fn.c_str()));
|
||||
RecollFilter::set_document_file(fn);
|
||||
RecollFilter::set_document_file(mt, fn);
|
||||
m_fn = fn;
|
||||
if (m_vfp) {
|
||||
fclose((FILE *)m_vfp);
|
||||
@ -598,8 +598,8 @@ int main(int argc, char **argv)
|
||||
exit(1);
|
||||
}
|
||||
config->setKeyDir(path_getfather(filename));
|
||||
MimeHandlerMbox mh(config, "text/x-mail");
|
||||
if (!mh.set_document_file(filename)) {
|
||||
MimeHandlerMbox mh(config, "some_id");
|
||||
if (!mh.set_document_file("text/x-mail", filename)) {
|
||||
cerr << "set_document_file failed" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -31,12 +31,12 @@ using std::vector;
|
||||
*/
|
||||
class MimeHandlerMbox : public RecollFilter {
|
||||
public:
|
||||
MimeHandlerMbox(RclConfig *cnf, const string& mime)
|
||||
: RecollFilter(cnf, mime), m_vfp(0), m_msgnum(0),
|
||||
MimeHandlerMbox(RclConfig *cnf, const string& id)
|
||||
: RecollFilter(cnf, id), m_vfp(0), m_msgnum(0),
|
||||
m_lineno(0), m_fsize(0)
|
||||
{}
|
||||
virtual ~MimeHandlerMbox();
|
||||
virtual bool set_document_file(const string &file_path);
|
||||
virtual bool set_document_file(const string& mt, const string &file_path);
|
||||
virtual bool next_document();
|
||||
virtual bool skip_to_document(const string& ipath) {
|
||||
m_ipath = ipath;
|
||||
|
||||
@ -35,12 +35,16 @@
|
||||
*/
|
||||
class MimeHandlerSymlink : public RecollFilter {
|
||||
public:
|
||||
MimeHandlerSymlink(RclConfig *cnf, const std::string& mt)
|
||||
: RecollFilter(cnf, mt) {}
|
||||
virtual ~MimeHandlerSymlink() {}
|
||||
virtual bool set_document_file(const string& fn)
|
||||
MimeHandlerSymlink(RclConfig *cnf, const std::string& id)
|
||||
: RecollFilter(cnf, id)
|
||||
{
|
||||
RecollFilter::set_document_file(fn);
|
||||
}
|
||||
virtual ~MimeHandlerSymlink()
|
||||
{
|
||||
}
|
||||
virtual bool set_document_file(const string& mt, const string& fn)
|
||||
{
|
||||
RecollFilter::set_document_file(mt, fn);
|
||||
m_fn = fn;
|
||||
return m_havedoc = true;
|
||||
}
|
||||
|
||||
@ -39,11 +39,11 @@ const int MB = 1024*1024;
|
||||
const int KB = 1024;
|
||||
|
||||
// Process a plain text file
|
||||
bool MimeHandlerText::set_document_file(const string &fn)
|
||||
bool MimeHandlerText::set_document_file(const string& mt, const string &fn)
|
||||
{
|
||||
LOGDEB(("MimeHandlerText::set_document_file: [%s]\n", fn.c_str()));
|
||||
|
||||
RecollFilter::set_document_file(fn);
|
||||
RecollFilter::set_document_file(mt, fn);
|
||||
m_fn = fn;
|
||||
|
||||
// file size for oversize check
|
||||
@ -91,8 +91,9 @@ bool MimeHandlerText::set_document_file(const string &fn)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool MimeHandlerText::set_document_string(const string& otext)
|
||||
bool MimeHandlerText::set_document_string(const string& mt, const string& otext)
|
||||
{
|
||||
RecollFilter::set_document_string(mt, otext);
|
||||
m_text = otext;
|
||||
if (!m_forPreview) {
|
||||
string md5, xmd5;
|
||||
|
||||
@ -30,11 +30,15 @@ using std::string;
|
||||
*/
|
||||
class MimeHandlerText : public RecollFilter {
|
||||
public:
|
||||
MimeHandlerText(RclConfig *cnf, const string& mt)
|
||||
: RecollFilter(cnf, mt), m_paging(false), m_offs(0) {}
|
||||
virtual ~MimeHandlerText() {}
|
||||
virtual bool set_document_file(const string &file_path);
|
||||
virtual bool set_document_string(const string&);
|
||||
MimeHandlerText(RclConfig *cnf, const string& id)
|
||||
: RecollFilter(cnf, id), m_paging(false), m_offs(0)
|
||||
{
|
||||
}
|
||||
virtual ~MimeHandlerText()
|
||||
{
|
||||
}
|
||||
virtual bool set_document_file(const string& mt, const string &file_path);
|
||||
virtual bool set_document_string(const string&, const string&);
|
||||
virtual bool is_data_input_ok(DataInput input) const {
|
||||
if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING)
|
||||
return true;
|
||||
|
||||
@ -28,14 +28,20 @@
|
||||
*/
|
||||
class MimeHandlerUnknown : public RecollFilter {
|
||||
public:
|
||||
MimeHandlerUnknown(RclConfig *cnf, const string& mt)
|
||||
: RecollFilter(cnf, mt) {}
|
||||
virtual ~MimeHandlerUnknown() {}
|
||||
virtual bool set_document_file(const string& fn) {
|
||||
RecollFilter::set_document_file(fn);
|
||||
MimeHandlerUnknown(RclConfig *cnf, const string& id)
|
||||
: RecollFilter(cnf, id)
|
||||
{
|
||||
}
|
||||
virtual ~MimeHandlerUnknown()
|
||||
{
|
||||
}
|
||||
virtual bool set_document_file(const string& mt, const string& fn)
|
||||
{
|
||||
RecollFilter::set_document_file(mt, fn);
|
||||
return m_havedoc = true;
|
||||
}
|
||||
virtual bool set_document_string(const string&) {
|
||||
virtual bool set_document_string(const string& mt, const string& s) {
|
||||
RecollFilter::set_document_string(mt, s);
|
||||
return m_havedoc = true;
|
||||
}
|
||||
virtual bool next_document() {
|
||||
|
||||
@ -30,6 +30,7 @@ using namespace std;
|
||||
#include "debuglog.h"
|
||||
#include "rclconfig.h"
|
||||
#include "smallut.h"
|
||||
#include "md5.h"
|
||||
|
||||
#include "mh_exec.h"
|
||||
#include "mh_execm.h"
|
||||
@ -45,24 +46,26 @@ using namespace std;
|
||||
// handlers. There can be several instances for a given mime type
|
||||
// (think email attachment in email message: 2 rfc822 handlers are
|
||||
// needed simultaneously)
|
||||
static multimap<string, Dijon::Filter*> o_handlers;
|
||||
static list<multimap<string, Dijon::Filter*>::iterator> o_hlru;
|
||||
typedef list<multimap<string, Dijon::Filter*>::iterator>::iterator hlruit_tp;
|
||||
static multimap<string, RecollFilter*> o_handlers;
|
||||
static list<multimap<string, RecollFilter*>::iterator> o_hlru;
|
||||
typedef list<multimap<string, RecollFilter*>::iterator>::iterator hlruit_tp;
|
||||
|
||||
static PTMutexInit o_handlers_mutex;
|
||||
|
||||
static const unsigned int max_handlers_cache_size = 100;
|
||||
|
||||
/* Look for mime handler in pool */
|
||||
static Dijon::Filter *getMimeHandlerFromCache(const string& key)
|
||||
static RecollFilter *getMimeHandlerFromCache(const string& key)
|
||||
{
|
||||
PTMutexLocker locker(o_handlers_mutex);
|
||||
string xdigest;
|
||||
MD5HexPrint(key, xdigest);
|
||||
LOGDEB(("getMimeHandlerFromCache: %s cache size %u\n",
|
||||
key.c_str(), o_handlers.size()));
|
||||
xdigest.c_str(), o_handlers.size()));
|
||||
|
||||
multimap<string, Dijon::Filter *>::iterator it = o_handlers.find(key);
|
||||
multimap<string, RecollFilter *>::iterator it = o_handlers.find(key);
|
||||
if (it != o_handlers.end()) {
|
||||
Dijon::Filter *h = it->second;
|
||||
RecollFilter *h = it->second;
|
||||
hlruit_tp it1 = find(o_hlru.begin(), o_hlru.end(), it);
|
||||
if (it1 != o_hlru.end()) {
|
||||
o_hlru.erase(it1);
|
||||
@ -71,20 +74,22 @@ static Dijon::Filter *getMimeHandlerFromCache(const string& key)
|
||||
}
|
||||
o_handlers.erase(it);
|
||||
LOGDEB(("getMimeHandlerFromCache: %s found size %u\n",
|
||||
key.c_str(), o_handlers.size()));
|
||||
xdigest.c_str(), o_handlers.size()));
|
||||
return h;
|
||||
}
|
||||
LOGDEB(("getMimeHandlerFromCache: %s not found\n", key.c_str()));
|
||||
LOGDEB(("getMimeHandlerFromCache: %s not found\n", xdigest.c_str()));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Return mime handler to pool */
|
||||
void returnMimeHandler(Dijon::Filter *handler)
|
||||
void returnMimeHandler(RecollFilter *handler)
|
||||
{
|
||||
typedef multimap<string, Dijon::Filter*>::value_type value_type;
|
||||
typedef multimap<string, RecollFilter*>::value_type value_type;
|
||||
|
||||
if (handler==0)
|
||||
if (handler == 0) {
|
||||
LOGERR(("returnMimeHandler: bad parameter\n"));
|
||||
return;
|
||||
}
|
||||
handler->clear();
|
||||
|
||||
PTMutexLocker locker(o_handlers_mutex);
|
||||
@ -97,7 +102,7 @@ void returnMimeHandler(Dijon::Filter *handler)
|
||||
// at the same time either because it occurs several times in a
|
||||
// stack (ie mail attachment to mail), or because several threads
|
||||
// are processing the same mime type at the same time.
|
||||
multimap<string, Dijon::Filter *>::iterator it;
|
||||
multimap<string, RecollFilter *>::iterator it;
|
||||
if (o_handlers.size() >= max_handlers_cache_size) {
|
||||
static int once = 1;
|
||||
if (once) {
|
||||
@ -114,15 +119,15 @@ void returnMimeHandler(Dijon::Filter *handler)
|
||||
o_handlers.erase(it);
|
||||
}
|
||||
}
|
||||
it = o_handlers.insert(value_type(handler->get_mime_type(), handler));
|
||||
it = o_handlers.insert(value_type(handler->get_id(), handler));
|
||||
o_hlru.push_front(it);
|
||||
}
|
||||
|
||||
void clearMimeHandlerCache()
|
||||
{
|
||||
LOGDEB(("clearMimeHandlerCache()\n"));
|
||||
typedef multimap<string, Dijon::Filter*>::value_type value_type;
|
||||
map<string, Dijon::Filter *>::iterator it;
|
||||
typedef multimap<string, RecollFilter*>::value_type value_type;
|
||||
map<string, RecollFilter *>::iterator it;
|
||||
PTMutexLocker locker(o_handlers_mutex);
|
||||
for (it = o_handlers.begin(); it != o_handlers.end(); it++) {
|
||||
delete it->second;
|
||||
@ -132,26 +137,32 @@ void clearMimeHandlerCache()
|
||||
|
||||
/** For mime types set as "internal" in mimeconf:
|
||||
* create appropriate handler object. */
|
||||
static Dijon::Filter *mhFactory(RclConfig *config, const string &mime)
|
||||
static RecollFilter *mhFactory(RclConfig *config, const string &mime,
|
||||
bool nobuild, string& id)
|
||||
{
|
||||
LOGDEB2(("mhFactory(%s)\n", mime.c_str()));
|
||||
string lmime(mime);
|
||||
stringtolower(lmime);
|
||||
if (cstr_textplain == lmime) {
|
||||
LOGDEB2(("mhFactory(%s): returning MimeHandlerText\n", mime.c_str()));
|
||||
return new MimeHandlerText(config, lmime);
|
||||
MD5String("MimeHandlerText", id);
|
||||
return nobuild ? 0 : new MimeHandlerText(config, id);
|
||||
} else if ("text/html" == lmime) {
|
||||
LOGDEB2(("mhFactory(%s): returning MimeHandlerHtml\n", mime.c_str()));
|
||||
return new MimeHandlerHtml(config, lmime);
|
||||
MD5String("MimeHandlerHtml", id);
|
||||
return nobuild ? 0 : new MimeHandlerHtml(config, id);
|
||||
} else if ("text/x-mail" == lmime) {
|
||||
LOGDEB2(("mhFactory(%s): returning MimeHandlerMbox\n", mime.c_str()));
|
||||
return new MimeHandlerMbox(config, lmime);
|
||||
MD5String("MimeHandlerMbox", id);
|
||||
return nobuild ? 0 : new MimeHandlerMbox(config, id);
|
||||
} else if ("message/rfc822" == lmime) {
|
||||
LOGDEB2(("mhFactory(%s): returning MimeHandlerMail\n", mime.c_str()));
|
||||
return new MimeHandlerMail(config, lmime);
|
||||
MD5String("MimeHandlerMail", id);
|
||||
return nobuild ? 0 : new MimeHandlerMail(config, id);
|
||||
} else if ("inode/symlink" == lmime) {
|
||||
LOGDEB2(("mhFactory(%s): ret MimeHandlerSymlink\n", mime.c_str()));
|
||||
return new MimeHandlerSymlink(config, lmime);
|
||||
MD5String("MimeHandlerSymlink", id);
|
||||
return nobuild ? 0 : new MimeHandlerSymlink(config, id);
|
||||
} else if (lmime.find("text/") == 0) {
|
||||
// Try to handle unknown text/xx as text/plain. This
|
||||
// only happens if the text/xx was defined as "internal" in
|
||||
@ -159,14 +170,16 @@ static Dijon::Filter *mhFactory(RclConfig *config, const string &mime)
|
||||
// allows indexing and previewing as text/plain (no filter
|
||||
// exec) but still opening with a specific editor.
|
||||
LOGDEB2(("mhFactory(%s): returning MimeHandlerText(x)\n",mime.c_str()));
|
||||
return new MimeHandlerText(config, lmime);
|
||||
MD5String("MimeHandlerText", id);
|
||||
return nobuild ? 0 : new MimeHandlerText(config, id);
|
||||
} else {
|
||||
// We should not get there. It means that "internal" was set
|
||||
// as a handler in mimeconf for a mime type we actually can't
|
||||
// handle.
|
||||
LOGERR(("mhFactory: mime type [%s] set as internal but unknown\n",
|
||||
lmime.c_str()));
|
||||
return new MimeHandlerUnknown(config, lmime);
|
||||
MD5String("MimeHandlerUnknown", id);
|
||||
return nobuild ? 0 : new MimeHandlerUnknown(config, id);
|
||||
}
|
||||
}
|
||||
|
||||
@ -181,10 +194,11 @@ static const string cstr_mh_charset("charset");
|
||||
* a ';' inside a quoted string for now. Can't see a use for it.
|
||||
*/
|
||||
MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
|
||||
bool multiple)
|
||||
bool multiple, const string& id)
|
||||
{
|
||||
ConfSimple attrs;
|
||||
string cmdstr;
|
||||
|
||||
if (!cfg->valueSplitAttributes(hs, cmdstr, attrs)) {
|
||||
LOGERR(("mhExecFactory: bad config line for [%s]: [%s]\n",
|
||||
mtype.c_str(), hs.c_str()));
|
||||
@ -200,8 +214,8 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
|
||||
return 0;
|
||||
}
|
||||
MimeHandlerExec *h = multiple ?
|
||||
new MimeHandlerExecMultiple(cfg, mtype.c_str()) :
|
||||
new MimeHandlerExec(cfg, mtype.c_str());
|
||||
new MimeHandlerExecMultiple(cfg, id) :
|
||||
new MimeHandlerExec(cfg, id);
|
||||
list<string>::iterator it = cmdtoks.begin();
|
||||
h->params.push_back(cfg->findFilter(*it++));
|
||||
h->params.insert(h->params.end(), it, cmdtoks.end());
|
||||
@ -228,32 +242,27 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
|
||||
}
|
||||
|
||||
/* Get handler/filter object for given mime type: */
|
||||
Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||
RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||
bool filtertypes)
|
||||
{
|
||||
LOGDEB(("getMimeHandler: mtype [%s] filtertypes %d\n",
|
||||
mtype.c_str(), filtertypes));
|
||||
Dijon::Filter *h = 0;
|
||||
RecollFilter *h = 0;
|
||||
|
||||
// Get handler definition for mime type. We do this even if an
|
||||
// appropriate handler object may be in the cache (indexed by mime
|
||||
// type). This is fast, and necessary to conform to the
|
||||
// appropriate handler object may be in the cache.
|
||||
// This is fast, and necessary to conform to the
|
||||
// configuration, (ie: text/html might be filtered out by
|
||||
// indexedmimetypes but an html handler could still be in the
|
||||
// cache because it was needed by some other interning stack).
|
||||
string hs;
|
||||
hs = cfg->getMimeHandlerDef(mtype, filtertypes);
|
||||
string id;
|
||||
|
||||
if (!hs.empty()) { // Got a handler definition line
|
||||
|
||||
// Do we already have a handler object in the cache ?
|
||||
h = getMimeHandlerFromCache(mtype);
|
||||
if (h != 0)
|
||||
goto out;
|
||||
LOGDEB2(("getMimeHandler: %s not in cache\n", mtype.c_str()));
|
||||
|
||||
// Not in cache. Break definition into type and name/command
|
||||
// string and instantiate handler object
|
||||
if (!hs.empty()) {
|
||||
// Got a handler definition line
|
||||
// Break definition into type (internal/exec/execm)
|
||||
// and name/command string
|
||||
string::size_type b1 = hs.find_first_of(" \t");
|
||||
string handlertype = hs.substr(0, b1);
|
||||
string cmdstr;
|
||||
@ -261,7 +270,30 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||
cmdstr = hs.substr(b1);
|
||||
trimstring(cmdstr);
|
||||
}
|
||||
if (!stringlowercmp("internal", handlertype)) {
|
||||
bool internal = !stringlowercmp("internal", handlertype);
|
||||
if (internal) {
|
||||
// For internal types let the factory compute the id
|
||||
mhFactory(cfg, cmdstr.empty() ? mtype : cmdstr, true, id);
|
||||
} else {
|
||||
// exec/execm: use the md5 of the def line
|
||||
MD5String(hs, id);
|
||||
}
|
||||
|
||||
#if 0
|
||||
{ // string xdigest; LOGDEB2(("getMimeHandler: [%s] hs [%s] id [%s]\n",
|
||||
//mtype.c_str(), hs.c_str(), MD5HexPrint(id, xdigest).c_str()));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Do we already have a handler object in the cache ?
|
||||
h = getMimeHandlerFromCache(id);
|
||||
if (h != 0)
|
||||
goto out;
|
||||
|
||||
LOGDEB2(("getMimeHandler: %s not in cache\n", mtype.c_str()));
|
||||
|
||||
// Not in cache.
|
||||
if (internal) {
|
||||
// If there is a parameter after "internal" it's the mime
|
||||
// type to use. This is so that we can have bogus mime
|
||||
// types like text/x-purple-html-log (for ie: specific
|
||||
@ -270,14 +302,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||
// better and the latter will probably go away at some
|
||||
// point in the future.
|
||||
LOGDEB2(("handlertype internal, cmdstr [%s]\n", cmdstr.c_str()));
|
||||
if (!cmdstr.empty()) {
|
||||
// Have to redo the cache thing. Maybe we should
|
||||
// rather just recurse instead ?
|
||||
if ((h = getMimeHandlerFromCache(cmdstr)) == 0)
|
||||
h = mhFactory(cfg, cmdstr);
|
||||
} else {
|
||||
h = mhFactory(cfg, mtype);
|
||||
}
|
||||
h = mhFactory(cfg, cmdstr.empty() ? mtype : cmdstr, false, id);
|
||||
goto out;
|
||||
} else if (!stringlowercmp("dll", handlertype)) {
|
||||
} else {
|
||||
@ -287,10 +312,10 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||
goto out;
|
||||
}
|
||||
if (!stringlowercmp("exec", handlertype)) {
|
||||
h = mhExecFactory(cfg, mtype, cmdstr, false);
|
||||
h = mhExecFactory(cfg, mtype, cmdstr, false, id);
|
||||
goto out;
|
||||
} else if (!stringlowercmp("execm", handlertype)) {
|
||||
h = mhExecFactory(cfg, mtype, cmdstr, true);
|
||||
h = mhExecFactory(cfg, mtype, cmdstr, true, id);
|
||||
goto out;
|
||||
} else {
|
||||
LOGERR(("getMimeHandler: bad line for %s: %s\n",
|
||||
@ -305,20 +330,20 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||
|
||||
// Finally, unhandled files are either ignored or their name and
|
||||
// generic metadata is indexed, depending on configuration
|
||||
{bool indexunknown = false;
|
||||
{
|
||||
bool indexunknown = false;
|
||||
cfg->getConfParam("indexallfilenames", &indexunknown);
|
||||
if (indexunknown) {
|
||||
if ((h = getMimeHandlerFromCache("application/octet-stream")) == 0)
|
||||
h = new MimeHandlerUnknown(cfg, "application/octet-stream");
|
||||
goto out;
|
||||
} else {
|
||||
goto out;
|
||||
MD5String("MimeHandlerUnknown", id);
|
||||
if ((h = getMimeHandlerFromCache(id)) == 0)
|
||||
h = new MimeHandlerUnknown(cfg, id);
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
if (h) {
|
||||
h->set_property(Dijon::Filter::DEFAULT_CHARSET, cfg->getDefCharset());
|
||||
h->set_property(RecollFilter::DEFAULT_CHARSET, cfg->getDefCharset());
|
||||
// In multithread context, and in case this handler is out
|
||||
// from the cache, it may have a config pointer belonging to
|
||||
// another thread. Fix it.
|
||||
|
||||
@ -21,26 +21,23 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
using std::string;
|
||||
using std::list;
|
||||
|
||||
#include <Filter.h>
|
||||
#include "Filter.h"
|
||||
#include "cstr.h"
|
||||
|
||||
class RclConfig;
|
||||
|
||||
class RecollFilter : public Dijon::Filter {
|
||||
public:
|
||||
RecollFilter(RclConfig *config, const string& mtype)
|
||||
: Dijon::Filter(mtype), m_config(config),
|
||||
m_forPreview(false), m_havedoc(false)
|
||||
RecollFilter(RclConfig *config, const std::string& id)
|
||||
: m_config(config), m_forPreview(false), m_havedoc(false), m_id(id)
|
||||
{}
|
||||
virtual ~RecollFilter() {}
|
||||
virtual void setConfig(RclConfig *config)
|
||||
{
|
||||
m_config = config;
|
||||
}
|
||||
virtual bool set_property(Properties p, const string &v) {
|
||||
virtual bool set_property(Properties p, const std::string &v) {
|
||||
switch (p) {
|
||||
case DJF_UDI:
|
||||
m_udi = v;
|
||||
@ -59,7 +56,12 @@ public:
|
||||
}
|
||||
|
||||
// We don't use this for now
|
||||
virtual bool set_document_uri(const std::string &) {return false;}
|
||||
virtual bool set_document_uri(const std::string& mtype,
|
||||
const std::string &)
|
||||
{
|
||||
m_mimeType = mtype;
|
||||
return false;
|
||||
}
|
||||
|
||||
// This does nothing right now but should be called from the
|
||||
// subclass method in case we need some common processing one day
|
||||
@ -69,12 +71,24 @@ public:
|
||||
// having a pure virtual called from here and implemented in the
|
||||
// subclass) would have to be repeated in each derived class. It's
|
||||
// just simpler this way.
|
||||
virtual bool set_document_file(const string & /*file_path*/) {return true;}
|
||||
virtual bool set_document_file(const std::string& mtype,
|
||||
const std::string & /*file_path*/)
|
||||
{
|
||||
m_mimeType = mtype;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Default implementations
|
||||
virtual bool set_document_string(const std::string &) {return false;}
|
||||
virtual bool set_document_data(const char *cp, unsigned int sz) {
|
||||
return set_document_string(string(cp, sz));
|
||||
virtual bool set_document_string(const std::string& mtype,
|
||||
const std::string &)
|
||||
{
|
||||
m_mimeType = mtype;
|
||||
return false;
|
||||
}
|
||||
virtual bool set_document_data(const std::string& mtype,
|
||||
const char *cp, unsigned int sz)
|
||||
{
|
||||
return set_document_string(mtype, std::string(cp, sz));
|
||||
}
|
||||
|
||||
virtual void set_docsize(size_t size)
|
||||
@ -87,7 +101,7 @@ public:
|
||||
virtual bool has_documents() const {return m_havedoc;}
|
||||
|
||||
// Most doc types are single-doc
|
||||
virtual bool skip_to_document(const string& s) {
|
||||
virtual bool skip_to_document(const std::string& s) {
|
||||
if (s.empty())
|
||||
return true;
|
||||
return false;
|
||||
@ -99,10 +113,15 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual string get_error() const {
|
||||
virtual std::string get_error() const {
|
||||
return m_reason;
|
||||
}
|
||||
|
||||
virtual const std::string& get_id() const
|
||||
{
|
||||
return m_id;
|
||||
}
|
||||
|
||||
// "Call super" anti-pattern again. Must be called from derived
|
||||
// classes which reimplement clear()
|
||||
virtual void clear() {
|
||||
@ -114,17 +133,20 @@ public:
|
||||
|
||||
// This only makes sense if the contents are currently txt/plain
|
||||
// It converts from keyorigcharset to UTF-8 and sets keycharset.
|
||||
bool txtdcode(const string& who);
|
||||
bool txtdcode(const std::string& who);
|
||||
|
||||
protected:
|
||||
bool preview() {return m_forPreview;}
|
||||
|
||||
RclConfig *m_config;
|
||||
bool m_forPreview;
|
||||
string m_dfltInputCharset;
|
||||
string m_reason;
|
||||
std::string m_dfltInputCharset;
|
||||
std::string m_reason;
|
||||
bool m_havedoc;
|
||||
string m_udi; // May be set by creator as a hint
|
||||
std::string m_udi; // May be set by creator as a hint
|
||||
// m_id is an md5 of the filter definition line (from mimeconf) and
|
||||
// is used when fetching/returning filters to / from the cache.
|
||||
std::string m_id;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -135,11 +157,11 @@ protected:
|
||||
* @param filtertypes decide if we should restrict to types in
|
||||
* indexedmimetypes (if this is set at all).
|
||||
*/
|
||||
extern Dijon::Filter *getMimeHandler(const std::string &mtyp, RclConfig *cfg,
|
||||
extern RecollFilter *getMimeHandler(const std::string &mtyp, RclConfig *cfg,
|
||||
bool filtertypes=false);
|
||||
|
||||
/// Free up filter for reuse (you can also delete it)
|
||||
extern void returnMimeHandler(Dijon::Filter *);
|
||||
extern void returnMimeHandler(RecollFilter *);
|
||||
|
||||
/// Clean up cache at the end of an indexing pass. For people who use
|
||||
/// the GUI to index: avoid all those filter processes forever hanging
|
||||
|
||||
@ -62,10 +62,6 @@ using std::pair;
|
||||
#include "docseqhist.h"
|
||||
#include "rclhelp.h"
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(A,B) ((A)<(B)?(A):(B))
|
||||
#endif
|
||||
|
||||
// Subclass plainToRich to add <termtag>s and anchors to the preview text
|
||||
class PlainToRichQtPreview : public PlainToRich {
|
||||
public:
|
||||
|
||||
@ -24,8 +24,10 @@
|
||||
#include <time.h>
|
||||
|
||||
#include <sstream>
|
||||
#include <list>
|
||||
using std::ostringstream;
|
||||
using std::endl;
|
||||
using std::list;
|
||||
|
||||
#include "cstr.h"
|
||||
#include "reslistpager.h"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user