diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 7bbb9e67..e4d848b5 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.68 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.69 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -455,20 +455,17 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, // The not so nice point was that the file name was not // indexed. // - // We now index at least the file name. We use a dirty - // hack to ensure that the indexing will be retried each - // time: the stored number as decimal ascii mtime is - // prefixed with a '+', which doesnt change its value for - // atoll() but is tested by rcldb::needUpdate() - // Reset the date as set by the handler if any + // We now index at least the file name and the mod time. + // We change the signature to ensure that the indexing will + // be retried every time. This can make indexing passes much + // slower if there are many files of types with no helper doc.fmtime.erase(); // Go through: } if (doc.fmtime.empty()) { // Set the date if this was not done in the document handler - doc.fmtime = (fis == FileInterner::FIError) ? plus + ascdate : - ascdate; + doc.fmtime = ascdate; } // Internal access path for multi-document files @@ -492,6 +489,14 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, // need for reversible formatting sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime); doc.sig = cbuf; + // If there was an error, ensure indexing will be + // retried. This is for the once missing, later installed + // filter case.
It can make indexing much slower (if there are + // myriads of such files, the ext script is executed for them + // and fails every time) + if (fis == FileInterner::FIError) { + doc.sig += plus; + } // Add document to database. If there is an ipath, add it as a children // of the file document. diff --git a/src/internfile/Filter.h b/src/internfile/Filter.h index 0593d6ce..846eff7d 100644 --- a/src/internfile/Filter.h +++ b/src/internfile/Filter.h @@ -157,11 +157,13 @@ namespace Dijon * that the client application can pass the nested document's content * to another filter that supports this particular type. */ - const std::map &get_meta_data(void) const + virtual const std::map &get_meta_data(void) const { return m_metaData; } + virtual void clear() {m_metaData.clear();} + protected: /// The MIME type handled by the filter. std::string m_mimeType; diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 9e3829b5..b5f92ed6 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.43 2008-10-03 06:23:23 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.44 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -203,8 +203,9 @@ FileInterner::~FileInterner() { tmpcleanup(); for (vector::iterator it = m_handlers.begin(); - it != m_handlers.end(); it++) - delete *it; + it != m_handlers.end(); it++) { + returnMimeHandler(*it); + } // m_tempfiles will take care of itself } @@ -283,8 +284,10 @@ static inline bool getKeyValue(const map& docdata, it = docdata.find(key); if (it != docdata.end()) { value = it->second; + LOGDEB2(("getKeyValue: [%s]->[%s]\n", key.c_str(), value.c_str())); return true; } + LOGDEB2(("getKeyValue: no value for [%s]\n", key.c_str())); return false; } @@ -314,7 +317,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& 
doc) } else if (it->first == Rcl::Doc::keyoc) { doc.origcharset = it->second; } else if (it->first == keymt || it->first == keycs) { - // don't need these. + // don't need/want these. } else { doc.meta[it->first] = it->second; } @@ -338,7 +341,6 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc, string& ipath) const // If there is no ipath stack, the mimetype is the one from the file doc.mimetype = m_mimetype; - LOGDEB2(("INITIAL mimetype: %s\n", doc.mimetype.c_str())); string ipathel; for (vector::const_iterator hit = m_handlers.begin(); @@ -382,7 +384,7 @@ void FileInterner::popHandler() m_tempfiles.pop_back(); m_tmpflgs[i] = false; } - delete m_handlers.back(); + returnMimeHandler(m_handlers.back()); m_handlers.pop_back(); } @@ -430,8 +432,8 @@ int FileInterner::addHandler() m_forPreview ? "view" : "index"); newflt->set_property(Dijon::Filter::DEFAULT_CHARSET, charset); - // Get content: we don't use getkeyvalue() here to avoid copying - // the text, which may be big. + // Get current content: we don't use getkeyvalue() here to avoid + // copying the text, which may be big. string ns; const string *txt = &ns; { @@ -469,9 +471,8 @@ int FileInterner::addHandler() } // Information and debug after a next_document error -void FileInterner::processNextDocError() +void FileInterner::processNextDocError(Rcl::Doc &doc, string& ipath) { - Rcl::Doc doc; string ipath; collectIpathAndMT(doc, ipath); m_reason = m_handlers.back()->get_error(); checkExternalMissing(m_reason); @@ -530,7 +531,7 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath) // might be ie an error while decoding an attachment, but we // still want to process the rest of the mbox! For preview: fatal. if (!m_handlers.back()->next_document()) { - processNextDocError(); // Debug etc. 
+ processNextDocError(doc, ipath); if (m_forPreview) return FIError; popHandler(); diff --git a/src/internfile/internfile.h b/src/internfile/internfile.h index ad7e9a5f..56836c8b 100644 --- a/src/internfile/internfile.h +++ b/src/internfile/internfile.h @@ -16,7 +16,7 @@ */ #ifndef _INTERNFILE_H_INCLUDED_ #define _INTERNFILE_H_INCLUDED_ -/* @(#$Id: internfile.h,v 1.19 2008-10-03 06:23:23 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: internfile.h,v 1.20 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -144,7 +144,7 @@ class FileInterner { void popHandler(); int addHandler(); void checkExternalMissing(const string& msg); - void processNextDocError(); + void processNextDocError(Rcl::Doc &doc, string& ipath); }; diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index 6e36e38c..d408ac81 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.11 2008-10-02 13:30:32 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_exec.cpp,v 1.12 2008-10-04 14:26:59 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -35,8 +35,8 @@ public: } }; -// Execute an external program to translate a file from its native format -// to html. Then call the html parser to do the actual indexing +// Execute an external program to translate a file from its native +// format to text or html. bool MimeHandlerExec::next_document() { if (m_havedoc == false) @@ -59,29 +59,28 @@ bool MimeHandlerExec::next_document() if (!m_ipath.empty()) myparams.push_back(m_ipath); - // Execute command and store the result text, which is supposedly html - string& html = m_metaData["content"]; - html.erase(); + // Execute command and store the result text + string& output = m_metaData["content"]; + output.erase(); ExecCmd mexec; MEAdv adv; mexec.setAdvise(&adv); mexec.putenv(m_forPreview ?
"RECOLL_FILTER_FORPREVIEW=yes" : "RECOLL_FILTER_FORPREVIEW=no"); - int status = mexec.doexec(cmd, myparams, 0, &html); + int status = mexec.doexec(cmd, myparams, 0, &output); if (status) { LOGERR(("MimeHandlerExec: command status 0x%x: %s\n", status, cmd.c_str())); // If the output string begins with RECFILTERROR, then it's // interpretable error information - if (html.find("RECFILTERROR") == 0) - m_reason = html; + if (output.find("RECFILTERROR") == 0) + m_reason = output; return false; } - m_metaData["origcharset"] = m_defcharset; // Default charset: all recoll filters output utf-8, but this // could still be overridden by the content-type meta tag. - m_metaData["charset"] = "utf-8"; - m_metaData["mimetype"] = "text/html"; + m_metaData["charset"] = cfgCharset.empty() ? "utf-8" : cfgCharset; + m_metaData["mimetype"] = cfgMtype.empty() ? "text/html" : cfgMtype; return true; } diff --git a/src/internfile/mh_exec.h b/src/internfile/mh_exec.h index a0021b1d..6263f563 100644 --- a/src/internfile/mh_exec.h +++ b/src/internfile/mh_exec.h @@ -16,7 +16,7 @@ */ #ifndef _MH_EXEC_H_INCLUDED_ #define _MH_EXEC_H_INCLUDED_ -/* @(#$Id: mh_exec.h,v 1.6 2008-10-02 13:30:32 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_exec.h,v 1.7 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -33,7 +33,16 @@ using std::string; */ class MimeHandlerExec : public RecollFilter { public: + // params, cfgMtype and cfgCharset do not get reset by + // clear(). They define what I am list params; + // The default for external filters is to output html except if defined + // otherwise in the config. + string cfgMtype; + // For ext programs which don't output html, the output charset + // has to be known: ie they have a --charset utf-8 like option.
+ string cfgCharset; + MimeHandlerExec(const string& mt) : RecollFilter(mt) {} virtual ~MimeHandlerExec() {} virtual bool set_document_file(const string &file_path) { @@ -46,6 +55,12 @@ class MimeHandlerExec : public RecollFilter { m_ipath = ipath; return true; } + virtual void clear() { + m_fn.erase(); + m_ipath.erase(); + RecollFilter::clear(); + } + private: string m_fn; string m_ipath; diff --git a/src/internfile/mh_html.h b/src/internfile/mh_html.h index ea899b30..3182a8ec 100644 --- a/src/internfile/mh_html.h +++ b/src/internfile/mh_html.h @@ -16,7 +16,7 @@ */ #ifndef _HTML_H_INCLUDED_ #define _HTML_H_INCLUDED_ -/* @(#$Id: mh_html.h,v 1.11 2008-10-03 06:17:46 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_html.h,v 1.12 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -41,7 +41,11 @@ class MimeHandlerHtml : public RecollFilter { { return m_html; } - + virtual void clear() { + m_filename.erase(); + m_html.erase(); + RecollFilter::clear(); + } private: string m_filename; string m_html; diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index d7ba007d..89dad472 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.34 2008-09-16 08:13:45 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.35 2008-10-04 14:26:59 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -54,14 +54,24 @@ static const string cstr_title = "title"; MimeHandlerMail::~MimeHandlerMail() { - delete m_bincdoc; - if (m_fd >= 0) + clear(); +} +void MimeHandlerMail::clear() +{ + delete m_bincdoc; m_bincdoc = 0; + if (m_fd >= 0) { close(m_fd); - delete m_stream; + m_fd = -1; + } + delete m_stream; m_stream = 0; + m_idx = -1; + m_subject.erase(); for (vector::iterator it = m_attachments.begin(); it != m_attachments.end(); it++) { delete *it; } + m_attachments.clear(); + 
RecollFilter::clear(); } bool MimeHandlerMail::set_document_file(const string &fn) diff --git a/src/internfile/mh_mail.h b/src/internfile/mh_mail.h index 311e4949..bfb757b7 100644 --- a/src/internfile/mh_mail.h +++ b/src/internfile/mh_mail.h @@ -16,7 +16,7 @@ */ #ifndef _MAIL_H_INCLUDED_ #define _MAIL_H_INCLUDED_ -/* @(#$Id: mh_mail.h,v 1.12 2007-10-17 11:40:35 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_mail.h,v 1.13 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -51,6 +51,7 @@ class MimeHandlerMail : public RecollFilter { } virtual bool next_document(); virtual bool skip_to_document(const string& ipath); + virtual void clear(); private: bool processMsg(Binc::MimePart *doc, int depth); diff --git a/src/internfile/mh_mbox.cpp b/src/internfile/mh_mbox.cpp index 6193baaa..f238be5b 100644 --- a/src/internfile/mh_mbox.cpp +++ b/src/internfile/mh_mbox.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.4 2008-08-29 13:05:12 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.5 2008-10-04 14:26:59 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -39,10 +39,19 @@ using namespace std; MimeHandlerMbox::~MimeHandlerMbox() { + clear(); +} + +void MimeHandlerMbox::clear() +{ + m_fn.erase(); if (m_vfp) { fclose((FILE *)m_vfp); m_vfp = 0; } + m_msgnum = m_lineno = 0; + m_ipath.erase(); + RecollFilter::clear(); } bool MimeHandlerMbox::set_document_file(const string &fn) diff --git a/src/internfile/mh_mbox.h b/src/internfile/mh_mbox.h index a5e8c719..72cd4940 100644 --- a/src/internfile/mh_mbox.h +++ b/src/internfile/mh_mbox.h @@ -16,7 +16,7 @@ */ #ifndef _MBOX_H_INCLUDED_ #define _MBOX_H_INCLUDED_ -/* @(#$Id: mh_mbox.h,v 1.2 2007-10-03 14:53:37 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_mbox.h,v 1.3 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */ #include using std::string; @@ -40,7 +40,7 @@ class 
MimeHandlerMbox : public RecollFilter { m_ipath = ipath; return true; } - + virtual void clear(); private: string m_fn; // File name void *m_vfp; // File pointer for folder diff --git a/src/internfile/mh_text.h b/src/internfile/mh_text.h index 7c7578a8..94b66bb1 100644 --- a/src/internfile/mh_text.h +++ b/src/internfile/mh_text.h @@ -16,7 +16,7 @@ */ #ifndef _MH_TEXT_H_INCLUDED_ #define _MH_TEXT_H_INCLUDED_ -/* @(#$Id: mh_text.h,v 1.4 2006-12-16 15:39:54 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_text.h,v 1.5 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */ #include using std::string; @@ -40,6 +40,11 @@ class MimeHandlerText : public RecollFilter { return false; } virtual bool next_document(); + virtual void clear() + { + m_text.erase(); + RecollFilter::clear(); + } private: string m_text; }; diff --git a/src/internfile/mh_unknown.h b/src/internfile/mh_unknown.h index 92c33576..8d0340c2 100644 --- a/src/internfile/mh_unknown.h +++ b/src/internfile/mh_unknown.h @@ -16,7 +16,7 @@ */ #ifndef _MH_UNKNOWN_H_INCLUDED_ #define _MH_UNKNOWN_H_INCLUDED_ -/* @(#$Id: mh_unknown.h,v 1.2 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_unknown.h,v 1.3 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -44,6 +44,9 @@ class MimeHandlerUnknown : public RecollFilter { m_metaData["mimetype"] = "text/plain"; return true; } + virtual void clear() { + RecollFilter::clear(); + } }; #endif /* _MH_UNKNOWN_H_INCLUDED_ */ diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index 4b284173..5fa22ad9 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.22 2007-11-16 14:28:52 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.23 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -34,7 
+34,10 @@ using namespace std; #include "mh_mbox.h" #include "mh_text.h" #include "mh_unknown.h" - + +// Pool of already known and created handlers +static map o_handlers; + /** Create internal handler object appropriate for given mime type */ static Dijon::Filter *mhFactory(const string &mime) { @@ -52,16 +55,103 @@ static Dijon::Filter *mhFactory(const string &mime) return new MimeHandlerUnknown(lmime); } -/* - * Return handler object for given mime type: +/** + * Create a filter that executes an external program or script + * A filter def can look like: + * exec someprog -v -t " h i j";charset= xx; mimetype=yy + * We don't support ';' inside a quoted string for now. Can't see a use + * for it */ +MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs) +{ + listsemicolist; + stringToTokens(hs, semicolist, ";"); + if (semicolist.empty()) { + LOGERR(("mhExecFactory: bad filter def: [%s]\n", hs.c_str())); + return 0; + } + string& cmd = *(semicolist.begin()); + + list toks; + stringToStrings(cmd, toks); + if (toks.size() < 2) { + LOGERR(("mhExecFactory: bad config line for [%s]: [%s]\n", + mtype.c_str(), hs.c_str())); + return 0; + } + + MimeHandlerExec *h = new MimeHandlerExec(mtype.c_str()); + + list::iterator it; + + // toks size is at least 2, this has been checked above.
+ it = toks.begin(); + it++; + h->params.push_back(cfg->findFilter(*it++)); + h->params.insert(h->params.end(), it, toks.end()); + + // Handle additional parameters + it = semicolist.begin(); + it++; + for (;it != semicolist.end(); it++) { + string &line = *it; + string::size_type eqpos = line.find("="); + if (eqpos == string::npos) + continue; + // Compute name and value, trim white space + string nm, val; + nm = line.substr(0, eqpos); + trimstring(nm); + val = line.substr(eqpos+1, string::npos); + trimstring(val); + if (!nm.compare("charset")) { + h->cfgCharset = val; + } else if (!nm.compare("mimetype")) { + h->cfgMtype = val; + } + } + +#if 0 + string sparams; + for (it = h->params.begin(); it != h->params.end(); it++) { + sparams += string("[") + *it + "] "; + } + LOGDEB(("mhExecFactory:mt [%s] cfgmt [%s] cfgcs [%s] params: [%s]\n", + mtype.c_str(), h->cfgMtype.c_str(), h->cfgCharset.c_str(), + sparams.c_str())); +#endif + + return h; +} + +/* Return mime handler to pool, freeing any other handler already pooled for its type */ +void returnMimeHandler(Dijon::Filter *handler) +{ + if (handler) { + handler->clear(); + Dijon::Filter *&slot = o_handlers[handler->get_mime_type()]; if (slot && slot != handler) delete slot; slot = handler; + } +} + +/* Get handler/filter object for given mime type: */ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg, bool filtertypes) { + if (mtype.empty()) + return 0; + + // Do we already have one ?
+ map::iterator it = o_handlers.find(mtype); + if (it != o_handlers.end()) { + Dijon::Filter *h = it->second; + o_handlers.erase(it); + LOGDEB2(("getMimeHandler: found in cache\n")); + return h; + } + // Get handler definition for mime type string hs; - if (!mtype.empty()) - hs = cfg->getMimeHandlerDef(mtype, filtertypes); + hs = cfg->getMimeHandlerDef(mtype, filtertypes); if (!hs.empty()) { // Break definition into type and name @@ -84,11 +174,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg, mtype.c_str(), hs.c_str())); return 0; } - MimeHandlerExec *h = new MimeHandlerExec(mtype.c_str()); - it++; - h->params.push_back(cfg->findFilter(*it++)); - h->params.insert(h->params.end(), it, toks.end()); - return h; + return mhExecFactory(cfg, mtype, hs); } } diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index 9674c670..10dd3db8 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -16,7 +16,7 @@ */ #ifndef _MIMEHANDLER_H_INCLUDED_ #define _MIMEHANDLER_H_INCLUDED_ -/* @(#$Id: mimehandler.h,v 1.15 2007-11-16 14:28:52 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mimehandler.h,v 1.16 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -76,6 +76,11 @@ public: return m_reason; } + virtual void clear() { + m_forPreview = m_havedoc = false; + Dijon::Filter::clear(); + } + protected: bool m_forPreview; string m_defcharset; @@ -92,9 +97,11 @@ protected: * indexedmimetypes (if this is set at all). */ extern Dijon::Filter *getMimeHandler(const std::string &mtyp, RclConfig *cfg, + bool filtertypes=false); +/// Free up filter for reuse (you can also delete it) +extern void returnMimeHandler(Dijon::Filter *); /// Can this mime type be interned ? extern bool canIntern(const std::string mimetype, RclConfig *cfg); - #endif /* _MIMEHANDLER_H_INCLUDED_ */