From 320a869d6e19883bf13164ec3092c6d495635912 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 1 Feb 2011 15:04:49 +0100 Subject: [PATCH] Indexing filters: somewhat clarified and unified some charset-related parameters --- src/internfile/Filter.h | 4 +-- src/internfile/internfile.cpp | 14 +++------- src/internfile/mh_exec.cpp | 29 +++++++++++++------ src/internfile/mh_exec.h | 7 +++-- src/internfile/mh_execm.cpp | 12 ++++++++ src/internfile/mh_html.cpp | 13 +++++++-- src/internfile/mh_text.cpp | 8 +++--- src/internfile/mimehandler.cpp | 51 +++++++++++++++++++++------------- src/internfile/mimehandler.h | 6 ++-- src/sampleconf/mimeconf | 8 +++--- 10 files changed, 96 insertions(+), 56 deletions(-) diff --git a/src/internfile/Filter.h b/src/internfile/Filter.h index cdc358bb..13f35572 100644 --- a/src/internfile/Filter.h +++ b/src/internfile/Filter.h @@ -66,8 +66,8 @@ namespace Dijon /** Input properties supported by the filter. * * - DEFAULT_CHARSET is the source encoding that should be used - * for transcoding to utf-8 if there is no other way to determine - * it (ie: for text/plain files) + * for reading/transcoding the original data if there is no + * other way to determine it (ie: for text/plain files) * - OPERATING_MODE can be set to either view or index. * - DJF_UDI Unique document identifier. This can be useful if the * filter wants to manage a persistent cache. diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index e3c01eed..073947e8 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -263,9 +263,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, } df->set_property(Dijon::Filter::OPERATING_MODE, m_forPreview ? "view" : "index"); - - string charset = m_cfg->getDefCharset(); - df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset); df->set_property(Dijon::Filter::DJF_UDI, udi); #ifdef RCL_USE_XATTR @@ -315,9 +312,6 @@ void FileInterner::init(const string &data, RclConfig *cnf, df->set_property(Dijon::Filter::OPERATING_MODE, m_forPreview ? "view" : "index"); - string charset = m_cfg->getDefCharset(); - df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset); - bool setres = false; if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) { setres = df->set_document_string(data); @@ -650,8 +644,7 @@ enum addResols {ADD_OK, ADD_CONTINUE, ADD_BREAK, ADD_ERROR}; // and possibly add a filter/handler to the stack int FileInterner::addHandler() { - const map& docdata = - m_handlers.back()->get_meta_data(); + const map& docdata = m_handlers.back()->get_meta_data(); string charset, mimetype; getKeyValue(docdata, keycs, charset); getKeyValue(docdata, keymt, mimetype); @@ -685,8 +678,9 @@ int FileInterner::addHandler() return ADD_CONTINUE; } newflt->set_property(Dijon::Filter::OPERATING_MODE, - m_forPreview ? "view" : "index"); - newflt->set_property(Dijon::Filter::DEFAULT_CHARSET, charset); + m_forPreview ? "view" : "index"); + if (!charset.empty()) + newflt->set_property(Dijon::Filter::DEFAULT_CHARSET, charset); // Get current content: we don't use getkeyvalue() here to avoid // copying the text, which may be big. diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index 7e3379de..6c4ac942 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -147,11 +147,17 @@ void MimeHandlerExec::finaldetails() { string& output = m_metaData["content"]; - // if output is text/plain (not text/html), we may have to convert - // it to utf-8. cfgCharset comes from the mimeconf filter definition line - string charset = cfgCharset.empty() ? "utf-8" : cfgCharset; - string mt = cfgMtype.empty() ? "text/html" : cfgMtype; - if (!mt.compare("text/plain") && charset.compare("utf-8")) { + // If output is text/plain (not text/html), we may have to convert + // it to utf-8, because this is the last point where it can be done. + // cfgFilterOutputCharset comes from the mimeconf filter definition line + string charset = cfgFilterOutputCharset.empty() ? "utf-8" : + cfgFilterOutputCharset; + if (!stringlowercmp("default", charset)) { + charset = m_dfltInputCharset; + } + string mt = cfgFilterOutputMtype.empty() ? "text/html" : + cfgFilterOutputMtype; + if (!mt.compare("text/plain") && stringlowercmp("utf-8", charset)) { string transcoded; int ecnt; if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) { @@ -163,12 +169,19 @@ void MimeHandlerExec::finaldetails() ecnt, charset.c_str())); } output = transcoded; + charset = "utf-8"; } } + // Success. Store some external metadata - m_metaData["origcharset"] = m_defcharset; - // Default charset: all recoll filters output utf-8, but this - // could still be overridden by the content-type meta tag for html + + // Original charset. Can't be too sure about this actually. It's + // just a hint anyway + m_metaData["origcharset"] = m_dfltInputCharset; + + // Supposed contents charset encoding. This could still be + // overridden by the content-type meta tag for html, but this is + // wasteful so we hope it's correct m_metaData["charset"] = charset; m_metaData["mimetype"] = mt; diff --git a/src/internfile/mh_exec.h b/src/internfile/mh_exec.h index db798aa3..cc166029 100644 --- a/src/internfile/mh_exec.h +++ b/src/internfile/mh_exec.h @@ -38,7 +38,8 @@ using std::string; class MimeHandlerExec : public RecollFilter { public: /////////////////////// - // Members not reset by clear(). params, cfgMtype and chgCharset + // Members not reset by clear(). params, cfgFilterOutputMtype and + // cfgFilterOutputCharset // define what I am. missingHelper is a permanent error // (no use to try and execute over and over something that's not // here). @@ -48,11 +49,11 @@ class MimeHandlerExec : public RecollFilter { list params; // Filter output type. The default for ext. filters is to output html, // but some don't, in which case the type is defined in the config. - string cfgMtype; + string cfgFilterOutputMtype; // Output character set if the above type is not text/html. For // those filters, the output charset has to be known: ie set by a command // line option. - string cfgCharset; + string cfgFilterOutputCharset; bool missingHelper; //////////////// diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index c7bc4f57..15c8e9a8 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -272,6 +272,18 @@ bool MimeHandlerExecMultiple::next_document() } } + // Charset. For many document types it doesn't matter. For text + // and html it does. We supply a default from the + // configuration. We may want to process a charset parameter in + // filter output one day. We should do the + // text transcoding to utf-8 here like exec::finaldetails does. + string charset = cfgFilterOutputCharset.empty() ? "utf-8" : + cfgFilterOutputCharset; + if (!stringlowercmp("default", charset)) { + charset = m_dfltInputCharset; + } + m_metaData["charset"] = charset; + if (eofnext_received) m_havedoc = false; diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index 9799a18e..81c8ea9a 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -73,11 +73,18 @@ bool MimeHandlerHtml::next_document() string fn = m_filename; m_filename.erase(); - string charset = m_defcharset; - LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n", + string charset = m_dfltInputCharset; + LOGDEB(("MHHtml::next_doc.: default supposed input charset: [%s]\n", charset.c_str())); + // Override default input charset if someone took care to set one: + map::const_iterator it = m_metaData.find("charset"); + if (it != m_metaData.end() && !it->second.empty()) { + charset = it->second; + LOGDEB(("MHHtml: next_doc.: input charset from metadata: [%s]\n", + charset.c_str())); + } - // - We first try to convert from the default configured charset + // - We first try to convert from the supposed charset // (which may depend of the current directory) to utf-8. If this // fails, we keep the original text // - During parsing, if we find a charset parameter, and it differs from diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp index 5451a58b..cf7c7bf3 100644 --- a/src/internfile/mh_text.cpp +++ b/src/internfile/mh_text.cpp @@ -116,14 +116,14 @@ bool MimeHandlerText::next_document() // We transcode even if defcharset is already utf-8: // this validates the encoding. LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n", - m_defcharset.c_str())); - if (!transcode(m_text, m_metaData["content"], m_defcharset, "UTF-8")) { + m_dfltInputCharset.c_str())); + if (!transcode(m_text, m_metaData["content"], m_dfltInputCharset, "UTF-8")) { LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed " - "for charset [%s]\n", m_defcharset.c_str())); + "for charset [%s]\n", m_dfltInputCharset.c_str())); m_metaData["content"].erase(); return false; } - m_metaData["origcharset"] = m_defcharset; + m_metaData["origcharset"] = m_dfltInputCharset; m_metaData["charset"] = "utf-8"; m_metaData["mimetype"] = "text/plain"; diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index eea2113c..d6d47efc 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -119,9 +119,9 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs, // with newlines and use a ConfSimple string value; if (attrs.get("charset", value)) - h->cfgCharset = stringtolower((const string&)value); + h->cfgFilterOutputCharset = stringtolower((const string&)value); if (attrs.get("mimetype", value)) - h->cfgMtype = stringtolower((const string&)value); + h->cfgFilterOutputMtype = stringtolower((const string&)value); #if 1 string scmd; @@ -129,7 +129,7 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs, scmd += string("[") + *it + "] "; } LOGDEB(("mhExecFactory:mt [%s] cfgmt [%s] cfgcs [%s] cmd: [%s]\n", - mtype.c_str(), h->cfgMtype.c_str(), h->cfgCharset.c_str(), + mtype.c_str(), h->cfgFilterOutputMtype.c_str(), h->cfgFilterOutputCharset.c_str(), scmd.c_str())); #endif @@ -163,6 +163,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg, { LOGDEB2(("getMimeHandler: mtype [%s] filtertypes %d\n", mtype.c_str(), filtertypes)); + Dijon::Filter *h = 0; // Get handler definition for mime type. We do this even if an // appropriate handler object may be in the cache (indexed by mime @@ -178,10 +179,10 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg, // Do we already have a handler object in the cache ? map::iterator it = o_handlers.find(mtype); if (it != o_handlers.end()) { - Dijon::Filter *h = it->second; + h = it->second; o_handlers.erase(it); LOGDEB2(("getMimeHandler: found in cache\n")); - return h; + goto out; } // Not in cache. Break definition into type and name/command @@ -202,23 +203,26 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg, // better and the latter will probably go away at some // point in the future if (!cmdstr.empty()) - return mhFactory(cmdstr); - return mhFactory(mtype); + h = mhFactory(cmdstr); + h = mhFactory(mtype); + goto out; } else if (!stringlowercmp("dll", handlertype)) { } else { if (cmdstr.empty()) { LOGERR(("getMimeHandler: bad line for %s: %s\n", mtype.c_str(), hs.c_str())); - return 0; + goto out; } if (!stringlowercmp("exec", handlertype)) { - return mhExecFactory(cfg, mtype, cmdstr, false); + h = mhExecFactory(cfg, mtype, cmdstr, false); + goto out; } else if (!stringlowercmp("execm", handlertype)) { - return mhExecFactory(cfg, mtype, cmdstr, true); + h = mhExecFactory(cfg, mtype, cmdstr, true); + goto out; } else { LOGERR(("getMimeHandler: bad line for %s: %s\n", mtype.c_str(), hs.c_str())); - return 0; + goto out; } } } @@ -230,21 +234,30 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg, // If the type is an unknown text/xxx, index as text/plain and // hope for the best (this wouldn't work too well with text/rtf...) if (mtype.find("text/") == 0) { - return mhFactory("text/plain"); + h = mhFactory("text/plain"); + goto out; } #endif // Finally, unhandled files are either ignored or their name and // generic metadata is indexed, depending on configuration - bool indexunknown = false; - cfg->getConfParam("indexallfilenames", &indexunknown); - if (indexunknown) { - return new MimeHandlerUnknown("application/octet-stream"); - } else { - return 0; + {bool indexunknown = false; + cfg->getConfParam("indexallfilenames", &indexunknown); + if (indexunknown) { + h = new MimeHandlerUnknown("application/octet-stream"); + goto out; + } else { + goto out; + } } -} +out: + if (h) { + string charset = cfg->getDefCharset(); + h->set_property(Dijon::Filter::DEFAULT_CHARSET, charset); + } + return h; +} /// Can this mime type be interned (according to config) ? bool canIntern(const std::string mtype, RclConfig *cfg) diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index b8e9a167..5c7fa388 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -40,7 +40,7 @@ public: m_udi = v; break; case DEFAULT_CHARSET: - m_defcharset = v; + m_dfltInputCharset = v; break; case OPERATING_MODE: if (!v.empty() && v[0] == 'v') @@ -88,13 +88,13 @@ public: virtual void clear() { Dijon::Filter::clear(); m_forPreview = m_havedoc = false; - m_defcharset.clear(); + m_dfltInputCharset.clear(); m_reason.clear(); } protected: bool m_forPreview; - string m_defcharset; + string m_dfltInputCharset; string m_reason; bool m_havedoc; string m_udi; // May be set by creator as a hint diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index 386efd8e..08a27675 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -65,19 +65,19 @@ application/vnd.sun.xml.writer.global = exec rclsoff application/vnd.sun.xml.writer.template = exec rclsoff application/vnd.wordperfect = exec wpd2html;mimetype=text/html application/x-abiword = exec rclabw -application/x-awk = exec rcltext +application/x-awk = exec rcltext; charset=default application/x-dvi = exec rcldvi application/x-flac = execm rclaudio application/x-gnuinfo = execm rclinfo application/x-kword = exec rclkwd application/x-lyx = exec rcllyx -application/x-perl = exec rcltext +application/x-perl = exec rcltext; charset=default application/x-scribus = exec rclscribus -application/x-shellscript = exec rcltext +application/x-shellscript = exec rcltext; charset=default application/x-tex = exec rcltex text/x-tex = exec rcltex application/x-chm = execm rclchm -application/zip = execm rclzip +application/zip = execm rclzip;charset=default audio/mpeg = execm rclaudio audio/x-karaoke = execm rclkar image/gif = execm rclimg