Indexing filters: somewhat clarified and unified some charset-related parameters

This commit is contained in:
Jean-Francois Dockes 2011-02-01 15:04:49 +01:00
parent ee64d0064a
commit 320a869d6e
10 changed files with 96 additions and 56 deletions

View File

@ -66,8 +66,8 @@ namespace Dijon
/** Input properties supported by the filter.
*
* - DEFAULT_CHARSET is the source encoding that should be used
* for transcoding to utf-8 if there is no other way to determine
* it (ie: for text/plain files)
* for reading/transcoding the original data if there is no
* other way to determine it (e.g. for text/plain files)
* - OPERATING_MODE can be set to either view or index.
* - DJF_UDI Unique document identifier. This can be useful if the
* filter wants to manage a persistent cache.

View File

@ -263,9 +263,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
}
df->set_property(Dijon::Filter::OPERATING_MODE,
m_forPreview ? "view" : "index");
string charset = m_cfg->getDefCharset();
df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
df->set_property(Dijon::Filter::DJF_UDI, udi);
#ifdef RCL_USE_XATTR
@ -315,9 +312,6 @@ void FileInterner::init(const string &data, RclConfig *cnf,
df->set_property(Dijon::Filter::OPERATING_MODE,
m_forPreview ? "view" : "index");
string charset = m_cfg->getDefCharset();
df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
bool setres = false;
if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
setres = df->set_document_string(data);
@ -650,8 +644,7 @@ enum addResols {ADD_OK, ADD_CONTINUE, ADD_BREAK, ADD_ERROR};
// and possibly add a filter/handler to the stack
int FileInterner::addHandler()
{
const map<string, string>& docdata =
m_handlers.back()->get_meta_data();
const map<string, string>& docdata = m_handlers.back()->get_meta_data();
string charset, mimetype;
getKeyValue(docdata, keycs, charset);
getKeyValue(docdata, keymt, mimetype);
@ -685,8 +678,9 @@ int FileInterner::addHandler()
return ADD_CONTINUE;
}
newflt->set_property(Dijon::Filter::OPERATING_MODE,
m_forPreview ? "view" : "index");
newflt->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
m_forPreview ? "view" : "index");
if (!charset.empty())
newflt->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
// Get current content: we don't use getkeyvalue() here to avoid
// copying the text, which may be big.

View File

@ -147,11 +147,17 @@ void MimeHandlerExec::finaldetails()
{
string& output = m_metaData["content"];
// if output is text/plain (not text/html), we may have to convert
// it to utf-8. cfgCharset comes from the mimeconf filter definition line
string charset = cfgCharset.empty() ? "utf-8" : cfgCharset;
string mt = cfgMtype.empty() ? "text/html" : cfgMtype;
if (!mt.compare("text/plain") && charset.compare("utf-8")) {
// If output is text/plain (not text/html), we may have to convert
// it to utf-8, because this is the last point where it can be done.
// cfgFilterOutputCharset comes from the mimeconf filter definition line
string charset = cfgFilterOutputCharset.empty() ? "utf-8" :
cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) {
charset = m_dfltInputCharset;
}
string mt = cfgFilterOutputMtype.empty() ? "text/html" :
cfgFilterOutputMtype;
if (!mt.compare("text/plain") && stringlowercmp("utf-8", charset)) {
string transcoded;
int ecnt;
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
@ -163,12 +169,19 @@ void MimeHandlerExec::finaldetails()
ecnt, charset.c_str()));
}
output = transcoded;
charset = "utf-8";
}
}
// Success. Store some external metadata
m_metaData["origcharset"] = m_defcharset;
// Default charset: all recoll filters output utf-8, but this
// could still be overridden by the content-type meta tag for html
// Original charset. Can't be too sure about this actually. It's
// just a hint anyway
m_metaData["origcharset"] = m_dfltInputCharset;
// Supposed contents charset encoding. This could still be
// overridden by the content-type meta tag for html, but this is
// wasteful so we hope it's correct
m_metaData["charset"] = charset;
m_metaData["mimetype"] = mt;

View File

@ -38,7 +38,8 @@ using std::string;
class MimeHandlerExec : public RecollFilter {
public:
///////////////////////
// Members not reset by clear(). params, cfgMtype and chgCharset
// Members not reset by clear(). params, cfgFilterOutputMtype and
// cfgFilterOutputCharset
// define what I am. missingHelper is a permanent error
// (no use to try and execute over and over something that's not
// here).
@ -48,11 +49,11 @@ class MimeHandlerExec : public RecollFilter {
list<string> params;
// Filter output type. The default for ext. filters is to output html,
// but some don't, in which case the type is defined in the config.
string cfgMtype;
string cfgFilterOutputMtype;
// Output character set if the above type is not text/html. For
// those filters, the output charset has to be known: ie set by a command
// line option.
string cfgCharset;
string cfgFilterOutputCharset;
bool missingHelper;
////////////////

View File

@ -272,6 +272,18 @@ bool MimeHandlerExecMultiple::next_document()
}
}
// Charset. For many document types it doesn't matter. For text
// and html it does. We supply a default from the
// configuration. We may want to process a charset parameter in
// filter output one day. We should do the
// text transcoding to utf-8 here like exec::finaldetails does.
string charset = cfgFilterOutputCharset.empty() ? "utf-8" :
cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) {
charset = m_dfltInputCharset;
}
m_metaData["charset"] = charset;
if (eofnext_received)
m_havedoc = false;

View File

@ -73,11 +73,18 @@ bool MimeHandlerHtml::next_document()
string fn = m_filename;
m_filename.erase();
string charset = m_defcharset;
LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n",
string charset = m_dfltInputCharset;
LOGDEB(("MHHtml::next_doc.: default supposed input charset: [%s]\n",
charset.c_str()));
// Override default input charset if someone took care to set one:
map<string,string>::const_iterator it = m_metaData.find("charset");
if (it != m_metaData.end() && !it->second.empty()) {
charset = it->second;
LOGDEB(("MHHtml: next_doc.: input charset from metadata: [%s]\n",
charset.c_str()));
}
// - We first try to convert from the default configured charset
// - We first try to convert from the supposed charset
// (which may depend on the current directory) to utf-8. If this
// fails, we keep the original text
// - During parsing, if we find a charset parameter, and it differs from

View File

@ -116,14 +116,14 @@ bool MimeHandlerText::next_document()
// We transcode even if defcharset is already utf-8:
// this validates the encoding.
LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
m_defcharset.c_str()));
if (!transcode(m_text, m_metaData["content"], m_defcharset, "UTF-8")) {
m_dfltInputCharset.c_str()));
if (!transcode(m_text, m_metaData["content"], m_dfltInputCharset, "UTF-8")) {
LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
"for charset [%s]\n", m_defcharset.c_str()));
"for charset [%s]\n", m_dfltInputCharset.c_str()));
m_metaData["content"].erase();
return false;
}
m_metaData["origcharset"] = m_defcharset;
m_metaData["origcharset"] = m_dfltInputCharset;
m_metaData["charset"] = "utf-8";
m_metaData["mimetype"] = "text/plain";

View File

@ -119,9 +119,9 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
// with newlines and use a ConfSimple
string value;
if (attrs.get("charset", value))
h->cfgCharset = stringtolower((const string&)value);
h->cfgFilterOutputCharset = stringtolower((const string&)value);
if (attrs.get("mimetype", value))
h->cfgMtype = stringtolower((const string&)value);
h->cfgFilterOutputMtype = stringtolower((const string&)value);
#if 1
string scmd;
@ -129,7 +129,7 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
scmd += string("[") + *it + "] ";
}
LOGDEB(("mhExecFactory:mt [%s] cfgmt [%s] cfgcs [%s] cmd: [%s]\n",
mtype.c_str(), h->cfgMtype.c_str(), h->cfgCharset.c_str(),
mtype.c_str(), h->cfgFilterOutputMtype.c_str(), h->cfgFilterOutputCharset.c_str(),
scmd.c_str()));
#endif
@ -163,6 +163,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
{
LOGDEB2(("getMimeHandler: mtype [%s] filtertypes %d\n",
mtype.c_str(), filtertypes));
Dijon::Filter *h = 0;
// Get handler definition for mime type. We do this even if an
// appropriate handler object may be in the cache (indexed by mime
@ -178,10 +179,10 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
// Do we already have a handler object in the cache ?
map<string, Dijon::Filter *>::iterator it = o_handlers.find(mtype);
if (it != o_handlers.end()) {
Dijon::Filter *h = it->second;
h = it->second;
o_handlers.erase(it);
LOGDEB2(("getMimeHandler: found in cache\n"));
return h;
goto out;
}
// Not in cache. Break definition into type and name/command
@ -202,23 +203,26 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
// better and the latter will probably go away at some
// point in the future
if (!cmdstr.empty())
return mhFactory(cmdstr);
return mhFactory(mtype);
h = mhFactory(cmdstr);
h = mhFactory(mtype);
goto out;
} else if (!stringlowercmp("dll", handlertype)) {
} else {
if (cmdstr.empty()) {
LOGERR(("getMimeHandler: bad line for %s: %s\n",
mtype.c_str(), hs.c_str()));
return 0;
goto out;
}
if (!stringlowercmp("exec", handlertype)) {
return mhExecFactory(cfg, mtype, cmdstr, false);
h = mhExecFactory(cfg, mtype, cmdstr, false);
goto out;
} else if (!stringlowercmp("execm", handlertype)) {
return mhExecFactory(cfg, mtype, cmdstr, true);
h = mhExecFactory(cfg, mtype, cmdstr, true);
goto out;
} else {
LOGERR(("getMimeHandler: bad line for %s: %s\n",
mtype.c_str(), hs.c_str()));
return 0;
goto out;
}
}
}
@ -230,21 +234,30 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
// If the type is an unknown text/xxx, index as text/plain and
// hope for the best (this wouldn't work too well with text/rtf...)
if (mtype.find("text/") == 0) {
return mhFactory("text/plain");
h = mhFactory("text/plain");
goto out;
}
#endif
// Finally, unhandled files are either ignored or their name and
// generic metadata is indexed, depending on configuration
bool indexunknown = false;
cfg->getConfParam("indexallfilenames", &indexunknown);
if (indexunknown) {
return new MimeHandlerUnknown("application/octet-stream");
} else {
return 0;
{bool indexunknown = false;
cfg->getConfParam("indexallfilenames", &indexunknown);
if (indexunknown) {
h = new MimeHandlerUnknown("application/octet-stream");
goto out;
} else {
goto out;
}
}
}
out:
if (h) {
string charset = cfg->getDefCharset();
h->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
}
return h;
}
/// Can this mime type be interned (according to config) ?
bool canIntern(const std::string mtype, RclConfig *cfg)

View File

@ -40,7 +40,7 @@ public:
m_udi = v;
break;
case DEFAULT_CHARSET:
m_defcharset = v;
m_dfltInputCharset = v;
break;
case OPERATING_MODE:
if (!v.empty() && v[0] == 'v')
@ -88,13 +88,13 @@ public:
virtual void clear() {
Dijon::Filter::clear();
m_forPreview = m_havedoc = false;
m_defcharset.clear();
m_dfltInputCharset.clear();
m_reason.clear();
}
protected:
bool m_forPreview;
string m_defcharset;
string m_dfltInputCharset;
string m_reason;
bool m_havedoc;
string m_udi; // May be set by creator as a hint

View File

@ -65,19 +65,19 @@ application/vnd.sun.xml.writer.global = exec rclsoff
application/vnd.sun.xml.writer.template = exec rclsoff
application/vnd.wordperfect = exec wpd2html;mimetype=text/html
application/x-abiword = exec rclabw
application/x-awk = exec rcltext
application/x-awk = exec rcltext; charset=default
application/x-dvi = exec rcldvi
application/x-flac = execm rclaudio
application/x-gnuinfo = execm rclinfo
application/x-kword = exec rclkwd
application/x-lyx = exec rcllyx
application/x-perl = exec rcltext
application/x-perl = exec rcltext; charset=default
application/x-scribus = exec rclscribus
application/x-shellscript = exec rcltext
application/x-shellscript = exec rcltext; charset=default
application/x-tex = exec rcltex
text/x-tex = exec rcltex
application/x-chm = execm rclchm
application/zip = execm rclzip
application/zip = execm rclzip;charset=default
audio/mpeg = execm rclaudio
audio/x-karaoke = execm rclkar
image/gif = execm rclimg