Indexing filters: somewhat clarified and unified some charset-related parameters
This commit is contained in:
parent
ee64d0064a
commit
320a869d6e
@ -66,8 +66,8 @@ namespace Dijon
|
||||
/** Input properties supported by the filter.
|
||||
*
|
||||
* - DEFAULT_CHARSET is the source encoding that should be used
|
||||
* for transcoding to utf-8 if there is no other way to determine
|
||||
* it (ie: for text/plain files)
|
||||
* for reading/transcoding the original data if there is no
|
||||
* other way to determine it (e.g. for text/plain files)
|
||||
* - OPERATING_MODE can be set to either view or index.
|
||||
* - DJF_UDI Unique document identifier. This can be useful if the
|
||||
* filter wants to manage a persistent cache.
|
||||
|
||||
@ -263,9 +263,6 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
|
||||
}
|
||||
df->set_property(Dijon::Filter::OPERATING_MODE,
|
||||
m_forPreview ? "view" : "index");
|
||||
|
||||
string charset = m_cfg->getDefCharset();
|
||||
df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
||||
df->set_property(Dijon::Filter::DJF_UDI, udi);
|
||||
|
||||
#ifdef RCL_USE_XATTR
|
||||
@ -315,9 +312,6 @@ void FileInterner::init(const string &data, RclConfig *cnf,
|
||||
df->set_property(Dijon::Filter::OPERATING_MODE,
|
||||
m_forPreview ? "view" : "index");
|
||||
|
||||
string charset = m_cfg->getDefCharset();
|
||||
df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
||||
|
||||
bool setres = false;
|
||||
if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
|
||||
setres = df->set_document_string(data);
|
||||
@ -650,8 +644,7 @@ enum addResols {ADD_OK, ADD_CONTINUE, ADD_BREAK, ADD_ERROR};
|
||||
// and possibly add a filter/handler to the stack
|
||||
int FileInterner::addHandler()
|
||||
{
|
||||
const map<string, string>& docdata =
|
||||
m_handlers.back()->get_meta_data();
|
||||
const map<string, string>& docdata = m_handlers.back()->get_meta_data();
|
||||
string charset, mimetype;
|
||||
getKeyValue(docdata, keycs, charset);
|
||||
getKeyValue(docdata, keymt, mimetype);
|
||||
@ -685,8 +678,9 @@ int FileInterner::addHandler()
|
||||
return ADD_CONTINUE;
|
||||
}
|
||||
newflt->set_property(Dijon::Filter::OPERATING_MODE,
|
||||
m_forPreview ? "view" : "index");
|
||||
newflt->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
||||
m_forPreview ? "view" : "index");
|
||||
if (!charset.empty())
|
||||
newflt->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
||||
|
||||
// Get current content: we don't use getkeyvalue() here to avoid
|
||||
// copying the text, which may be big.
|
||||
|
||||
@ -147,11 +147,17 @@ void MimeHandlerExec::finaldetails()
|
||||
{
|
||||
string& output = m_metaData["content"];
|
||||
|
||||
// if output is text/plain (not text/html), we may have to convert
|
||||
// it to utf-8. cfgCharset comes from the mimeconf filter definition line
|
||||
string charset = cfgCharset.empty() ? "utf-8" : cfgCharset;
|
||||
string mt = cfgMtype.empty() ? "text/html" : cfgMtype;
|
||||
if (!mt.compare("text/plain") && charset.compare("utf-8")) {
|
||||
// If output is text/plain (not text/html), we may have to convert
|
||||
// it to utf-8, because this is the last point where it can be done.
|
||||
// cfgFilterOutputCharset comes from the mimeconf filter definition line
|
||||
string charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
||||
cfgFilterOutputCharset;
|
||||
if (!stringlowercmp("default", charset)) {
|
||||
charset = m_dfltInputCharset;
|
||||
}
|
||||
string mt = cfgFilterOutputMtype.empty() ? "text/html" :
|
||||
cfgFilterOutputMtype;
|
||||
if (!mt.compare("text/plain") && stringlowercmp("utf-8", charset)) {
|
||||
string transcoded;
|
||||
int ecnt;
|
||||
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
|
||||
@ -163,12 +169,19 @@ void MimeHandlerExec::finaldetails()
|
||||
ecnt, charset.c_str()));
|
||||
}
|
||||
output = transcoded;
|
||||
charset = "utf-8";
|
||||
}
|
||||
}
|
||||
|
||||
// Success. Store some external metadata
|
||||
m_metaData["origcharset"] = m_defcharset;
|
||||
// Default charset: all recoll filters output utf-8, but this
|
||||
// could still be overridden by the content-type meta tag for html
|
||||
|
||||
// Original charset. Can't be too sure about this actually. It's
|
||||
// just a hint anyway
|
||||
m_metaData["origcharset"] = m_dfltInputCharset;
|
||||
|
||||
// Supposed contents charset encoding. This could still be
|
||||
// overridden by the content-type meta tag for html, but this is
|
||||
// wasteful so we hope it's correct
|
||||
m_metaData["charset"] = charset;
|
||||
m_metaData["mimetype"] = mt;
|
||||
|
||||
|
||||
@ -38,7 +38,8 @@ using std::string;
|
||||
class MimeHandlerExec : public RecollFilter {
|
||||
public:
|
||||
///////////////////////
|
||||
// Members not reset by clear(). params, cfgMtype and chgCharset
|
||||
// Members not reset by clear(). params, cfgFilterOutputMtype and
|
||||
// cfgFilterOutputCharset
|
||||
// define what I am. missingHelper is a permanent error
|
||||
// (no use to try and execute over and over something that's not
|
||||
// here).
|
||||
@ -48,11 +49,11 @@ class MimeHandlerExec : public RecollFilter {
|
||||
list<string> params;
|
||||
// Filter output type. The default for ext. filters is to output html,
|
||||
// but some don't, in which case the type is defined in the config.
|
||||
string cfgMtype;
|
||||
string cfgFilterOutputMtype;
|
||||
// Output character set if the above type is not text/html. For
|
||||
// those filters, the output charset has to be known: ie set by a command
|
||||
// line option.
|
||||
string cfgCharset;
|
||||
string cfgFilterOutputCharset;
|
||||
bool missingHelper;
|
||||
////////////////
|
||||
|
||||
|
||||
@ -272,6 +272,18 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
}
|
||||
}
|
||||
|
||||
// Charset. For many document types it doesn't matter. For text
|
||||
// and html it does. We supply a default from the
|
||||
// configuration. We may want to process a charset parameter in
|
||||
// filter output one day. We should do the
|
||||
// text transcoding to utf-8 here like exec::finaldetails does.
|
||||
string charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
||||
cfgFilterOutputCharset;
|
||||
if (!stringlowercmp("default", charset)) {
|
||||
charset = m_dfltInputCharset;
|
||||
}
|
||||
m_metaData["charset"] = charset;
|
||||
|
||||
if (eofnext_received)
|
||||
m_havedoc = false;
|
||||
|
||||
|
||||
@ -73,11 +73,18 @@ bool MimeHandlerHtml::next_document()
|
||||
string fn = m_filename;
|
||||
m_filename.erase();
|
||||
|
||||
string charset = m_defcharset;
|
||||
LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n",
|
||||
string charset = m_dfltInputCharset;
|
||||
LOGDEB(("MHHtml::next_doc.: default supposed input charset: [%s]\n",
|
||||
charset.c_str()));
|
||||
// Override default input charset if someone took care to set one:
|
||||
map<string,string>::const_iterator it = m_metaData.find("charset");
|
||||
if (it != m_metaData.end() && !it->second.empty()) {
|
||||
charset = it->second;
|
||||
LOGDEB(("MHHtml: next_doc.: input charset from metadata: [%s]\n",
|
||||
charset.c_str()));
|
||||
}
|
||||
|
||||
// - We first try to convert from the default configured charset
|
||||
// - We first try to convert from the supposed charset
|
||||
// (which may depend on the current directory) to utf-8. If this
|
||||
// fails, we keep the original text
|
||||
// - During parsing, if we find a charset parameter, and it differs from
|
||||
|
||||
@ -116,14 +116,14 @@ bool MimeHandlerText::next_document()
|
||||
// We transcode even if the default input charset is already utf-8:
|
||||
// this validates the encoding.
|
||||
LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
|
||||
m_defcharset.c_str()));
|
||||
if (!transcode(m_text, m_metaData["content"], m_defcharset, "UTF-8")) {
|
||||
m_dfltInputCharset.c_str()));
|
||||
if (!transcode(m_text, m_metaData["content"], m_dfltInputCharset, "UTF-8")) {
|
||||
LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
|
||||
"for charset [%s]\n", m_defcharset.c_str()));
|
||||
"for charset [%s]\n", m_dfltInputCharset.c_str()));
|
||||
m_metaData["content"].erase();
|
||||
return false;
|
||||
}
|
||||
m_metaData["origcharset"] = m_defcharset;
|
||||
m_metaData["origcharset"] = m_dfltInputCharset;
|
||||
m_metaData["charset"] = "utf-8";
|
||||
m_metaData["mimetype"] = "text/plain";
|
||||
|
||||
|
||||
@ -119,9 +119,9 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
|
||||
// with newlines and use a ConfSimple
|
||||
string value;
|
||||
if (attrs.get("charset", value))
|
||||
h->cfgCharset = stringtolower((const string&)value);
|
||||
h->cfgFilterOutputCharset = stringtolower((const string&)value);
|
||||
if (attrs.get("mimetype", value))
|
||||
h->cfgMtype = stringtolower((const string&)value);
|
||||
h->cfgFilterOutputMtype = stringtolower((const string&)value);
|
||||
|
||||
#if 1
|
||||
string scmd;
|
||||
@ -129,7 +129,7 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
|
||||
scmd += string("[") + *it + "] ";
|
||||
}
|
||||
LOGDEB(("mhExecFactory:mt [%s] cfgmt [%s] cfgcs [%s] cmd: [%s]\n",
|
||||
mtype.c_str(), h->cfgMtype.c_str(), h->cfgCharset.c_str(),
|
||||
mtype.c_str(), h->cfgFilterOutputMtype.c_str(), h->cfgFilterOutputCharset.c_str(),
|
||||
scmd.c_str()));
|
||||
#endif
|
||||
|
||||
@ -163,6 +163,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||
{
|
||||
LOGDEB2(("getMimeHandler: mtype [%s] filtertypes %d\n",
|
||||
mtype.c_str(), filtertypes));
|
||||
Dijon::Filter *h = 0;
|
||||
|
||||
// Get handler definition for mime type. We do this even if an
|
||||
// appropriate handler object may be in the cache (indexed by mime
|
||||
@ -178,10 +179,10 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||
// Do we already have a handler object in the cache ?
|
||||
map<string, Dijon::Filter *>::iterator it = o_handlers.find(mtype);
|
||||
if (it != o_handlers.end()) {
|
||||
Dijon::Filter *h = it->second;
|
||||
h = it->second;
|
||||
o_handlers.erase(it);
|
||||
LOGDEB2(("getMimeHandler: found in cache\n"));
|
||||
return h;
|
||||
goto out;
|
||||
}
|
||||
|
||||
// Not in cache. Break definition into type and name/command
|
||||
@ -202,23 +203,26 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||
// better and the latter will probably go away at some
|
||||
// point in the future
|
||||
if (!cmdstr.empty())
|
||||
return mhFactory(cmdstr);
|
||||
return mhFactory(mtype);
|
||||
h = mhFactory(cmdstr);
|
||||
h = mhFactory(mtype);
|
||||
goto out;
|
||||
} else if (!stringlowercmp("dll", handlertype)) {
|
||||
} else {
|
||||
if (cmdstr.empty()) {
|
||||
LOGERR(("getMimeHandler: bad line for %s: %s\n",
|
||||
mtype.c_str(), hs.c_str()));
|
||||
return 0;
|
||||
goto out;
|
||||
}
|
||||
if (!stringlowercmp("exec", handlertype)) {
|
||||
return mhExecFactory(cfg, mtype, cmdstr, false);
|
||||
h = mhExecFactory(cfg, mtype, cmdstr, false);
|
||||
goto out;
|
||||
} else if (!stringlowercmp("execm", handlertype)) {
|
||||
return mhExecFactory(cfg, mtype, cmdstr, true);
|
||||
h = mhExecFactory(cfg, mtype, cmdstr, true);
|
||||
goto out;
|
||||
} else {
|
||||
LOGERR(("getMimeHandler: bad line for %s: %s\n",
|
||||
mtype.c_str(), hs.c_str()));
|
||||
return 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -230,21 +234,30 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
|
||||
// If the type is an unknown text/xxx, index as text/plain and
|
||||
// hope for the best (this wouldn't work too well with text/rtf...)
|
||||
if (mtype.find("text/") == 0) {
|
||||
return mhFactory("text/plain");
|
||||
h = mhFactory("text/plain");
|
||||
goto out;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Finally, unhandled files are either ignored or their name and
|
||||
// generic metadata is indexed, depending on configuration
|
||||
bool indexunknown = false;
|
||||
cfg->getConfParam("indexallfilenames", &indexunknown);
|
||||
if (indexunknown) {
|
||||
return new MimeHandlerUnknown("application/octet-stream");
|
||||
} else {
|
||||
return 0;
|
||||
{bool indexunknown = false;
|
||||
cfg->getConfParam("indexallfilenames", &indexunknown);
|
||||
if (indexunknown) {
|
||||
h = new MimeHandlerUnknown("application/octet-stream");
|
||||
goto out;
|
||||
} else {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (h) {
|
||||
string charset = cfg->getDefCharset();
|
||||
h->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
/// Can this mime type be interned (according to config) ?
|
||||
bool canIntern(const std::string mtype, RclConfig *cfg)
|
||||
|
||||
@ -40,7 +40,7 @@ public:
|
||||
m_udi = v;
|
||||
break;
|
||||
case DEFAULT_CHARSET:
|
||||
m_defcharset = v;
|
||||
m_dfltInputCharset = v;
|
||||
break;
|
||||
case OPERATING_MODE:
|
||||
if (!v.empty() && v[0] == 'v')
|
||||
@ -88,13 +88,13 @@ public:
|
||||
virtual void clear() {
|
||||
Dijon::Filter::clear();
|
||||
m_forPreview = m_havedoc = false;
|
||||
m_defcharset.clear();
|
||||
m_dfltInputCharset.clear();
|
||||
m_reason.clear();
|
||||
}
|
||||
|
||||
protected:
|
||||
bool m_forPreview;
|
||||
string m_defcharset;
|
||||
string m_dfltInputCharset;
|
||||
string m_reason;
|
||||
bool m_havedoc;
|
||||
string m_udi; // May be set by creator as a hint
|
||||
|
||||
@ -65,19 +65,19 @@ application/vnd.sun.xml.writer.global = exec rclsoff
|
||||
application/vnd.sun.xml.writer.template = exec rclsoff
|
||||
application/vnd.wordperfect = exec wpd2html;mimetype=text/html
|
||||
application/x-abiword = exec rclabw
|
||||
application/x-awk = exec rcltext
|
||||
application/x-awk = exec rcltext; charset=default
|
||||
application/x-dvi = exec rcldvi
|
||||
application/x-flac = execm rclaudio
|
||||
application/x-gnuinfo = execm rclinfo
|
||||
application/x-kword = exec rclkwd
|
||||
application/x-lyx = exec rcllyx
|
||||
application/x-perl = exec rcltext
|
||||
application/x-perl = exec rcltext; charset=default
|
||||
application/x-scribus = exec rclscribus
|
||||
application/x-shellscript = exec rcltext
|
||||
application/x-shellscript = exec rcltext; charset=default
|
||||
application/x-tex = exec rcltex
|
||||
text/x-tex = exec rcltex
|
||||
application/x-chm = execm rclchm
|
||||
application/zip = execm rclzip
|
||||
application/zip = execm rclzip;charset=default
|
||||
audio/mpeg = execm rclaudio
|
||||
audio/x-karaoke = execm rclkar
|
||||
image/gif = execm rclimg
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user