Factorized common text transcoding code in separate module

This commit is contained in:
Jean-Francois Dockes 2011-10-20 17:53:42 +02:00
parent f544b28b4a
commit 49554e42c2
9 changed files with 92 additions and 89 deletions

View File

@ -54,6 +54,7 @@ DEF_CSTR(iso_8859_1, "ISO-8859-1");
DEF_CSTR(mimetype, "mimetype");
DEF_CSTR(minwilds, "*?[");
DEF_CSTR(newline, "\n");
DEF_CSTR(origcharset, "origcharset");
DEF_CSTR(null, "");
DEF_CSTR(plus, "+");
DEF_CSTR(textplain, "text/plain");

View File

@ -21,7 +21,6 @@
#include "debuglog.h"
#include "cancelcheck.h"
#include "smallut.h"
#include "transcode.h"
#include "md5.h"
#include "rclconfig.h"
@ -146,53 +145,24 @@ bool MimeHandlerExec::next_document()
void MimeHandlerExec::finaldetails()
{
string& output = m_metaData[cstr_content];
m_metaData[cstr_origcharset] = m_dfltInputCharset;
// If output is text/plain (not text/html), we may have to convert
// it to utf-8, because this is the last point where it can be done.
// cfgFilterOutputCharset comes from the mimeconf filter definition line
string charset = cfgFilterOutputCharset.empty() ? "utf-8" :
cfgFilterOutputCharset;
bool trustcharset = true;
string& charset = m_metaData[cstr_charset];
charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) {
charset = m_dfltInputCharset;
trustcharset = false;
}
string mt = cfgFilterOutputMtype.empty() ? "text/html" :
string& mt = m_metaData[cstr_mimetype];
mt = cfgFilterOutputMtype.empty() ? "text/html" :
cfgFilterOutputMtype;
// If this is text/plain and not utf-8 or untrusted, transcode to utf-8.
if (!mt.compare(cstr_textplain) &&
(!trustcharset || stringlowercmp("utf-8", charset))) {
string transcoded;
int ecnt;
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n",
charset.c_str()));
// Erase text in this case: it's garbage
output.clear();
} else {
if (ecnt) {
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",
ecnt, charset.c_str()));
}
output = transcoded;
charset = "utf-8";
}
// If this is text/plain transcode_to/check utf-8
if (!mt.compare(cstr_textplain)) {
(void)txtdcode("mh_exec");
}
// Success. Store some external metadata
// Original charset. Can't be too sure about this actually. It's
// just a hint anyway
m_metaData["origcharset"] = m_dfltInputCharset;
// Supposed contents charset encoding. This could still be
// overridden by the content-type meta tag for html, but this is
// wasteful so we hope it's correct
m_metaData[cstr_charset] = charset;
m_metaData[cstr_mimetype] = mt;
string md5, xmd5, reason;
if (MD5File(m_fn, md5, &reason)) {
m_metaData["md5"] = MD5HexPrint(md5, xmd5);

View File

@ -25,7 +25,6 @@
#include "debuglog.h"
#include "cancelcheck.h"
#include "smallut.h"
#include "transcode.h"
#include "md5.h"
#include "rclconfig.h"
#include "mimetype.h"
@ -283,39 +282,21 @@ bool MimeHandlerExecMultiple::next_document()
}
// Charset. For many document types it doesn't matter. For text
// and html it does. We supply a default from the
// configuration.
bool trustcharset = true;
// and html it does. We supply a default from the configuration.
if (charset.empty()) {
charset = cfgFilterOutputCharset.empty() ? "utf-8" :
cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) {
trustcharset = false;
charset = m_dfltInputCharset;
}
}
m_metaData[cstr_origcharset] = charset;
m_metaData[cstr_charset] = charset;
string& output = m_metaData[cstr_content];
if (!m_metaData[cstr_mimetype].compare(cstr_textplain) &&
(!trustcharset || stringlowercmp("utf-8", charset))) {
string transcoded;
int ecnt;
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
LOGERR(("mh_execm: transcode failed from [%s] to UTF-8\n",
charset.c_str()));
// Erase text in this case: it's garbage
output.clear();
} else {
if (ecnt) {
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",
ecnt, charset.c_str()));
}
output = transcoded;
charset = "utf-8";
}
if (!m_metaData[cstr_mimetype].compare(cstr_textplain)) {
(void)txtdcode("mh_execm");
}
m_metaData[cstr_charset] = charset;
if (eofnext_received)
m_havedoc = false;

View File

@ -162,7 +162,7 @@ bool MimeHandlerHtml::next_document()
}
}
m_metaData["origcharset"] = result.get_charset();
m_metaData[cstr_origcharset] = result.get_charset();
m_metaData[cstr_content] = result.dump;
m_metaData[cstr_charset] = "utf-8";
// Avoid setting empty values which would crush ones possibly inherited

View File

@ -32,7 +32,6 @@ using namespace std;
#include "csguess.h"
#include "debuglog.h"
#include "readfile.h"
#include "transcode.h"
#include "md5.h"
#include "rclconfig.h"
@ -117,28 +116,21 @@ bool MimeHandlerText::next_document()
if (m_havedoc == false)
return false;
// We transcode even if defcharset is already utf-8:
// We transcode even if defcharset is supposedly already utf-8:
// this validates the encoding.
LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
m_dfltInputCharset.c_str()));
int ecnt;
bool ret;
string& itext = m_metaData[cstr_content];
if (!(ret=transcode(m_text, itext, m_dfltInputCharset, "UTF-8", &ecnt)) ||
ecnt > int(itext.size() / 4)) {
LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
"for input charset [%s] ret %d ecnt %d\n",
m_dfltInputCharset.c_str(), ret, ecnt));
itext.erase();
return false;
}
m_metaData["origcharset"] = m_dfltInputCharset;
m_metaData[cstr_charset] = "utf-8";
m_metaData[cstr_origcharset] = m_dfltInputCharset;
m_metaData[cstr_mimetype] = cstr_textplain;
// If text length is 0 (the file is empty or oversize), or we have
// read all at once, we're done
if (m_text.length() == 0 || !m_paging) {
size_t srclen = m_text.length();
m_metaData[cstr_content].swap(m_text);
// txtdcode() truncates the text if transcoding fails
(void)txtdcode("mh_text");
// If the text length is 0 (the file is empty or oversize), or we are
// not paging, we're done
if (srclen == 0 || !m_paging) {
m_havedoc = false;
return true;
} else {
@ -150,8 +142,8 @@ bool MimeHandlerText::next_document()
// be to use a different mtype for files over the page size,
// and keep text/plain only for smaller files.
char buf[30];
sprintf(buf, "%lld", (long long)(m_offs - m_text.length()));
if (m_offs - m_text.length() != 0)
sprintf(buf, "%lld", (long long)(m_offs - srclen));
if (m_offs - srclen != 0)
m_metaData[cstr_ipath] = buf;
readnext();
return true;
@ -161,7 +153,7 @@ bool MimeHandlerText::next_document()
bool MimeHandlerText::readnext()
{
string reason;
m_text.erase();
m_text.clear();
if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {
LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
m_havedoc = false;

View File

@ -92,6 +92,9 @@ public:
m_reason.clear();
}
// This only makes sense if the contents are currently text/plain
bool txtdcode(const string& who);
protected:
bool preview() {return m_forPreview;}

View File

@ -0,0 +1,49 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Library General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include "cstr.h"
#include "transcode.h"
#include "mimehandler.h"
#include "debuglog.h"
bool RecollFilter::txtdcode(const string& who)
{
if (m_metaData[cstr_mimetype].compare(cstr_textplain)) {
LOGERR(("%s::txtdcode: called on non txt/plain: %s\n", who.c_str(),
m_metaData[cstr_mimetype].c_str()));
return false;
}
string& ocs = m_metaData[cstr_origcharset];
string& itext = m_metaData[cstr_content];
LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n",
who.c_str(), itext.size(), ocs.c_str()));
int ecnt;
bool ret;
string otext;
if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) ||
ecnt > int(itext.size() / 4)) {
LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
"for input charset [%s] ret %d ecnt %d\n",
who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
itext.erase();
return false;
}
itext.swap(otext);
m_metaData[cstr_charset] = "UTF-8";
return true;
}

View File

@ -6,8 +6,8 @@ LIBS = librcl.a
all: $(LIBS)
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o csguess.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-getpart.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime-printdoc.o mime-printheader.o mime.o convert.o iodevice.o iofactory.o
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp csguess.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-getpart.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime-printdoc.dep.stamp mime-printheader.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o csguess.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-getpart.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime-printdoc.o mime-printheader.o mime.o convert.o iodevice.o iofactory.o
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp csguess.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-getpart.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime-printdoc.dep.stamp mime-printheader.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
librcl.a : $(DEPS) $(OBJS) unac.o
ar ru librcl.a $(OBJS) unac.o
@ -61,6 +61,8 @@ mh_mbox.o : ../internfile/mh_mbox.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mbox.cpp
mh_text.o : ../internfile/mh_text.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_text.cpp
txtdcode.o : ../internfile/txtdcode.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/txtdcode.cpp
docseq.o : ../query/docseq.cpp $(depth)/mk/localdefs
$(CXX) $(ALL_CXXFLAGS) -c ../query/docseq.cpp
docseqdb.o : ../query/docseqdb.cpp $(depth)/mk/localdefs
@ -232,6 +234,9 @@ mh_mbox.dep.stamp : ../internfile/mh_mbox.cpp $(depth)/mk/localdefs
mh_text.dep.stamp : ../internfile/mh_text.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_text.cpp > mh_text.dep
touch mh_text.dep.stamp
txtdcode.dep.stamp : ../internfile/txtdcode.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/txtdcode.cpp > txtdcode.dep
touch txtdcode.dep.stamp
docseq.dep.stamp : ../query/docseq.cpp $(depth)/mk/localdefs
$(CXX) -M $(ALL_CXXFLAGS) ../query/docseq.cpp > docseq.dep
touch docseq.dep.stamp
@ -369,6 +374,7 @@ include mh_html.dep
include mh_mail.dep
include mh_mbox.dep
include mh_text.dep
include txtdcode.dep
include docseq.dep
include docseqdb.dep
include docseqhist.dep

View File

@ -27,6 +27,7 @@ ${depth}/internfile/mh_html.cpp \
${depth}/internfile/mh_mail.cpp \
${depth}/internfile/mh_mbox.cpp \
${depth}/internfile/mh_text.cpp \
${depth}/internfile/txtdcode.cpp \
${depth}/query/docseq.cpp \
${depth}/query/docseqdb.cpp \
${depth}/query/docseqhist.cpp \