Factored common text transcoding code out into a separate module
This commit is contained in:
parent
f544b28b4a
commit
49554e42c2
@ -54,6 +54,7 @@ DEF_CSTR(iso_8859_1, "ISO-8859-1");
|
||||
DEF_CSTR(mimetype, "mimetype");
|
||||
DEF_CSTR(minwilds, "*?[");
|
||||
DEF_CSTR(newline, "\n");
|
||||
DEF_CSTR(origcharset, "origcharset");
|
||||
DEF_CSTR(null, "");
|
||||
DEF_CSTR(plus, "+");
|
||||
DEF_CSTR(textplain, "text/plain");
|
||||
|
||||
@ -21,7 +21,6 @@
|
||||
#include "debuglog.h"
|
||||
#include "cancelcheck.h"
|
||||
#include "smallut.h"
|
||||
#include "transcode.h"
|
||||
#include "md5.h"
|
||||
#include "rclconfig.h"
|
||||
|
||||
@ -146,53 +145,24 @@ bool MimeHandlerExec::next_document()
|
||||
|
||||
void MimeHandlerExec::finaldetails()
|
||||
{
|
||||
string& output = m_metaData[cstr_content];
|
||||
m_metaData[cstr_origcharset] = m_dfltInputCharset;
|
||||
|
||||
// If output is text/plain (not text/html), we may have to convert
|
||||
// it to utf-8, because this is the last point where it can be done.
|
||||
// cfgFilterOutputCharset comes from the mimeconf filter definition line
|
||||
string charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
||||
cfgFilterOutputCharset;
|
||||
bool trustcharset = true;
|
||||
string& charset = m_metaData[cstr_charset];
|
||||
charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset;
|
||||
if (!stringlowercmp("default", charset)) {
|
||||
charset = m_dfltInputCharset;
|
||||
trustcharset = false;
|
||||
}
|
||||
string mt = cfgFilterOutputMtype.empty() ? "text/html" :
|
||||
|
||||
string& mt = m_metaData[cstr_mimetype];
|
||||
mt = cfgFilterOutputMtype.empty() ? "text/html" :
|
||||
cfgFilterOutputMtype;
|
||||
|
||||
// If this is text/plain and not utf-8 or untrusted, transcode to utf-8.
|
||||
if (!mt.compare(cstr_textplain) &&
|
||||
(!trustcharset || stringlowercmp("utf-8", charset))) {
|
||||
string transcoded;
|
||||
int ecnt;
|
||||
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
|
||||
LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n",
|
||||
charset.c_str()));
|
||||
// Erase text in this case: it's garbage
|
||||
output.clear();
|
||||
} else {
|
||||
if (ecnt) {
|
||||
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",
|
||||
ecnt, charset.c_str()));
|
||||
}
|
||||
output = transcoded;
|
||||
charset = "utf-8";
|
||||
}
|
||||
// If this is text/plain transcode_to/check utf-8
|
||||
if (!mt.compare(cstr_textplain)) {
|
||||
(void)txtdcode("mh_exec");
|
||||
}
|
||||
|
||||
// Success. Store some external metadata
|
||||
|
||||
// Original charset. Can't be too sure about this actually. It's
|
||||
// just a hint anyway
|
||||
m_metaData["origcharset"] = m_dfltInputCharset;
|
||||
|
||||
// Supposed contents charset encoding. This could still be
|
||||
// overridden by the content-type meta tag for html, but this is
|
||||
// wasteful so we hope it's correct
|
||||
m_metaData[cstr_charset] = charset;
|
||||
m_metaData[cstr_mimetype] = mt;
|
||||
|
||||
string md5, xmd5, reason;
|
||||
if (MD5File(m_fn, md5, &reason)) {
|
||||
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
|
||||
|
||||
@ -25,7 +25,6 @@
|
||||
#include "debuglog.h"
|
||||
#include "cancelcheck.h"
|
||||
#include "smallut.h"
|
||||
#include "transcode.h"
|
||||
#include "md5.h"
|
||||
#include "rclconfig.h"
|
||||
#include "mimetype.h"
|
||||
@ -283,39 +282,21 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
}
|
||||
|
||||
// Charset. For many document types it doesn't matter. For text
|
||||
// and html it does. We supply a default from the
|
||||
// configuration.
|
||||
bool trustcharset = true;
|
||||
// and html it does. We supply a default from the configuration.
|
||||
if (charset.empty()) {
|
||||
charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
||||
cfgFilterOutputCharset;
|
||||
if (!stringlowercmp("default", charset)) {
|
||||
trustcharset = false;
|
||||
charset = m_dfltInputCharset;
|
||||
}
|
||||
}
|
||||
m_metaData[cstr_origcharset] = charset;
|
||||
m_metaData[cstr_charset] = charset;
|
||||
|
||||
string& output = m_metaData[cstr_content];
|
||||
if (!m_metaData[cstr_mimetype].compare(cstr_textplain) &&
|
||||
(!trustcharset || stringlowercmp("utf-8", charset))) {
|
||||
string transcoded;
|
||||
int ecnt;
|
||||
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
|
||||
LOGERR(("mh_execm: transcode failed from [%s] to UTF-8\n",
|
||||
charset.c_str()));
|
||||
// Erase text in this case: it's garbage
|
||||
output.clear();
|
||||
} else {
|
||||
if (ecnt) {
|
||||
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",
|
||||
ecnt, charset.c_str()));
|
||||
}
|
||||
output = transcoded;
|
||||
charset = "utf-8";
|
||||
}
|
||||
if (!m_metaData[cstr_mimetype].compare(cstr_textplain)) {
|
||||
(void)txtdcode("mh_execm");
|
||||
}
|
||||
|
||||
m_metaData[cstr_charset] = charset;
|
||||
|
||||
if (eofnext_received)
|
||||
m_havedoc = false;
|
||||
|
||||
@ -162,7 +162,7 @@ bool MimeHandlerHtml::next_document()
|
||||
}
|
||||
}
|
||||
|
||||
m_metaData["origcharset"] = result.get_charset();
|
||||
m_metaData[cstr_origcharset] = result.get_charset();
|
||||
m_metaData[cstr_content] = result.dump;
|
||||
m_metaData[cstr_charset] = "utf-8";
|
||||
// Avoid setting empty values which would crush ones possibly inherited
|
||||
|
||||
@ -32,7 +32,6 @@ using namespace std;
|
||||
#include "csguess.h"
|
||||
#include "debuglog.h"
|
||||
#include "readfile.h"
|
||||
#include "transcode.h"
|
||||
#include "md5.h"
|
||||
#include "rclconfig.h"
|
||||
|
||||
@ -117,28 +116,21 @@ bool MimeHandlerText::next_document()
|
||||
if (m_havedoc == false)
|
||||
return false;
|
||||
|
||||
// We transcode even if defcharset is already utf-8:
|
||||
// We transcode even if defcharset is supposedly already utf-8:
|
||||
// this validates the encoding.
|
||||
LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
|
||||
m_dfltInputCharset.c_str()));
|
||||
int ecnt;
|
||||
bool ret;
|
||||
string& itext = m_metaData[cstr_content];
|
||||
if (!(ret=transcode(m_text, itext, m_dfltInputCharset, "UTF-8", &ecnt)) ||
|
||||
ecnt > int(itext.size() / 4)) {
|
||||
LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
|
||||
"for input charset [%s] ret %d ecnt %d\n",
|
||||
m_dfltInputCharset.c_str(), ret, ecnt));
|
||||
itext.erase();
|
||||
return false;
|
||||
}
|
||||
m_metaData["origcharset"] = m_dfltInputCharset;
|
||||
m_metaData[cstr_charset] = "utf-8";
|
||||
m_metaData[cstr_origcharset] = m_dfltInputCharset;
|
||||
m_metaData[cstr_mimetype] = cstr_textplain;
|
||||
|
||||
// If text length is 0 (the file is empty or oversize), or we have
|
||||
// read all at once, we're done
|
||||
if (m_text.length() == 0 || !m_paging) {
|
||||
size_t srclen = m_text.length();
|
||||
m_metaData[cstr_content].swap(m_text);
|
||||
|
||||
// txtdcode() truncates the text if transcoding fails
|
||||
(void)txtdcode("mh_text");
|
||||
|
||||
|
||||
// If the text length is 0 (the file is empty or oversize), or we are
|
||||
// not paging, we're done
|
||||
if (srclen == 0 || !m_paging) {
|
||||
m_havedoc = false;
|
||||
return true;
|
||||
} else {
|
||||
@ -150,8 +142,8 @@ bool MimeHandlerText::next_document()
|
||||
// be to use a different mtype for files over the page size,
|
||||
// and keep text/plain only for smaller files.
|
||||
char buf[30];
|
||||
sprintf(buf, "%lld", (long long)(m_offs - m_text.length()));
|
||||
if (m_offs - m_text.length() != 0)
|
||||
sprintf(buf, "%lld", (long long)(m_offs - srclen));
|
||||
if (m_offs - srclen != 0)
|
||||
m_metaData[cstr_ipath] = buf;
|
||||
readnext();
|
||||
return true;
|
||||
@ -161,7 +153,7 @@ bool MimeHandlerText::next_document()
|
||||
bool MimeHandlerText::readnext()
|
||||
{
|
||||
string reason;
|
||||
m_text.erase();
|
||||
m_text.clear();
|
||||
if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {
|
||||
LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
|
||||
m_havedoc = false;
|
||||
|
||||
@ -92,6 +92,9 @@ public:
|
||||
m_reason.clear();
|
||||
}
|
||||
|
||||
// This only makes sense if the contents are currently text/plain
|
||||
bool txtdcode(const string& who);
|
||||
|
||||
protected:
|
||||
bool preview() {return m_forPreview;}
|
||||
|
||||
|
||||
49
src/internfile/txtdcode.cpp
Normal file
49
src/internfile/txtdcode.cpp
Normal file
@ -0,0 +1,49 @@
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Library General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include "cstr.h"
|
||||
#include "transcode.h"
|
||||
#include "mimehandler.h"
|
||||
#include "debuglog.h"
|
||||
|
||||
bool RecollFilter::txtdcode(const string& who)
|
||||
{
|
||||
if (m_metaData[cstr_mimetype].compare(cstr_textplain)) {
|
||||
LOGERR(("%s::txtdcode: called on non txt/plain: %s\n", who.c_str(),
|
||||
m_metaData[cstr_mimetype].c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
string& ocs = m_metaData[cstr_origcharset];
|
||||
string& itext = m_metaData[cstr_content];
|
||||
LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n",
|
||||
who.c_str(), itext.size(), ocs.c_str()));
|
||||
int ecnt;
|
||||
bool ret;
|
||||
string otext;
|
||||
if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) ||
|
||||
ecnt > int(itext.size() / 4)) {
|
||||
LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
|
||||
"for input charset [%s] ret %d ecnt %d\n",
|
||||
who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
|
||||
itext.erase();
|
||||
return false;
|
||||
}
|
||||
itext.swap(otext);
|
||||
m_metaData[cstr_charset] = "UTF-8";
|
||||
return true;
|
||||
}
|
||||
@ -6,8 +6,8 @@ LIBS = librcl.a
|
||||
|
||||
all: $(LIBS)
|
||||
|
||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o csguess.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-getpart.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime-printdoc.o mime-printheader.o mime.o convert.o iodevice.o iofactory.o
|
||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp csguess.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-getpart.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime-printdoc.dep.stamp mime-printheader.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o csguess.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-getpart.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime-printdoc.o mime-printheader.o mime.o convert.o iodevice.o iofactory.o
|
||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp csguess.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-getpart.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime-printdoc.dep.stamp mime-printheader.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||
|
||||
librcl.a : $(DEPS) $(OBJS) unac.o
|
||||
ar ru librcl.a $(OBJS) unac.o
|
||||
@ -61,6 +61,8 @@ mh_mbox.o : ../internfile/mh_mbox.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mbox.cpp
|
||||
mh_text.o : ../internfile/mh_text.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_text.cpp
|
||||
txtdcode.o : ../internfile/txtdcode.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../internfile/txtdcode.cpp
|
||||
docseq.o : ../query/docseq.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/docseq.cpp
|
||||
docseqdb.o : ../query/docseqdb.cpp $(depth)/mk/localdefs
|
||||
@ -232,6 +234,9 @@ mh_mbox.dep.stamp : ../internfile/mh_mbox.cpp $(depth)/mk/localdefs
|
||||
mh_text.dep.stamp : ../internfile/mh_text.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_text.cpp > mh_text.dep
|
||||
touch mh_text.dep.stamp
|
||||
txtdcode.dep.stamp : ../internfile/txtdcode.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../internfile/txtdcode.cpp > txtdcode.dep
|
||||
touch txtdcode.dep.stamp
|
||||
docseq.dep.stamp : ../query/docseq.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../query/docseq.cpp > docseq.dep
|
||||
touch docseq.dep.stamp
|
||||
@ -369,6 +374,7 @@ include mh_html.dep
|
||||
include mh_mail.dep
|
||||
include mh_mbox.dep
|
||||
include mh_text.dep
|
||||
include txtdcode.dep
|
||||
include docseq.dep
|
||||
include docseqdb.dep
|
||||
include docseqhist.dep
|
||||
|
||||
@ -27,6 +27,7 @@ ${depth}/internfile/mh_html.cpp \
|
||||
${depth}/internfile/mh_mail.cpp \
|
||||
${depth}/internfile/mh_mbox.cpp \
|
||||
${depth}/internfile/mh_text.cpp \
|
||||
${depth}/internfile/txtdcode.cpp \
|
||||
${depth}/query/docseq.cpp \
|
||||
${depth}/query/docseqdb.cpp \
|
||||
${depth}/query/docseqhist.cpp \
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user