diff --git a/src/common/cstr.h b/src/common/cstr.h index 001607c7..89a84206 100644 --- a/src/common/cstr.h +++ b/src/common/cstr.h @@ -54,6 +54,7 @@ DEF_CSTR(iso_8859_1, "ISO-8859-1"); DEF_CSTR(mimetype, "mimetype"); DEF_CSTR(minwilds, "*?["); DEF_CSTR(newline, "\n"); +DEF_CSTR(origcharset, "origcharset"); DEF_CSTR(null, ""); DEF_CSTR(plus, "+"); DEF_CSTR(textplain, "text/plain"); diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index d0c321b3..536c583f 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -21,7 +21,6 @@ #include "debuglog.h" #include "cancelcheck.h" #include "smallut.h" -#include "transcode.h" #include "md5.h" #include "rclconfig.h" @@ -146,53 +145,24 @@ bool MimeHandlerExec::next_document() void MimeHandlerExec::finaldetails() { - string& output = m_metaData[cstr_content]; + m_metaData[cstr_origcharset] = m_dfltInputCharset; - // If output is text/plain (not text/html), we may have to convert - // it to utf-8, because this is the last point where it can be done. // cfgFilterOutputCharset comes from the mimeconf filter definition line - string charset = cfgFilterOutputCharset.empty() ? "utf-8" : - cfgFilterOutputCharset; - bool trustcharset = true; + string& charset = m_metaData[cstr_charset]; + charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset; if (!stringlowercmp("default", charset)) { charset = m_dfltInputCharset; - trustcharset = false; } - string mt = cfgFilterOutputMtype.empty() ? "text/html" : + + string& mt = m_metaData[cstr_mimetype]; + mt = cfgFilterOutputMtype.empty() ? "text/html" : cfgFilterOutputMtype; - // If this is text/plain and not utf-8 or untrusted, transcode to utf-8. - if (!mt.compare(cstr_textplain) && - (!trustcharset || stringlowercmp("utf-8", charset))) { - string transcoded; - int ecnt; - if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) { - LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n", - charset.c_str())); - // Erase text in this case: it's garbage - output.clear(); - } else { - if (ecnt) { - LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n", - ecnt, charset.c_str())); - } - output = transcoded; - charset = "utf-8"; - } + // If this is text/plain transcode_to/check utf-8 + if (!mt.compare(cstr_textplain)) { + (void)txtdcode("mh_exec"); } - // Success. Store some external metadata - - // Original charset. Can't be too sure about this actually. It's - // just a hint anyway - m_metaData["origcharset"] = m_dfltInputCharset; - - // Supposed contents charset encoding. This could still be - // overridden by the content-type meta tag for html, but this is - // wasteful so we hope it's correct - m_metaData[cstr_charset] = charset; - m_metaData[cstr_mimetype] = mt; - string md5, xmd5, reason; if (MD5File(m_fn, md5, &reason)) { m_metaData["md5"] = MD5HexPrint(md5, xmd5); diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index 9b2899c9..1c77ce1b 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -25,7 +25,6 @@ #include "debuglog.h" #include "cancelcheck.h" #include "smallut.h" -#include "transcode.h" #include "md5.h" #include "rclconfig.h" #include "mimetype.h" @@ -283,39 +282,21 @@ bool MimeHandlerExecMultiple::next_document() } // Charset. For many document types it doesn't matter. For text - // and html it does. We supply a default from the - // configuration. - bool trustcharset = true; + // and html it does. We supply a default from the configuration. if (charset.empty()) { charset = cfgFilterOutputCharset.empty() ? "utf-8" : cfgFilterOutputCharset; if (!stringlowercmp("default", charset)) { - trustcharset = false; charset = m_dfltInputCharset; } } + m_metaData[cstr_origcharset] = charset; + m_metaData[cstr_charset] = charset; - string& output = m_metaData[cstr_content]; - if (!m_metaData[cstr_mimetype].compare(cstr_textplain) && - (!trustcharset || stringlowercmp("utf-8", charset))) { - string transcoded; - int ecnt; - if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) { - LOGERR(("mh_execm: transcode failed from [%s] to UTF-8\n", - charset.c_str())); - // Erase text in this case: it's garbage - output.clear(); - } else { - if (ecnt) { - LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n", - ecnt, charset.c_str())); - } - output = transcoded; - charset = "utf-8"; - } + if (!m_metaData[cstr_mimetype].compare(cstr_textplain)) { + (void)txtdcode("mh_execm"); } - m_metaData[cstr_charset] = charset; if (eofnext_received) m_havedoc = false; diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index 4784beaf..b8d7bb45 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -162,7 +162,7 @@ bool MimeHandlerHtml::next_document() } } - m_metaData["origcharset"] = result.get_charset(); + m_metaData[cstr_origcharset] = result.get_charset(); m_metaData[cstr_content] = result.dump; m_metaData[cstr_charset] = "utf-8"; // Avoid setting empty values which would crush ones possibly inherited diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp index 2cf0ad45..ed3f260f 100644 --- a/src/internfile/mh_text.cpp +++ b/src/internfile/mh_text.cpp @@ -32,7 +32,6 @@ using namespace std; #include "csguess.h" #include "debuglog.h" #include "readfile.h" -#include "transcode.h" #include "md5.h" #include "rclconfig.h" @@ -117,28 +116,21 @@ bool MimeHandlerText::next_document() if (m_havedoc == false) return false; - // We transcode even if defcharset is already utf-8: + // We transcode even if defcharset is supposedly already utf-8: // this validates the encoding. - LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n", - m_dfltInputCharset.c_str())); - int ecnt; - bool ret; - string& itext = m_metaData[cstr_content]; - if (!(ret=transcode(m_text, itext, m_dfltInputCharset, "UTF-8", &ecnt)) || - ecnt > int(itext.size() / 4)) { - LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed " - "for input charset [%s] ret %d ecnt %d\n", - m_dfltInputCharset.c_str(), ret, ecnt)); - itext.erase(); - return false; - } - m_metaData["origcharset"] = m_dfltInputCharset; - m_metaData[cstr_charset] = "utf-8"; + m_metaData[cstr_origcharset] = m_dfltInputCharset; m_metaData[cstr_mimetype] = cstr_textplain; - // If text length is 0 (the file is empty or oversize), or we have - // read all at once, we're done - if (m_text.length() == 0 || !m_paging) { + size_t srclen = m_text.length(); + m_metaData[cstr_content].swap(m_text); + + // txtdcode() truncates the text if transcoding fails + (void)txtdcode("mh_text"); + + + // If the text length is 0 (the file is empty or oversize), or we are + // not paging, we're done + if (srclen == 0 || !m_paging) { m_havedoc = false; return true; } else { @@ -150,8 +142,8 @@ bool MimeHandlerText::next_document() // be to use a different mtype for files over the page size, // and keep text/plain only for smaller files. char buf[30]; - sprintf(buf, "%lld", (long long)(m_offs - m_text.length())); - if (m_offs - m_text.length() != 0) + sprintf(buf, "%lld", (long long)(m_offs - srclen)); + if (m_offs - srclen != 0) m_metaData[cstr_ipath] = buf; readnext(); return true; @@ -161,7 +153,7 @@ bool MimeHandlerText::next_document() bool MimeHandlerText::readnext() { string reason; - m_text.erase(); + m_text.clear(); if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) { LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str())); m_havedoc = false; diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index 90f2459d..964ec81e 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -92,6 +92,9 @@ public: m_reason.clear(); } + // This only makes sense if the contents are currently txt/plain + bool txtdcode(const string& who); + protected: bool preview() {return m_forPreview;} diff --git a/src/internfile/txtdcode.cpp b/src/internfile/txtdcode.cpp new file mode 100644 index 00000000..2bae470a --- /dev/null +++ b/src/internfile/txtdcode.cpp @@ -0,0 +1,49 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include "autoconfig.h" + +#include "cstr.h" +#include "transcode.h" +#include "mimehandler.h" +#include "debuglog.h" + +bool RecollFilter::txtdcode(const string& who) +{ + if (m_metaData[cstr_mimetype].compare(cstr_textplain)) { + LOGERR(("%s::txtdcode: called on non txt/plain: %s\n", who.c_str(), + m_metaData[cstr_mimetype].c_str())); + return false; + } + + string& ocs = m_metaData[cstr_origcharset]; + string& itext = m_metaData[cstr_content]; + LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n", + who.c_str(), itext.size(), ocs.c_str())); + int ecnt; + bool ret; + string otext; + if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) || + ecnt > int(itext.size() / 4)) { + LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed " + "for input charset [%s] ret %d ecnt %d\n", + who.c_str(), itext.size(), ocs.c_str(), ret, ecnt)); + itext.erase(); + return false; + } + itext.swap(otext); + m_metaData[cstr_charset] = "UTF-8"; + return true; +} diff --git a/src/lib/Makefile b/src/lib/Makefile index 88405471..4fc0caa4 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -6,8 +6,8 @@ LIBS = librcl.a all: $(LIBS) -OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o csguess.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-getpart.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime-printdoc.o mime-printheader.o mime.o convert.o iodevice.o iofactory.o -DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp csguess.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-getpart.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime-printdoc.dep.stamp mime-printheader.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp +OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o csguess.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-getpart.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime-printdoc.o mime-printheader.o mime.o convert.o iodevice.o iofactory.o +DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp csguess.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-getpart.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime-printdoc.dep.stamp mime-printheader.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp librcl.a : $(DEPS) $(OBJS) unac.o ar ru librcl.a $(OBJS) unac.o @@ -61,6 +61,8 @@ mh_mbox.o : ../internfile/mh_mbox.cpp $(depth)/mk/localdefs $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_mbox.cpp mh_text.o : ../internfile/mh_text.cpp $(depth)/mk/localdefs $(CXX) $(ALL_CXXFLAGS) -c ../internfile/mh_text.cpp +txtdcode.o : ../internfile/txtdcode.cpp $(depth)/mk/localdefs + $(CXX) $(ALL_CXXFLAGS) -c ../internfile/txtdcode.cpp docseq.o : ../query/docseq.cpp $(depth)/mk/localdefs $(CXX) $(ALL_CXXFLAGS) -c ../query/docseq.cpp docseqdb.o : ../query/docseqdb.cpp $(depth)/mk/localdefs @@ -232,6 +234,9 @@ mh_mbox.dep.stamp : ../internfile/mh_mbox.cpp $(depth)/mk/localdefs mh_text.dep.stamp : ../internfile/mh_text.cpp $(depth)/mk/localdefs $(CXX) -M $(ALL_CXXFLAGS) ../internfile/mh_text.cpp > mh_text.dep touch mh_text.dep.stamp +txtdcode.dep.stamp : ../internfile/txtdcode.cpp $(depth)/mk/localdefs + $(CXX) -M $(ALL_CXXFLAGS) ../internfile/txtdcode.cpp > txtdcode.dep + touch txtdcode.dep.stamp docseq.dep.stamp : ../query/docseq.cpp $(depth)/mk/localdefs $(CXX) -M $(ALL_CXXFLAGS) ../query/docseq.cpp > docseq.dep touch docseq.dep.stamp @@ -369,6 +374,7 @@ include mh_html.dep include mh_mail.dep include mh_mbox.dep include mh_text.dep +include txtdcode.dep include docseq.dep include docseqdb.dep include docseqhist.dep diff --git a/src/lib/mkMake b/src/lib/mkMake index 724048a6..11784bf8 100755 --- a/src/lib/mkMake +++ b/src/lib/mkMake @@ -27,6 +27,7 @@ ${depth}/internfile/mh_html.cpp \ ${depth}/internfile/mh_mail.cpp \ ${depth}/internfile/mh_mbox.cpp \ ${depth}/internfile/mh_text.cpp \ +${depth}/internfile/txtdcode.cpp \ ${depth}/query/docseq.cpp \ ${depth}/query/docseqdb.cpp \ ${depth}/query/docseqhist.cpp \