From c5ebe00247271611de755efbe4e5f47dddcf22db Mon Sep 17 00:00:00 2001 From: dockes Date: Wed, 30 May 2007 12:31:19 +0000 Subject: [PATCH] improve transcode error printing --- src/index/indexer.cpp | 12 +++++++++--- src/internfile/mh_html.cpp | 24 ++++++++++++++++++++---- src/internfile/mh_html.h | 3 ++- src/qtgui/plaintorich.cpp | 3 +-- src/query/xadump.cpp | 4 +--- src/utils/transcode.cpp | 5 ++--- 6 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 9b491755..10237f8f 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.55 2007-05-22 07:40:00 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.56 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -394,8 +394,14 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, // If this fails, the file name won't be indexed, no big deal // Note that we used to do the full path here, but I ended up believing // that it made more sense to use only the file name - string utf8fn; - transcode(path_getsimple(fn), utf8fn, charset, "UTF-8"); + string utf8fn; int ercnt; + if (!transcode(path_getsimple(fn), utf8fn, charset, "UTF-8", &ercnt)) { + LOGERR(("processone: fn transcode failure from [%s] to UTF-8: %s\n", + charset.c_str(), path_getsimple(fn).c_str())); + } else if (ercnt) { + LOGDEB(("processone: fn transcode %d errors from [%s] to UTF-8: %s\n", + ercnt, charset.c_str(), path_getsimple(fn).c_str())); + } FileInterner::Status fis = FileInterner::FIAgain; bool hadNullIpath = false; diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index 7e1d42d2..e4702e69 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -49,6 +49,7 @@ bool MimeHandlerHtml::set_document_file(const string &fn) LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str())); return false; } + m_filename = fn; return set_document_string(otext); } @@ -64,8 +65,13 @@ bool MimeHandlerHtml::next_document() if (m_havedoc == false) return false; m_havedoc = false; + // If set_doc(fn), take note of file name. + string fn = m_filename; + m_filename.erase(); + string charset = m_defcharset; - LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",charset.c_str())); + LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n", + charset.c_str())); // - We first try to convert from the default configured charset // (which may depend of the current directory) to utf-8. If this @@ -82,13 +88,23 @@ bool MimeHandlerHtml::next_document() LOGDEB(("Html::mkDoc: pass %d\n", pass)); MyHtmlParser p; // Try transcoding. If it fails, use original text. - if (!transcode(m_html, transcoded, charset, "UTF-8")) { - LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n", - charset.c_str())); + int ecnt; + if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) { + LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for" + "[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str())); transcoded = m_html; // We don't know the charset, at all p.ocharset = p.charset = charset = ""; } else { + if (ecnt) { + if (pass == 0) { + LOGDEB(("textHtmlToDoc: init transcode had %d errors for " + "[%s]", ecnt, fn.empty()?"unknown":fn.c_str())); + } else { + LOGERR(("textHtmlToDoc: final transcode had %d errors for " + "[%s]", ecnt, fn.empty()?"unknown":fn.c_str())); + } + } // ocharset has the putative source charset, transcoded is now // in utf-8 p.ocharset = charset; diff --git a/src/internfile/mh_html.h b/src/internfile/mh_html.h index fea22b2a..b15c5a77 100644 --- a/src/internfile/mh_html.h +++ b/src/internfile/mh_html.h @@ -16,7 +16,7 @@ */ #ifndef _HTML_H_INCLUDED_ #define _HTML_H_INCLUDED_ -/* @(#$Id: mh_html.h,v 1.9 2006-12-16 15:39:54 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mh_html.h,v 1.10 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -38,6 +38,7 @@ class MimeHandlerHtml : public RecollFilter { } virtual bool next_document(); private: + string m_filename; string m_html; }; diff --git a/src/qtgui/plaintorich.cpp b/src/qtgui/plaintorich.cpp index 29e40928..908f7ae6 100644 --- a/src/qtgui/plaintorich.cpp +++ b/src/qtgui/plaintorich.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.21 2007-05-23 09:19:48 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.22 2007-05-30 12:31:19 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -39,7 +39,6 @@ using std::set; #include "debuglog.h" #include "textsplit.h" #include "utf8iter.h" -#include "transcode.h" #include "smallut.h" #include "plaintorich.h" #include "cancelcheck.h" diff --git a/src/query/xadump.cpp b/src/query/xadump.cpp index 5ee76ac7..5694f88b 100644 --- a/src/query/xadump.cpp +++ b/src/query/xadump.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: xadump.cpp,v 1.14 2007-01-13 14:41:40 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: xadump.cpp,v 1.15 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -24,8 +24,6 @@ static char rcsid[] = "@(#$Id: xadump.cpp,v 1.14 2007-01-13 14:41:40 dockes Exp #include #include -#include "transcode.h" - #ifndef NO_NAMESPACES using namespace std; #endif /* NO_NAMESPACES */ diff --git a/src/utils/transcode.cpp b/src/utils/transcode.cpp index 78dc9f85..c4bac5e4 100644 --- a/src/utils/transcode.cpp +++ b/src/utils/transcode.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: transcode.cpp,v 1.9 2006-11-20 15:29:08 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: transcode.cpp,v 1.10 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -100,9 +100,8 @@ bool transcode(const string &in, string &out, const string &icode, error: if (icopen) iconv_close(ic); - //fprintf(stderr, "TRANSCODE OUT:\n%s\n", out.c_str()); if (mecnt) - LOGINFO(("transcode: [%s]->[%s] %d errors\n", + LOGDEB(("transcode: [%s]->[%s] %d errors\n", icode.c_str(), ocode.c_str(), mecnt)); if (ecnt) *ecnt = mecnt;