improve transcode error printing

This commit is contained in:
dockes 2007-05-30 12:31:19 +00:00
parent 441820d1ef
commit c5ebe00247
6 changed files with 35 additions and 16 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.55 2007-05-22 07:40:00 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.56 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -394,8 +394,14 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
// If this fails, the file name won't be indexed, no big deal
// Note that we used to do the full path here, but I ended up believing
// that it made more sense to use only the file name
string utf8fn;
transcode(path_getsimple(fn), utf8fn, charset, "UTF-8");
string utf8fn; int ercnt;
if (!transcode(path_getsimple(fn), utf8fn, charset, "UTF-8", &ercnt)) {
LOGERR(("processone: fn transcode failure from [%s] to UTF-8: %s\n",
charset.c_str(), path_getsimple(fn).c_str()));
} else if (ercnt) {
LOGDEB(("processone: fn transcode %d errors from [%s] to UTF-8: %s\n",
ercnt, charset.c_str(), path_getsimple(fn).c_str()));
}
FileInterner::Status fis = FileInterner::FIAgain;
bool hadNullIpath = false;

View File

@ -49,6 +49,7 @@ bool MimeHandlerHtml::set_document_file(const string &fn)
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
return false;
}
m_filename = fn;
return set_document_string(otext);
}
@ -64,8 +65,13 @@ bool MimeHandlerHtml::next_document()
if (m_havedoc == false)
return false;
m_havedoc = false;
// If set_doc(fn), take note of file name.
string fn = m_filename;
m_filename.erase();
string charset = m_defcharset;
LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",charset.c_str()));
LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",
charset.c_str()));
// - We first try to convert from the default configured charset
// (which may depend on the current directory) to utf-8. If this
@ -82,13 +88,23 @@ bool MimeHandlerHtml::next_document()
LOGDEB(("Html::mkDoc: pass %d\n", pass));
MyHtmlParser p;
// Try transcoding. If it fails, use original text.
if (!transcode(m_html, transcoded, charset, "UTF-8")) {
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
charset.c_str()));
int ecnt;
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for"
"[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
transcoded = m_html;
// We don't know the charset, at all
p.ocharset = p.charset = charset = "";
} else {
if (ecnt) {
if (pass == 0) {
LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
"[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
} else {
LOGERR(("textHtmlToDoc: final transcode had %d errors for "
"[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
}
}
// ocharset has the putative source charset, transcoded is now
// in utf-8
p.ocharset = charset;

View File

@ -16,7 +16,7 @@
*/
#ifndef _HTML_H_INCLUDED_
#define _HTML_H_INCLUDED_
/* @(#$Id: mh_html.h,v 1.9 2006-12-16 15:39:54 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mh_html.h,v 1.10 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
@ -38,6 +38,7 @@ class MimeHandlerHtml : public RecollFilter {
}
virtual bool next_document();
private:
string m_filename;
string m_html;
};

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.21 2007-05-23 09:19:48 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.22 2007-05-30 12:31:19 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -39,7 +39,6 @@ using std::set;
#include "debuglog.h"
#include "textsplit.h"
#include "utf8iter.h"
#include "transcode.h"
#include "smallut.h"
#include "plaintorich.h"
#include "cancelcheck.h"

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: xadump.cpp,v 1.14 2007-01-13 14:41:40 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: xadump.cpp,v 1.15 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -24,8 +24,6 @@ static char rcsid[] = "@(#$Id: xadump.cpp,v 1.14 2007-01-13 14:41:40 dockes Exp
#include <string>
#include <vector>
#include "transcode.h"
#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: transcode.cpp,v 1.9 2006-11-20 15:29:08 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: transcode.cpp,v 1.10 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -100,9 +100,8 @@ bool transcode(const string &in, string &out, const string &icode,
error:
if (icopen)
iconv_close(ic);
//fprintf(stderr, "TRANSCODE OUT:\n%s\n", out.c_str());
if (mecnt)
LOGINFO(("transcode: [%s]->[%s] %d errors\n",
LOGDEB(("transcode: [%s]->[%s] %d errors\n",
icode.c_str(), ocode.c_str(), mecnt));
if (ecnt)
*ecnt = mecnt;