From b9bb21f1183258d85a9508ce3ac872633f8d0ba6 Mon Sep 17 00:00:00 2001 From: dockes Date: Wed, 26 Jan 2005 13:03:02 +0000 Subject: [PATCH] sort of indexes html --- src/index/recollindex.cpp | 11 ++++------- src/internfile/mh_html.cpp | 28 +++++++++++++++++----------- src/internfile/mimehandler.cpp | 3 +-- src/rcldb/rcldb.cpp | 3 +-- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index fefcf0dd..a9746a8a 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.5 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.6 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -19,6 +19,7 @@ static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.5 2005-01-25 14:37:21 dockes #include "csguess.h" #include "transcode.h" #include "mimehandler.h" +#include "debuglog.h" using namespace std; @@ -76,21 +77,17 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp, if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwDirReturn) { me->config->setKeyDir(fn); - cout << "indexfile: [" << fn << "]" << endl; - cout << " defcharset: " << me->config->getDefCharset() - << " deflang: " << me->config->getDefLang() << endl; - return FsTreeWalker::FtwOk; } string mime = mimetype(fn, me->config->getMimeMap()); if (mime.length() == 0) { - cout << "indexfile: " << "(no mime)" << " " << fn << endl; + LOGDEB(("indexfile: (no mime) [%s]\n", fn.c_str())); // No mime type ?? pass on. return FsTreeWalker::FtwOk; } - cout << "indexfile: " << mime << " " << fn << endl; + LOGDEB(("indexfile: %s [%s]\n", mime.c_str(), fn.c_str())); // Look for appropriate handler MimeHandlerFunc fun = getMimeHandler(mime, me->config->getMimeConf()); diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index 9b98e4d1..1d7e29bd 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -37,7 +37,8 @@ class MyHtmlParser : public HtmlParser { bool in_script_tag; bool in_style_tag; string title, sample, keywords, dump; - string charset; // This is the charset our user thinks the doc is in + string ocharset; // This is the charset our user thinks the doc was + string charset; // This is the charset it was supposedly converted to string doccharset; // Set this to value of charset parameter in header bool indexing_allowed; void process_text(const string &text); @@ -125,12 +126,18 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) map::const_iterator k; if ((k = p.params.find("charset")) != p.params.end()) { doccharset = k->second; - if (doccharset != charset) + if (doccharset != ocharset) { + LOGDEB1(("Doc specified charset '%s' " + "differs from announced '%s'\n", + doccharset.c_str(), ocharset.c_str())); throw true; + } } } } } + } else if (tag == "p" || tag == "br") { + dump += "\n"; } else if (tag == "script") { in_script_tag = true; } else if (tag == "style") { @@ -179,14 +186,11 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn, } else charset = conf->defcharset; - LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", - charset.c_str())); + LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str())); MyHtmlParser pres; for (int pass = 0; pass < 2; pass++) { string transcoded; - LOGDEB(("textHtmlToDoc: transcode from %s to %s\n", - charset.c_str(), "UTF-8")); MyHtmlParser p; // Try transcoding. If it fails, use original text. @@ -195,10 +199,11 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn, charset.c_str())); transcoded = otext; // We don't know the charset, at all - p.charset = charset = ""; + p.ocharset = p.charset = charset = ""; } else { - // charset has the putative source charset, transcoded is now + // ocharset has the putative source charset, transcoded is now // in utf-8 + p.ocharset = charset; p.charset = "utf-8"; } @@ -206,10 +211,10 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn, p.parse_html(transcoded); } catch (bool) { pres = p; - if (!pres.doccharset.empty() && pres.doccharset != charset) { + if (!pres.doccharset.empty() && + pres.doccharset != pres.ocharset) { LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s'," - "reparse\n", charset.c_str(), - pres.doccharset.c_str())); + "reparse\n", charset.c_str(),pres.doccharset.c_str())); charset = pres.doccharset; } else break; @@ -219,6 +224,7 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn, Rcl::Doc out; out.origcharset = charset; out.text = pres.dump; + // LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str())); out.title = pres.title; out.keywords = pres.keywords; out.abstract = pres.sample; diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index 547ecec6..53180539 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.2 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.3 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -79,7 +79,6 @@ MimeHandlerFunc getMimeHandler(const std::string &mtype, ConfTree *mhandlers) // Retrieve handler function according to type if (!strcasecmp(toks[0].c_str(), "internal")) { - cerr << "Internal Handler" << endl; map::const_iterator it = ihandlers.find(mtype); if (it == ihandlers.end()) { diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index d6466d75..0e7b4211 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.8 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.9 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -232,7 +232,6 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc) } splitter.text_to_words(noacc); - LOGDEB(("Rcl::Db::add: doc split\n")); splitData.basepos += splitData.curpos + 100; if (!dumb_string(doc.text, noacc)) { LOGERR(("Rcl::Db::add: dum_string failed\n"));