sort of indexes html

This commit is contained in:
dockes 2005-01-26 13:03:02 +00:00
parent 0b18276947
commit b9bb21f118
4 changed files with 23 additions and 22 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.5 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.6 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <sys/stat.h> #include <sys/stat.h>
@ -19,6 +19,7 @@ static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.5 2005-01-25 14:37:21 dockes
#include "csguess.h" #include "csguess.h"
#include "transcode.h" #include "transcode.h"
#include "mimehandler.h" #include "mimehandler.h"
#include "debuglog.h"
using namespace std; using namespace std;
@ -76,21 +77,17 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
if (flg == FsTreeWalker::FtwDirEnter || if (flg == FsTreeWalker::FtwDirEnter ||
flg == FsTreeWalker::FtwDirReturn) { flg == FsTreeWalker::FtwDirReturn) {
me->config->setKeyDir(fn); me->config->setKeyDir(fn);
cout << "indexfile: [" << fn << "]" << endl;
cout << " defcharset: " << me->config->getDefCharset()
<< " deflang: " << me->config->getDefLang() << endl;
return FsTreeWalker::FtwOk; return FsTreeWalker::FtwOk;
} }
string mime = mimetype(fn, me->config->getMimeMap()); string mime = mimetype(fn, me->config->getMimeMap());
if (mime.length() == 0) { if (mime.length() == 0) {
cout << "indexfile: " << "(no mime)" << " " << fn << endl; LOGDEB(("indexfile: (no mime) [%s]\n", fn.c_str()));
// No mime type ?? pass on. // No mime type ?? pass on.
return FsTreeWalker::FtwOk; return FsTreeWalker::FtwOk;
} }
cout << "indexfile: " << mime << " " << fn << endl; LOGDEB(("indexfile: %s [%s]\n", mime.c_str(), fn.c_str()));
// Look for appropriate handler // Look for appropriate handler
MimeHandlerFunc fun = getMimeHandler(mime, me->config->getMimeConf()); MimeHandlerFunc fun = getMimeHandler(mime, me->config->getMimeConf());

View File

@ -37,7 +37,8 @@ class MyHtmlParser : public HtmlParser {
bool in_script_tag; bool in_script_tag;
bool in_style_tag; bool in_style_tag;
string title, sample, keywords, dump; string title, sample, keywords, dump;
string charset; // This is the charset our user thinks the doc is in string ocharset; // This is the charset our user thinks the doc was
string charset; // This is the charset it was supposedly converted to
string doccharset; // Set this to value of charset parameter in header string doccharset; // Set this to value of charset parameter in header
bool indexing_allowed; bool indexing_allowed;
void process_text(const string &text); void process_text(const string &text);
@ -125,12 +126,18 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
map<string, string>::const_iterator k; map<string, string>::const_iterator k;
if ((k = p.params.find("charset")) != p.params.end()) { if ((k = p.params.find("charset")) != p.params.end()) {
doccharset = k->second; doccharset = k->second;
if (doccharset != charset) if (doccharset != ocharset) {
LOGDEB1(("Doc specified charset '%s' "
"differs from announced '%s'\n",
doccharset.c_str(), ocharset.c_str()));
throw true; throw true;
}
} }
} }
} }
} }
} else if (tag == "p" || tag == "br") {
dump += "\n";
} else if (tag == "script") { } else if (tag == "script") {
in_script_tag = true; in_script_tag = true;
} else if (tag == "style") { } else if (tag == "style") {
@ -179,14 +186,11 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
} else } else
charset = conf->defcharset; charset = conf->defcharset;
LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
charset.c_str()));
MyHtmlParser pres; MyHtmlParser pres;
for (int pass = 0; pass < 2; pass++) { for (int pass = 0; pass < 2; pass++) {
string transcoded; string transcoded;
LOGDEB(("textHtmlToDoc: transcode from %s to %s\n",
charset.c_str(), "UTF-8"));
MyHtmlParser p; MyHtmlParser p;
// Try transcoding. If it fails, use original text. // Try transcoding. If it fails, use original text.
@ -195,10 +199,11 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
charset.c_str())); charset.c_str()));
transcoded = otext; transcoded = otext;
// We don't know the charset, at all // We don't know the charset, at all
p.charset = charset = ""; p.ocharset = p.charset = charset = "";
} else { } else {
// charset has the putative source charset, transcoded is now // ocharset has the putative source charset, transcoded is now
// in utf-8 // in utf-8
p.ocharset = charset;
p.charset = "utf-8"; p.charset = "utf-8";
} }
@ -206,10 +211,10 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
p.parse_html(transcoded); p.parse_html(transcoded);
} catch (bool) { } catch (bool) {
pres = p; pres = p;
if (!pres.doccharset.empty() && pres.doccharset != charset) { if (!pres.doccharset.empty() &&
pres.doccharset != pres.ocharset) {
LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s'," LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
"reparse\n", charset.c_str(), "reparse\n", charset.c_str(),pres.doccharset.c_str()));
pres.doccharset.c_str()));
charset = pres.doccharset; charset = pres.doccharset;
} else } else
break; break;
@ -219,6 +224,7 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
Rcl::Doc out; Rcl::Doc out;
out.origcharset = charset; out.origcharset = charset;
out.text = pres.dump; out.text = pres.dump;
// LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str()));
out.title = pres.title; out.title = pres.title;
out.keywords = pres.keywords; out.keywords = pres.keywords;
out.abstract = pres.sample; out.abstract = pres.sample;

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.2 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.3 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <iostream> #include <iostream>
@ -79,7 +79,6 @@ MimeHandlerFunc getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
// Retrieve handler function according to type // Retrieve handler function according to type
if (!strcasecmp(toks[0].c_str(), "internal")) { if (!strcasecmp(toks[0].c_str(), "internal")) {
cerr << "Internal Handler" << endl;
map<string, MimeHandlerFunc>::const_iterator it = map<string, MimeHandlerFunc>::const_iterator it =
ihandlers.find(mtype); ihandlers.find(mtype);
if (it == ihandlers.end()) { if (it == ihandlers.end()) {

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.8 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.9 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <sys/stat.h> #include <sys/stat.h>
@ -232,7 +232,6 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
} }
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
LOGDEB(("Rcl::Db::add: doc split\n"));
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
if (!dumb_string(doc.text, noacc)) { if (!dumb_string(doc.text, noacc)) {
LOGERR(("Rcl::Db::add: dum_string failed\n")); LOGERR(("Rcl::Db::add: dum_string failed\n"));