sort of indexes html

This commit is contained in:
dockes 2005-01-26 13:03:02 +00:00
parent 0b18276947
commit b9bb21f118
4 changed files with 23 additions and 22 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.5 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.6 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <sys/stat.h>
@ -19,6 +19,7 @@ static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.5 2005-01-25 14:37:21 dockes
#include "csguess.h"
#include "transcode.h"
#include "mimehandler.h"
#include "debuglog.h"
using namespace std;
@ -76,21 +77,17 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
if (flg == FsTreeWalker::FtwDirEnter ||
flg == FsTreeWalker::FtwDirReturn) {
me->config->setKeyDir(fn);
cout << "indexfile: [" << fn << "]" << endl;
cout << " defcharset: " << me->config->getDefCharset()
<< " deflang: " << me->config->getDefLang() << endl;
return FsTreeWalker::FtwOk;
}
string mime = mimetype(fn, me->config->getMimeMap());
if (mime.length() == 0) {
cout << "indexfile: " << "(no mime)" << " " << fn << endl;
LOGDEB(("indexfile: (no mime) [%s]\n", fn.c_str()));
// No MIME type could be determined for this file: skip it and keep walking.
return FsTreeWalker::FtwOk;
}
cout << "indexfile: " << mime << " " << fn << endl;
LOGDEB(("indexfile: %s [%s]\n", mime.c_str(), fn.c_str()));
// Look for appropriate handler
MimeHandlerFunc fun = getMimeHandler(mime, me->config->getMimeConf());

View File

@ -37,7 +37,8 @@ class MyHtmlParser : public HtmlParser {
bool in_script_tag;
bool in_style_tag;
string title, sample, keywords, dump;
string charset; // This is the charset our user thinks the doc is in
string ocharset; // This is the charset our user thinks the doc was in
string charset; // This is the charset it was supposedly converted to
string doccharset; // Set this to value of charset parameter in header
bool indexing_allowed;
void process_text(const string &text);
@ -125,12 +126,18 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
map<string, string>::const_iterator k;
if ((k = p.params.find("charset")) != p.params.end()) {
doccharset = k->second;
if (doccharset != charset)
if (doccharset != ocharset) {
LOGDEB1(("Doc specified charset '%s' "
"differs from announced '%s'\n",
doccharset.c_str(), ocharset.c_str()));
throw true;
}
}
}
}
}
} else if (tag == "p" || tag == "br") {
dump += "\n";
} else if (tag == "script") {
in_script_tag = true;
} else if (tag == "style") {
@ -179,14 +186,11 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
} else
charset = conf->defcharset;
LOGDEB(("textHtmlToDoc: charset before parsing: %s\n",
charset.c_str()));
LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
MyHtmlParser pres;
for (int pass = 0; pass < 2; pass++) {
string transcoded;
LOGDEB(("textHtmlToDoc: transcode from %s to %s\n",
charset.c_str(), "UTF-8"));
MyHtmlParser p;
// Try transcoding. If it fails, use original text.
@ -195,10 +199,11 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
charset.c_str()));
transcoded = otext;
// We don't know the charset, at all
p.charset = charset = "";
p.ocharset = p.charset = charset = "";
} else {
// charset has the putative source charset, transcoded is now
// ocharset has the putative source charset, transcoded is now
// in utf-8
p.ocharset = charset;
p.charset = "utf-8";
}
@ -206,10 +211,10 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
p.parse_html(transcoded);
} catch (bool) {
pres = p;
if (!pres.doccharset.empty() && pres.doccharset != charset) {
if (!pres.doccharset.empty() &&
pres.doccharset != pres.ocharset) {
LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
"reparse\n", charset.c_str(),
pres.doccharset.c_str()));
"reparse\n", charset.c_str(),pres.doccharset.c_str()));
charset = pres.doccharset;
} else
break;
@ -219,6 +224,7 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
Rcl::Doc out;
out.origcharset = charset;
out.text = pres.dump;
// LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str()));
out.title = pres.title;
out.keywords = pres.keywords;
out.abstract = pres.sample;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.2 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.3 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <iostream>
@ -79,7 +79,6 @@ MimeHandlerFunc getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
// Retrieve handler function according to type
if (!strcasecmp(toks[0].c_str(), "internal")) {
cerr << "Internal Handler" << endl;
map<string, MimeHandlerFunc>::const_iterator it =
ihandlers.find(mtype);
if (it == ihandlers.end()) {

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.8 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.9 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <sys/stat.h>
@ -232,7 +232,6 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
}
splitter.text_to_words(noacc);
LOGDEB(("Rcl::Db::add: doc split\n"));
splitData.basepos += splitData.curpos + 100;
if (!dumb_string(doc.text, noacc)) {
LOGERR(("Rcl::Db::add: dum_string failed\n"));