sort of indexes html
This commit is contained in:
parent
0b18276947
commit
b9bb21f118
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.5 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.6 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <sys/stat.h>
|
||||
@ -19,6 +19,7 @@ static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.5 2005-01-25 14:37:21 dockes
|
||||
#include "csguess.h"
|
||||
#include "transcode.h"
|
||||
#include "mimehandler.h"
|
||||
#include "debuglog.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -76,21 +77,17 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
|
||||
if (flg == FsTreeWalker::FtwDirEnter ||
|
||||
flg == FsTreeWalker::FtwDirReturn) {
|
||||
me->config->setKeyDir(fn);
|
||||
cout << "indexfile: [" << fn << "]" << endl;
|
||||
cout << " defcharset: " << me->config->getDefCharset()
|
||||
<< " deflang: " << me->config->getDefLang() << endl;
|
||||
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
|
||||
string mime = mimetype(fn, me->config->getMimeMap());
|
||||
if (mime.length() == 0) {
|
||||
cout << "indexfile: " << "(no mime)" << " " << fn << endl;
|
||||
LOGDEB(("indexfile: (no mime) [%s]\n", fn.c_str()));
|
||||
// No mime type ?? pass on.
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
|
||||
cout << "indexfile: " << mime << " " << fn << endl;
|
||||
LOGDEB(("indexfile: %s [%s]\n", mime.c_str(), fn.c_str()));
|
||||
|
||||
// Look for appropriate handler
|
||||
MimeHandlerFunc fun = getMimeHandler(mime, me->config->getMimeConf());
|
||||
|
||||
@ -37,7 +37,8 @@ class MyHtmlParser : public HtmlParser {
|
||||
bool in_script_tag;
|
||||
bool in_style_tag;
|
||||
string title, sample, keywords, dump;
|
||||
string charset; // This is the charset our user thinks the doc is in
|
||||
string ocharset; // This is the charset our user thinks the doc was
|
||||
string charset; // This is the charset it was supposedly converted to
|
||||
string doccharset; // Set this to value of charset parameter in header
|
||||
bool indexing_allowed;
|
||||
void process_text(const string &text);
|
||||
@ -125,12 +126,18 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
map<string, string>::const_iterator k;
|
||||
if ((k = p.params.find("charset")) != p.params.end()) {
|
||||
doccharset = k->second;
|
||||
if (doccharset != charset)
|
||||
if (doccharset != ocharset) {
|
||||
LOGDEB1(("Doc specified charset '%s' "
|
||||
"differs from announced '%s'\n",
|
||||
doccharset.c_str(), ocharset.c_str()));
|
||||
throw true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (tag == "p" || tag == "br") {
|
||||
dump += "\n";
|
||||
} else if (tag == "script") {
|
||||
in_script_tag = true;
|
||||
} else if (tag == "style") {
|
||||
@ -179,14 +186,11 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
} else
|
||||
charset = conf->defcharset;
|
||||
|
||||
LOGDEB(("textHtmlToDoc: charset before parsing: %s\n",
|
||||
charset.c_str()));
|
||||
LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
|
||||
|
||||
MyHtmlParser pres;
|
||||
for (int pass = 0; pass < 2; pass++) {
|
||||
string transcoded;
|
||||
LOGDEB(("textHtmlToDoc: transcode from %s to %s\n",
|
||||
charset.c_str(), "UTF-8"));
|
||||
|
||||
MyHtmlParser p;
|
||||
// Try transcoding. If it fails, use original text.
|
||||
@ -195,10 +199,11 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
charset.c_str()));
|
||||
transcoded = otext;
|
||||
// We don't know the charset, at all
|
||||
p.charset = charset = "";
|
||||
p.ocharset = p.charset = charset = "";
|
||||
} else {
|
||||
// charset has the putative source charset, transcoded is now
|
||||
// ocharset has the putative source charset, transcoded is now
|
||||
// in utf-8
|
||||
p.ocharset = charset;
|
||||
p.charset = "utf-8";
|
||||
}
|
||||
|
||||
@ -206,10 +211,10 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
p.parse_html(transcoded);
|
||||
} catch (bool) {
|
||||
pres = p;
|
||||
if (!pres.doccharset.empty() && pres.doccharset != charset) {
|
||||
if (!pres.doccharset.empty() &&
|
||||
pres.doccharset != pres.ocharset) {
|
||||
LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
|
||||
"reparse\n", charset.c_str(),
|
||||
pres.doccharset.c_str()));
|
||||
"reparse\n", charset.c_str(),pres.doccharset.c_str()));
|
||||
charset = pres.doccharset;
|
||||
} else
|
||||
break;
|
||||
@ -219,6 +224,7 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
Rcl::Doc out;
|
||||
out.origcharset = charset;
|
||||
out.text = pres.dump;
|
||||
// LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str()));
|
||||
out.title = pres.title;
|
||||
out.keywords = pres.keywords;
|
||||
out.abstract = pres.sample;
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.2 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.3 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <iostream>
|
||||
@ -79,7 +79,6 @@ MimeHandlerFunc getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
|
||||
|
||||
// Retrieve handler function according to type
|
||||
if (!strcasecmp(toks[0].c_str(), "internal")) {
|
||||
cerr << "Internal Handler" << endl;
|
||||
map<string, MimeHandlerFunc>::const_iterator it =
|
||||
ihandlers.find(mtype);
|
||||
if (it == ihandlers.end()) {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.8 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.9 2005-01-26 13:03:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <sys/stat.h>
|
||||
@ -232,7 +232,6 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
|
||||
LOGDEB(("Rcl::Db::add: doc split\n"));
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
if (!dumb_string(doc.text, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: dum_string failed\n"));
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user