Merged modifications from xapian/omega 0.8.5

This commit is contained in:
dockes 2005-01-28 09:37:37 +00:00
parent 44d2b70fdf
commit 6d35f5430c
5 changed files with 53 additions and 141 deletions

View File

@ -24,144 +24,19 @@
// This file has code from omindex + an adaptor function for recoll at the end
#include "htmlparse.h"
#include "mimehandler.h"
#include "debuglog.h"
#include "csguess.h"
#include "readfile.h"
#include "transcode.h"
#include "mimeparse.h"
class MyHtmlParser : public HtmlParser {
public:
bool in_script_tag;
bool in_style_tag;
string title, sample, keywords, dump;
string ocharset; // This is the charset our user thinks the doc was
string charset; // This is the charset it was supposedly converted to
string doccharset; // Set this to value of charset parameter in header
bool indexing_allowed;
void process_text(const string &text);
void opening_tag(const string &tag, const map<string,string> &p);
void closing_tag(const string &tag);
MyHtmlParser() :
in_script_tag(false),
in_style_tag(false),
indexing_allowed(true) { }
};
// Text callback: append a run of document text to 'dump', without its
// leading whitespace and followed by one space. Runs found while inside a
// <script> or <style> element, and runs made only of whitespace, are
// ignored.
void
MyHtmlParser::process_text(const string &text)
{
    // some tags are meaningful mid-word so this is simplistic at best...
    if (in_script_tag || in_style_tag)
        return;

    const string::size_type start = text.find_first_not_of(" \t\n\r");
    if (start == string::npos)
        return;

    dump.append(text, start, string::npos);
    dump += " ";
}
// ASCII-only, in-place lowercasing: bytes 'A'..'Z' are mapped to 'a'..'z'
// and every other byte is left untouched. This is only meaningful if the
// charset in use includes the ASCII values.
static inline void
lowercase_term(string &term)
{
    const string::size_type len = term.size();
    for (string::size_type pos = 0; pos < len; ++pos) {
        const char c = term[pos];
        if ('A' <= c && c <= 'Z')
            term[pos] = c + ('a' - 'A');
    }
}
#include "myhtmlparse.h"
#include "indextext.h"
#include <iostream>
using namespace std;
void
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
{
#if 0
cout << "TAG: " << tag << ": " << endl;
map<string, string>::const_iterator x;
for (x = p.begin(); x != p.end(); x++) {
cout << " " << x->first << " -> '" << x->second << "'" << endl;
}
#endif
if (tag == "meta") {
map<string, string>::const_iterator i, j;
if ((i = p.find("content")) != p.end()) {
if ((j = p.find("name")) != p.end()) {
string name = j->second;
lowercase_term(name);
if (name == "description") {
if (sample.empty()) {
sample = i->second;
decode_entities(sample);
}
} else if (name == "keywords") {
if (!keywords.empty()) keywords += ' ';
string tmp = i->second;
decode_entities(tmp);
keywords += tmp;
} else if (name == "robots") {
string val = i->second;
decode_entities(val);
lowercase_term(val);
if (val.find("none") != string::npos ||
val.find("noindex") != string::npos) {
indexing_allowed = false;
throw true;
}
}
} else if ((j = p.find("http-equiv")) != p.end()) {
string hequiv = j->second;
lowercase_term(hequiv);
if (hequiv == "content-type") {
string value = i->second;
MimeHeaderValue p = parseMimeHeaderValue(value);
map<string, string>::const_iterator k;
if ((k = p.params.find("charset")) != p.params.end()) {
doccharset = k->second;
if (doccharset != ocharset) {
LOGDEB1(("Doc specified charset '%s' "
"differs from announced '%s'\n",
doccharset.c_str(), ocharset.c_str()));
throw true;
}
}
}
}
}
} else if (tag == "p" || tag == "br" || tag == "li") {
dump += "\n";
} else if (tag == "script") {
in_script_tag = true;
} else if (tag == "style") {
in_style_tag = true;
} else if (tag == "body") {
dump = "";
}
}
// End-tag callback.
//
// - </title>: the text accumulated so far becomes the title and the
//   accumulator is emptied.
// - </script>, </style>: lower the matching in_*_tag flag.
// - </body>: throws true.
//   NOTE(review): presumably caught by the code driving the parser to stop
//   at the end of the body -- confirm at call site.
void
MyHtmlParser::closing_tag(const string &tag)
{
    if (tag == "body")
        throw true;

    if (tag == "script") {
        in_script_tag = false;
        return;
    }
    if (tag == "style") {
        in_style_tag = false;
        return;
    }
    if (tag == "title") {
        title = dump;
        dump = "";
    }
}
bool textHtmlToDoc(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout)
{

View File

@ -25,6 +25,8 @@
#include "indextext.h" // for lowercase_term()
#include "mimeparse.h"
void
MyHtmlParser::process_text(const string &text)
{
@ -50,12 +52,11 @@ void
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
{
#if 0
cout << "<" << tag;
cout << "TAG: " << tag << ": " << endl;
map<string, string>::const_iterator x;
for (x = p.begin(); x != p.end(); x++) {
cout << " " << x->first << "=\"" << x->second << "\"";
cout << " " << x->first << " -> '" << x->second << "'" << endl;
}
cout << ">\n";
#endif
if (tag.empty()) return;
switch (tag[0]) {
@ -67,7 +68,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
dump = "";
break;
}
if (tag == "blockquote" || tag == "br") pending_space = true;
if (tag == "blockquote" || tag == "br") {
dump += '\n';
pending_space = true;
}
break;
case 'c':
if (tag == "center") pending_space = true;
@ -84,8 +88,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
break;
case 'h':
// hr, and h1, ..., h6
if (tag.length() == 2 && strchr("r123456", tag[1]))
if (tag.length() == 2 && strchr("r123456", tag[1])) {
dump += '\n';
pending_space = true;
}
break;
case 'i':
if (tag == "iframe" || tag == "img" || tag == "isindex" ||
@ -95,11 +101,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
if (tag == "keygen") pending_space = true;
break;
case 'l':
if (tag == "legend" || tag == "li" || tag == "listing")
if (tag == "legend" || tag == "li" || tag == "listing") {
dump += '\n';
pending_space = true;
}
break;
case 'm':
if (tag == "meta") {
LOGDEB(("Found META\n"));
map<string, string>::const_iterator i, j;
if ((i = p.find("content")) != p.end()) {
if ((j = p.find("name")) != p.end()) {
@ -125,6 +134,26 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
throw true;
}
}
} else if ((j = p.find("http-equiv")) != p.end()) {
LOGDEB(("Found http-equiv\n"));
string hequiv = j->second;
lowercase_term(hequiv);
if (hequiv == "content-type") {
string value = i->second;
MimeHeaderValue p = parseMimeHeaderValue(value);
map<string, string>::const_iterator k;
if ((k = p.params.find("charset")) !=
p.params.end()) {
doccharset = k->second;
if (doccharset != ocharset) {
LOGDEB1(("Doc specified charset '%s' "
"differs from announced '%s'\n",
doccharset.c_str(),
ocharset.c_str()));
throw true;
}
}
}
}
}
break;
@ -136,8 +165,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
if (tag == "ol" || tag == "option") pending_space = true;
break;
case 'p':
if (tag == "p" || tag == "pre" || tag == "plaintext")
if (tag == "p" || tag == "pre" || tag == "plaintext") {
dump += '\n';
pending_space = true;
}
break;
case 'q':
if (tag == "q") pending_space = true;

View File

@ -1,4 +1,3 @@
=======
/* myhtmlparse.h: subclass of HtmlParser for extracting text
*
* ----START-LICENCE----
@ -35,6 +34,9 @@ class MyHtmlParser : public HtmlParser {
bool in_style_tag;
bool pending_space;
string title, sample, keywords, dump;
string ocharset; // This is the charset our user thinks the doc was
string charset; // This is the charset it was supposedly converted to
string doccharset; // Set this to value of charset parameter in header
bool indexing_allowed;
void process_text(const string &text);
void opening_tag(const string &tag, const map<string,string> &p);

View File

@ -9,14 +9,14 @@ all: $(LIBS)
OBJS = conftree.o csguess.o debuglog.o \
fstreewalk.o html.o htmlparse.o \
mimehandler.o mimeparse.o mimetype.o pathut.o \
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
rclconfig.o rcldb.o readfile.o \
textsplit.o transcode.o \
unacpp.o unac.o
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
../utils/pathut.cpp \
../common/myhtmlparse.cpp ../utils/pathut.cpp \
../common/rclconfig.cpp ../common/rcldb.cpp ../utils/readfile.cpp \
../common/textsplit.cpp ../utils/transcode.cpp \
../common/unacpp.cpp ../unac/unac.c
@ -46,6 +46,8 @@ mimeparse.o : ../utils/mimeparse.cpp
$(CXX) $(CXXFLAGS) -c $<
mimetype.o : ../index/mimetype.cpp
$(CXX) $(CXXFLAGS) -c $<
myhtmlparse.o : ../common/myhtmlparse.cpp
$(CXX) $(CXXFLAGS) -c $<
pathut.o : ../utils/pathut.cpp
$(CXX) $(CXXFLAGS) -c $<
rclconfig.o : ../common/rclconfig.cpp

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.10 2005-01-28 08:41:40 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.11 2005-01-28 09:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <sys/stat.h>
@ -85,18 +85,20 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
try {
switch (mode) {
case DbUpd:
ndb->wdb = Xapian::Auto::open(dir, Xapian::DB_CREATE_OR_OPEN);
ndb->wdb =
Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OPEN);
ndb->updated.resize(ndb->wdb.get_lastdocid() + 1);
ndb->iswritable = true;
break;
case DbTrunc:
ndb->wdb = Xapian::Auto::open(dir, Xapian::DB_CREATE_OR_OVERWRITE);
ndb->wdb =
Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OVERWRITE);
ndb->iswritable = true;
break;
case DbRO:
default:
ndb->iswritable = false;
ndb->db = Xapian::Auto::open(dir, Xapian::DB_OPEN);
ndb->db = Xapian::Database(dir);
break;
}
ndb->isopen = true;