Merged modifications from xapian/omega 0.8.5
This commit is contained in:
parent
44d2b70fdf
commit
6d35f5430c
@ -24,144 +24,19 @@
|
||||
|
||||
// This file has code from omindex + an adaptor function for recoll at the end
|
||||
|
||||
#include "htmlparse.h"
|
||||
#include "mimehandler.h"
|
||||
#include "debuglog.h"
|
||||
#include "csguess.h"
|
||||
#include "readfile.h"
|
||||
#include "transcode.h"
|
||||
#include "mimeparse.h"
|
||||
|
||||
class MyHtmlParser : public HtmlParser {
|
||||
public:
|
||||
bool in_script_tag;
|
||||
bool in_style_tag;
|
||||
string title, sample, keywords, dump;
|
||||
string ocharset; // This is the charset our user thinks the doc was
|
||||
string charset; // This is the charset it was supposedly converted to
|
||||
string doccharset; // Set this to value of charset parameter in header
|
||||
bool indexing_allowed;
|
||||
void process_text(const string &text);
|
||||
void opening_tag(const string &tag, const map<string,string> &p);
|
||||
void closing_tag(const string &tag);
|
||||
MyHtmlParser() :
|
||||
in_script_tag(false),
|
||||
in_style_tag(false),
|
||||
indexing_allowed(true) { }
|
||||
};
|
||||
|
||||
void
|
||||
MyHtmlParser::process_text(const string &text)
|
||||
{
|
||||
// some tags are meaningful mid-word so this is simplistic at best...
|
||||
|
||||
if (!in_script_tag && !in_style_tag) {
|
||||
string::size_type firstchar = text.find_first_not_of(" \t\n\r");
|
||||
if (firstchar != string::npos) {
|
||||
dump += text.substr(firstchar);
|
||||
dump += " ";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// let's hope that the charset includes ASCII values...
|
||||
/*
 * Lowercase the ASCII letters 'A'..'Z' of term in place; every other
 * byte is left untouched.
 */
static inline void
lowercase_term(std::string &term)
{
    for (std::string::size_type pos = 0; pos < term.size(); ++pos) {
        const char ch = term[pos];
        if ('A' <= ch && ch <= 'Z')
            term[pos] = static_cast<char>(ch + 'a' - 'A');
    }
}
|
||||
#include "myhtmlparse.h"
|
||||
#include "indextext.h"
|
||||
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
|
||||
void
|
||||
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
{
|
||||
#if 0
|
||||
cout << "TAG: " << tag << ": " << endl;
|
||||
map<string, string>::const_iterator x;
|
||||
for (x = p.begin(); x != p.end(); x++) {
|
||||
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tag == "meta") {
|
||||
map<string, string>::const_iterator i, j;
|
||||
if ((i = p.find("content")) != p.end()) {
|
||||
if ((j = p.find("name")) != p.end()) {
|
||||
string name = j->second;
|
||||
lowercase_term(name);
|
||||
if (name == "description") {
|
||||
if (sample.empty()) {
|
||||
sample = i->second;
|
||||
decode_entities(sample);
|
||||
}
|
||||
} else if (name == "keywords") {
|
||||
if (!keywords.empty()) keywords += ' ';
|
||||
string tmp = i->second;
|
||||
decode_entities(tmp);
|
||||
keywords += tmp;
|
||||
} else if (name == "robots") {
|
||||
string val = i->second;
|
||||
decode_entities(val);
|
||||
lowercase_term(val);
|
||||
if (val.find("none") != string::npos ||
|
||||
val.find("noindex") != string::npos) {
|
||||
indexing_allowed = false;
|
||||
throw true;
|
||||
}
|
||||
}
|
||||
} else if ((j = p.find("http-equiv")) != p.end()) {
|
||||
string hequiv = j->second;
|
||||
lowercase_term(hequiv);
|
||||
if (hequiv == "content-type") {
|
||||
string value = i->second;
|
||||
MimeHeaderValue p = parseMimeHeaderValue(value);
|
||||
map<string, string>::const_iterator k;
|
||||
if ((k = p.params.find("charset")) != p.params.end()) {
|
||||
doccharset = k->second;
|
||||
if (doccharset != ocharset) {
|
||||
LOGDEB1(("Doc specified charset '%s' "
|
||||
"differs from announced '%s'\n",
|
||||
doccharset.c_str(), ocharset.c_str()));
|
||||
throw true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (tag == "p" || tag == "br" || tag == "li") {
|
||||
dump += "\n";
|
||||
} else if (tag == "script") {
|
||||
in_script_tag = true;
|
||||
} else if (tag == "style") {
|
||||
in_style_tag = true;
|
||||
} else if (tag == "body") {
|
||||
dump = "";
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
MyHtmlParser::closing_tag(const string &tag)
|
||||
{
|
||||
if (tag == "title") {
|
||||
title = dump;
|
||||
dump = "";
|
||||
} else if (tag == "script") {
|
||||
in_script_tag = false;
|
||||
} else if (tag == "style") {
|
||||
in_style_tag = false;
|
||||
} else if (tag == "body") {
|
||||
throw true;
|
||||
}
|
||||
}
|
||||
|
||||
bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
{
|
||||
|
||||
@ -25,6 +25,8 @@
|
||||
|
||||
#include "indextext.h" // for lowercase_term()
|
||||
|
||||
#include "mimeparse.h"
|
||||
|
||||
void
|
||||
MyHtmlParser::process_text(const string &text)
|
||||
{
|
||||
@ -50,12 +52,11 @@ void
|
||||
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
{
|
||||
#if 0
|
||||
cout << "<" << tag;
|
||||
cout << "TAG: " << tag << ": " << endl;
|
||||
map<string, string>::const_iterator x;
|
||||
for (x = p.begin(); x != p.end(); x++) {
|
||||
cout << " " << x->first << "=\"" << x->second << "\"";
|
||||
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
||||
}
|
||||
cout << ">\n";
|
||||
#endif
|
||||
if (tag.empty()) return;
|
||||
switch (tag[0]) {
|
||||
@ -67,7 +68,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
dump = "";
|
||||
break;
|
||||
}
|
||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
||||
if (tag == "blockquote" || tag == "br") {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'c':
|
||||
if (tag == "center") pending_space = true;
|
||||
@ -84,8 +88,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
break;
|
||||
case 'h':
|
||||
// hr, and h1, ..., h6
|
||||
if (tag.length() == 2 && strchr("r123456", tag[1]))
|
||||
if (tag.length() == 2 && strchr("r123456", tag[1])) {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'i':
|
||||
if (tag == "iframe" || tag == "img" || tag == "isindex" ||
|
||||
@ -95,11 +101,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
if (tag == "keygen") pending_space = true;
|
||||
break;
|
||||
case 'l':
|
||||
if (tag == "legend" || tag == "li" || tag == "listing")
|
||||
if (tag == "legend" || tag == "li" || tag == "listing") {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'm':
|
||||
if (tag == "meta") {
|
||||
LOGDEB(("Found META\n"));
|
||||
map<string, string>::const_iterator i, j;
|
||||
if ((i = p.find("content")) != p.end()) {
|
||||
if ((j = p.find("name")) != p.end()) {
|
||||
@ -125,6 +134,26 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
throw true;
|
||||
}
|
||||
}
|
||||
} else if ((j = p.find("http-equiv")) != p.end()) {
|
||||
LOGDEB(("Found http-equiv\n"));
|
||||
string hequiv = j->second;
|
||||
lowercase_term(hequiv);
|
||||
if (hequiv == "content-type") {
|
||||
string value = i->second;
|
||||
MimeHeaderValue p = parseMimeHeaderValue(value);
|
||||
map<string, string>::const_iterator k;
|
||||
if ((k = p.params.find("charset")) !=
|
||||
p.params.end()) {
|
||||
doccharset = k->second;
|
||||
if (doccharset != ocharset) {
|
||||
LOGDEB1(("Doc specified charset '%s' "
|
||||
"differs from announced '%s'\n",
|
||||
doccharset.c_str(),
|
||||
ocharset.c_str()));
|
||||
throw true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -136,8 +165,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
if (tag == "ol" || tag == "option") pending_space = true;
|
||||
break;
|
||||
case 'p':
|
||||
if (tag == "p" || tag == "pre" || tag == "plaintext")
|
||||
if (tag == "p" || tag == "pre" || tag == "plaintext") {
|
||||
dump += '\n';
|
||||
pending_space = true;
|
||||
}
|
||||
break;
|
||||
case 'q':
|
||||
if (tag == "q") pending_space = true;
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
=======
|
||||
/* myhtmlparse.h: subclass of HtmlParser for extracting text
|
||||
*
|
||||
* ----START-LICENCE----
|
||||
@ -35,6 +34,9 @@ class MyHtmlParser : public HtmlParser {
|
||||
bool in_style_tag;
|
||||
bool pending_space;
|
||||
string title, sample, keywords, dump;
|
||||
string ocharset; // This is the charset our user thinks the doc was
|
||||
string charset; // This is the charset it was supposedly converted to
|
||||
string doccharset; // Set this to value of charset parameter in header
|
||||
bool indexing_allowed;
|
||||
void process_text(const string &text);
|
||||
void opening_tag(const string &tag, const map<string,string> &p);
|
||||
|
||||
@ -9,14 +9,14 @@ all: $(LIBS)
|
||||
|
||||
OBJS = conftree.o csguess.o debuglog.o \
|
||||
fstreewalk.o html.o htmlparse.o \
|
||||
mimehandler.o mimeparse.o mimetype.o pathut.o \
|
||||
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
|
||||
rclconfig.o rcldb.o readfile.o \
|
||||
textsplit.o transcode.o \
|
||||
unacpp.o unac.o
|
||||
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
|
||||
../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
|
||||
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
|
||||
../utils/pathut.cpp \
|
||||
../common/myhtmlparse.cpp ../utils/pathut.cpp \
|
||||
../common/rclconfig.cpp ../common/rcldb.cpp ../utils/readfile.cpp \
|
||||
../common/textsplit.cpp ../utils/transcode.cpp \
|
||||
../common/unacpp.cpp ../unac/unac.c
|
||||
@ -46,6 +46,8 @@ mimeparse.o : ../utils/mimeparse.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
mimetype.o : ../index/mimetype.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
myhtmlparse.o : ../common/myhtmlparse.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
pathut.o : ../utils/pathut.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
rclconfig.o : ../common/rclconfig.cpp
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.10 2005-01-28 08:41:40 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.11 2005-01-28 09:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <sys/stat.h>
|
||||
@ -85,18 +85,20 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
|
||||
try {
|
||||
switch (mode) {
|
||||
case DbUpd:
|
||||
ndb->wdb = Xapian::Auto::open(dir, Xapian::DB_CREATE_OR_OPEN);
|
||||
ndb->wdb =
|
||||
Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OPEN);
|
||||
ndb->updated.resize(ndb->wdb.get_lastdocid() + 1);
|
||||
ndb->iswritable = true;
|
||||
break;
|
||||
case DbTrunc:
|
||||
ndb->wdb = Xapian::Auto::open(dir, Xapian::DB_CREATE_OR_OVERWRITE);
|
||||
ndb->wdb =
|
||||
Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OVERWRITE);
|
||||
ndb->iswritable = true;
|
||||
break;
|
||||
case DbRO:
|
||||
default:
|
||||
ndb->iswritable = false;
|
||||
ndb->db = Xapian::Auto::open(dir, Xapian::DB_OPEN);
|
||||
ndb->db = Xapian::Database(dir);
|
||||
break;
|
||||
}
|
||||
ndb->isopen = true;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user