merged modifs from xapian/omega 0.8.5
This commit is contained in:
parent
44d2b70fdf
commit
6d35f5430c
@ -24,144 +24,19 @@
|
|||||||
|
|
||||||
// This file has code from omindex + an adaptor function for recoll at the end
|
// This file has code from omindex + an adaptor function for recoll at the end
|
||||||
|
|
||||||
#include "htmlparse.h"
|
|
||||||
#include "mimehandler.h"
|
#include "mimehandler.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "csguess.h"
|
#include "csguess.h"
|
||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
#include "transcode.h"
|
#include "transcode.h"
|
||||||
#include "mimeparse.h"
|
#include "mimeparse.h"
|
||||||
|
#include "myhtmlparse.h"
|
||||||
class MyHtmlParser : public HtmlParser {
|
#include "indextext.h"
|
||||||
public:
|
|
||||||
bool in_script_tag;
|
|
||||||
bool in_style_tag;
|
|
||||||
string title, sample, keywords, dump;
|
|
||||||
string ocharset; // This is the charset our user thinks the doc was
|
|
||||||
string charset; // This is the charset it was supposedly converted to
|
|
||||||
string doccharset; // Set this to value of charset parameter in header
|
|
||||||
bool indexing_allowed;
|
|
||||||
void process_text(const string &text);
|
|
||||||
void opening_tag(const string &tag, const map<string,string> &p);
|
|
||||||
void closing_tag(const string &tag);
|
|
||||||
MyHtmlParser() :
|
|
||||||
in_script_tag(false),
|
|
||||||
in_style_tag(false),
|
|
||||||
indexing_allowed(true) { }
|
|
||||||
};
|
|
||||||
|
|
||||||
void
|
|
||||||
MyHtmlParser::process_text(const string &text)
|
|
||||||
{
|
|
||||||
// some tags are meaningful mid-word so this is simplistic at best...
|
|
||||||
|
|
||||||
if (!in_script_tag && !in_style_tag) {
|
|
||||||
string::size_type firstchar = text.find_first_not_of(" \t\n\r");
|
|
||||||
if (firstchar != string::npos) {
|
|
||||||
dump += text.substr(firstchar);
|
|
||||||
dump += " ";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// lets hope that the charset includes ascii values...
|
|
||||||
static inline void
|
|
||||||
lowercase_term(string &term)
|
|
||||||
{
|
|
||||||
string::iterator i = term.begin();
|
|
||||||
while (i != term.end()) {
|
|
||||||
if (*i >= 'A' && *i <= 'Z')
|
|
||||||
*i = *i + 'a' - 'A';
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
|
||||||
void
|
|
||||||
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|
||||||
{
|
|
||||||
#if 0
|
|
||||||
cout << "TAG: " << tag << ": " << endl;
|
|
||||||
map<string, string>::const_iterator x;
|
|
||||||
for (x = p.begin(); x != p.end(); x++) {
|
|
||||||
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (tag == "meta") {
|
|
||||||
map<string, string>::const_iterator i, j;
|
|
||||||
if ((i = p.find("content")) != p.end()) {
|
|
||||||
if ((j = p.find("name")) != p.end()) {
|
|
||||||
string name = j->second;
|
|
||||||
lowercase_term(name);
|
|
||||||
if (name == "description") {
|
|
||||||
if (sample.empty()) {
|
|
||||||
sample = i->second;
|
|
||||||
decode_entities(sample);
|
|
||||||
}
|
|
||||||
} else if (name == "keywords") {
|
|
||||||
if (!keywords.empty()) keywords += ' ';
|
|
||||||
string tmp = i->second;
|
|
||||||
decode_entities(tmp);
|
|
||||||
keywords += tmp;
|
|
||||||
} else if (name == "robots") {
|
|
||||||
string val = i->second;
|
|
||||||
decode_entities(val);
|
|
||||||
lowercase_term(val);
|
|
||||||
if (val.find("none") != string::npos ||
|
|
||||||
val.find("noindex") != string::npos) {
|
|
||||||
indexing_allowed = false;
|
|
||||||
throw true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if ((j = p.find("http-equiv")) != p.end()) {
|
|
||||||
string hequiv = j->second;
|
|
||||||
lowercase_term(hequiv);
|
|
||||||
if (hequiv == "content-type") {
|
|
||||||
string value = i->second;
|
|
||||||
MimeHeaderValue p = parseMimeHeaderValue(value);
|
|
||||||
map<string, string>::const_iterator k;
|
|
||||||
if ((k = p.params.find("charset")) != p.params.end()) {
|
|
||||||
doccharset = k->second;
|
|
||||||
if (doccharset != ocharset) {
|
|
||||||
LOGDEB1(("Doc specified charset '%s' "
|
|
||||||
"differs from announced '%s'\n",
|
|
||||||
doccharset.c_str(), ocharset.c_str()));
|
|
||||||
throw true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (tag == "p" || tag == "br" || tag == "li") {
|
|
||||||
dump += "\n";
|
|
||||||
} else if (tag == "script") {
|
|
||||||
in_script_tag = true;
|
|
||||||
} else if (tag == "style") {
|
|
||||||
in_style_tag = true;
|
|
||||||
} else if (tag == "body") {
|
|
||||||
dump = "";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
MyHtmlParser::closing_tag(const string &tag)
|
|
||||||
{
|
|
||||||
if (tag == "title") {
|
|
||||||
title = dump;
|
|
||||||
dump = "";
|
|
||||||
} else if (tag == "script") {
|
|
||||||
in_script_tag = false;
|
|
||||||
} else if (tag == "style") {
|
|
||||||
in_style_tag = false;
|
|
||||||
} else if (tag == "body") {
|
|
||||||
throw true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||||
const string &mtype, Rcl::Doc &docout)
|
const string &mtype, Rcl::Doc &docout)
|
||||||
{
|
{
|
||||||
|
|||||||
@ -25,6 +25,8 @@
|
|||||||
|
|
||||||
#include "indextext.h" // for lowercase_term()
|
#include "indextext.h" // for lowercase_term()
|
||||||
|
|
||||||
|
#include "mimeparse.h"
|
||||||
|
|
||||||
void
|
void
|
||||||
MyHtmlParser::process_text(const string &text)
|
MyHtmlParser::process_text(const string &text)
|
||||||
{
|
{
|
||||||
@ -50,12 +52,11 @@ void
|
|||||||
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||||
{
|
{
|
||||||
#if 0
|
#if 0
|
||||||
cout << "<" << tag;
|
cout << "TAG: " << tag << ": " << endl;
|
||||||
map<string, string>::const_iterator x;
|
map<string, string>::const_iterator x;
|
||||||
for (x = p.begin(); x != p.end(); x++) {
|
for (x = p.begin(); x != p.end(); x++) {
|
||||||
cout << " " << x->first << "=\"" << x->second << "\"";
|
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
||||||
}
|
}
|
||||||
cout << ">\n";
|
|
||||||
#endif
|
#endif
|
||||||
if (tag.empty()) return;
|
if (tag.empty()) return;
|
||||||
switch (tag[0]) {
|
switch (tag[0]) {
|
||||||
@ -67,7 +68,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
dump = "";
|
dump = "";
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
if (tag == "blockquote" || tag == "br") {
|
||||||
|
dump += '\n';
|
||||||
|
pending_space = true;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case 'c':
|
case 'c':
|
||||||
if (tag == "center") pending_space = true;
|
if (tag == "center") pending_space = true;
|
||||||
@ -84,8 +88,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
break;
|
break;
|
||||||
case 'h':
|
case 'h':
|
||||||
// hr, and h1, ..., h6
|
// hr, and h1, ..., h6
|
||||||
if (tag.length() == 2 && strchr("r123456", tag[1]))
|
if (tag.length() == 2 && strchr("r123456", tag[1])) {
|
||||||
|
dump += '\n';
|
||||||
pending_space = true;
|
pending_space = true;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case 'i':
|
case 'i':
|
||||||
if (tag == "iframe" || tag == "img" || tag == "isindex" ||
|
if (tag == "iframe" || tag == "img" || tag == "isindex" ||
|
||||||
@ -95,11 +101,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
if (tag == "keygen") pending_space = true;
|
if (tag == "keygen") pending_space = true;
|
||||||
break;
|
break;
|
||||||
case 'l':
|
case 'l':
|
||||||
if (tag == "legend" || tag == "li" || tag == "listing")
|
if (tag == "legend" || tag == "li" || tag == "listing") {
|
||||||
|
dump += '\n';
|
||||||
pending_space = true;
|
pending_space = true;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case 'm':
|
case 'm':
|
||||||
if (tag == "meta") {
|
if (tag == "meta") {
|
||||||
|
LOGDEB(("Found META\n"));
|
||||||
map<string, string>::const_iterator i, j;
|
map<string, string>::const_iterator i, j;
|
||||||
if ((i = p.find("content")) != p.end()) {
|
if ((i = p.find("content")) != p.end()) {
|
||||||
if ((j = p.find("name")) != p.end()) {
|
if ((j = p.find("name")) != p.end()) {
|
||||||
@ -125,6 +134,26 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
throw true;
|
throw true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else if ((j = p.find("http-equiv")) != p.end()) {
|
||||||
|
LOGDEB(("Found http-equiv\n"));
|
||||||
|
string hequiv = j->second;
|
||||||
|
lowercase_term(hequiv);
|
||||||
|
if (hequiv == "content-type") {
|
||||||
|
string value = i->second;
|
||||||
|
MimeHeaderValue p = parseMimeHeaderValue(value);
|
||||||
|
map<string, string>::const_iterator k;
|
||||||
|
if ((k = p.params.find("charset")) !=
|
||||||
|
p.params.end()) {
|
||||||
|
doccharset = k->second;
|
||||||
|
if (doccharset != ocharset) {
|
||||||
|
LOGDEB1(("Doc specified charset '%s' "
|
||||||
|
"differs from announced '%s'\n",
|
||||||
|
doccharset.c_str(),
|
||||||
|
ocharset.c_str()));
|
||||||
|
throw true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -136,8 +165,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
if (tag == "ol" || tag == "option") pending_space = true;
|
if (tag == "ol" || tag == "option") pending_space = true;
|
||||||
break;
|
break;
|
||||||
case 'p':
|
case 'p':
|
||||||
if (tag == "p" || tag == "pre" || tag == "plaintext")
|
if (tag == "p" || tag == "pre" || tag == "plaintext") {
|
||||||
|
dump += '\n';
|
||||||
pending_space = true;
|
pending_space = true;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case 'q':
|
case 'q':
|
||||||
if (tag == "q") pending_space = true;
|
if (tag == "q") pending_space = true;
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
=======
|
|
||||||
/* myhtmlparse.h: subclass of HtmlParser for extracting text
|
/* myhtmlparse.h: subclass of HtmlParser for extracting text
|
||||||
*
|
*
|
||||||
* ----START-LICENCE----
|
* ----START-LICENCE----
|
||||||
@ -35,6 +34,9 @@ class MyHtmlParser : public HtmlParser {
|
|||||||
bool in_style_tag;
|
bool in_style_tag;
|
||||||
bool pending_space;
|
bool pending_space;
|
||||||
string title, sample, keywords, dump;
|
string title, sample, keywords, dump;
|
||||||
|
string ocharset; // This is the charset our user thinks the doc was
|
||||||
|
string charset; // This is the charset it was supposedly converted to
|
||||||
|
string doccharset; // Set this to value of charset parameter in header
|
||||||
bool indexing_allowed;
|
bool indexing_allowed;
|
||||||
void process_text(const string &text);
|
void process_text(const string &text);
|
||||||
void opening_tag(const string &tag, const map<string,string> &p);
|
void opening_tag(const string &tag, const map<string,string> &p);
|
||||||
|
|||||||
@ -9,14 +9,14 @@ all: $(LIBS)
|
|||||||
|
|
||||||
OBJS = conftree.o csguess.o debuglog.o \
|
OBJS = conftree.o csguess.o debuglog.o \
|
||||||
fstreewalk.o html.o htmlparse.o \
|
fstreewalk.o html.o htmlparse.o \
|
||||||
mimehandler.o mimeparse.o mimetype.o pathut.o \
|
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
|
||||||
rclconfig.o rcldb.o readfile.o \
|
rclconfig.o rcldb.o readfile.o \
|
||||||
textsplit.o transcode.o \
|
textsplit.o transcode.o \
|
||||||
unacpp.o unac.o
|
unacpp.o unac.o
|
||||||
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
|
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
|
||||||
../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
|
../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
|
||||||
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
|
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
|
||||||
../utils/pathut.cpp \
|
../common/myhtmlparse.cpp ../utils/pathut.cpp \
|
||||||
../common/rclconfig.cpp ../common/rcldb.cpp ../utils/readfile.cpp \
|
../common/rclconfig.cpp ../common/rcldb.cpp ../utils/readfile.cpp \
|
||||||
../common/textsplit.cpp ../utils/transcode.cpp \
|
../common/textsplit.cpp ../utils/transcode.cpp \
|
||||||
../common/unacpp.cpp ../unac/unac.c
|
../common/unacpp.cpp ../unac/unac.c
|
||||||
@ -46,6 +46,8 @@ mimeparse.o : ../utils/mimeparse.cpp
|
|||||||
$(CXX) $(CXXFLAGS) -c $<
|
$(CXX) $(CXXFLAGS) -c $<
|
||||||
mimetype.o : ../index/mimetype.cpp
|
mimetype.o : ../index/mimetype.cpp
|
||||||
$(CXX) $(CXXFLAGS) -c $<
|
$(CXX) $(CXXFLAGS) -c $<
|
||||||
|
myhtmlparse.o : ../common/myhtmlparse.cpp
|
||||||
|
$(CXX) $(CXXFLAGS) -c $<
|
||||||
pathut.o : ../utils/pathut.cpp
|
pathut.o : ../utils/pathut.cpp
|
||||||
$(CXX) $(CXXFLAGS) -c $<
|
$(CXX) $(CXXFLAGS) -c $<
|
||||||
rclconfig.o : ../common/rclconfig.cpp
|
rclconfig.o : ../common/rclconfig.cpp
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.10 2005-01-28 08:41:40 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.11 2005-01-28 09:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
@ -85,18 +85,20 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
|
|||||||
try {
|
try {
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case DbUpd:
|
case DbUpd:
|
||||||
ndb->wdb = Xapian::Auto::open(dir, Xapian::DB_CREATE_OR_OPEN);
|
ndb->wdb =
|
||||||
|
Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OPEN);
|
||||||
ndb->updated.resize(ndb->wdb.get_lastdocid() + 1);
|
ndb->updated.resize(ndb->wdb.get_lastdocid() + 1);
|
||||||
ndb->iswritable = true;
|
ndb->iswritable = true;
|
||||||
break;
|
break;
|
||||||
case DbTrunc:
|
case DbTrunc:
|
||||||
ndb->wdb = Xapian::Auto::open(dir, Xapian::DB_CREATE_OR_OVERWRITE);
|
ndb->wdb =
|
||||||
|
Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OVERWRITE);
|
||||||
ndb->iswritable = true;
|
ndb->iswritable = true;
|
||||||
break;
|
break;
|
||||||
case DbRO:
|
case DbRO:
|
||||||
default:
|
default:
|
||||||
ndb->iswritable = false;
|
ndb->iswritable = false;
|
||||||
ndb->db = Xapian::Auto::open(dir, Xapian::DB_OPEN);
|
ndb->db = Xapian::Database(dir);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
ndb->isopen = true;
|
ndb->isopen = true;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user