From 6d35f5430cb5e1776578299b6d25b810eec5c53b Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 28 Jan 2005 09:37:37 +0000 Subject: [PATCH] merged modifs from xapian/omega 0.8.5 --- src/internfile/mh_html.cpp | 129 +-------------------------------- src/internfile/myhtmlparse.cpp | 45 ++++++++++-- src/internfile/myhtmlparse.h | 4 +- src/lib/Makefile | 6 +- src/rcldb/rcldb.cpp | 10 ++- 5 files changed, 53 insertions(+), 141 deletions(-) diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index 374d43ed..3a6076b1 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -24,144 +24,19 @@ // This file has code from omindex + an adaptor function for recoll at the end -#include "htmlparse.h" #include "mimehandler.h" #include "debuglog.h" #include "csguess.h" #include "readfile.h" #include "transcode.h" #include "mimeparse.h" - -class MyHtmlParser : public HtmlParser { - public: - bool in_script_tag; - bool in_style_tag; - string title, sample, keywords, dump; - string ocharset; // This is the charset our user thinks the doc was - string charset; // This is the charset it was supposedly converted to - string doccharset; // Set this to value of charset parameter in header - bool indexing_allowed; - void process_text(const string &text); - void opening_tag(const string &tag, const map &p); - void closing_tag(const string &tag); - MyHtmlParser() : - in_script_tag(false), - in_style_tag(false), - indexing_allowed(true) { } -}; - -void -MyHtmlParser::process_text(const string &text) -{ - // some tags are meaningful mid-word so this is simplistic at best... - - if (!in_script_tag && !in_style_tag) { - string::size_type firstchar = text.find_first_not_of(" \t\n\r"); - if (firstchar != string::npos) { - dump += text.substr(firstchar); - dump += " "; - } - } -} - -// lets hope that the charset includes ascii values... -static inline void -lowercase_term(string &term) -{ - string::iterator i = term.begin(); - while (i != term.end()) { - if (*i >= 'A' && *i <= 'Z') - *i = *i + 'a' - 'A'; - i++; - } -} +#include "myhtmlparse.h" +#include "indextext.h" #include using namespace std; -void -MyHtmlParser::opening_tag(const string &tag, const map &p) -{ -#if 0 - cout << "TAG: " << tag << ": " << endl; - map::const_iterator x; - for (x = p.begin(); x != p.end(); x++) { - cout << " " << x->first << " -> '" << x->second << "'" << endl; - } -#endif - - if (tag == "meta") { - map::const_iterator i, j; - if ((i = p.find("content")) != p.end()) { - if ((j = p.find("name")) != p.end()) { - string name = j->second; - lowercase_term(name); - if (name == "description") { - if (sample.empty()) { - sample = i->second; - decode_entities(sample); - } - } else if (name == "keywords") { - if (!keywords.empty()) keywords += ' '; - string tmp = i->second; - decode_entities(tmp); - keywords += tmp; - } else if (name == "robots") { - string val = i->second; - decode_entities(val); - lowercase_term(val); - if (val.find("none") != string::npos || - val.find("noindex") != string::npos) { - indexing_allowed = false; - throw true; - } - } - } else if ((j = p.find("http-equiv")) != p.end()) { - string hequiv = j->second; - lowercase_term(hequiv); - if (hequiv == "content-type") { - string value = i->second; - MimeHeaderValue p = parseMimeHeaderValue(value); - map::const_iterator k; - if ((k = p.params.find("charset")) != p.params.end()) { - doccharset = k->second; - if (doccharset != ocharset) { - LOGDEB1(("Doc specified charset '%s' " - "differs from announced '%s'\n", - doccharset.c_str(), ocharset.c_str())); - throw true; - } - } - } - } - } - } else if (tag == "p" || tag == "br" || tag == "li") { - dump += "\n"; - } else if (tag == "script") { - in_script_tag = true; - } else if (tag == "style") { - in_style_tag = true; - } else if (tag == "body") { - dump = ""; - } -} - -void -MyHtmlParser::closing_tag(const string &tag) -{ - if (tag == "title") { - title = dump; - dump = ""; - } else if (tag == "script") { - in_script_tag = false; - } else if (tag == "style") { - in_style_tag = false; - } else if (tag == "body") { - throw true; - } -} - bool textHtmlToDoc(RclConfig *conf, const string &fn, const string &mtype, Rcl::Doc &docout) { diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index 2594d5a3..db087822 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -25,6 +25,8 @@ #include "indextext.h" // for lowercase_term() +#include "mimeparse.h" + void MyHtmlParser::process_text(const string &text) { @@ -50,12 +52,11 @@ void MyHtmlParser::opening_tag(const string &tag, const map &p) { #if 0 - cout << "<" << tag; + cout << "TAG: " << tag << ": " << endl; map::const_iterator x; for (x = p.begin(); x != p.end(); x++) { - cout << " " << x->first << "=\"" << x->second << "\""; + cout << " " << x->first << " -> '" << x->second << "'" << endl; } - cout << ">\n"; #endif if (tag.empty()) return; switch (tag[0]) { @@ -67,7 +68,10 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) dump = ""; break; } - if (tag == "blockquote" || tag == "br") pending_space = true; + if (tag == "blockquote" || tag == "br") { + dump += '\n'; + pending_space = true; + } break; case 'c': if (tag == "center") pending_space = true; @@ -84,8 +88,10 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) break; case 'h': // hr, and h1, ..., h6 - if (tag.length() == 2 && strchr("r123456", tag[1])) + if (tag.length() == 2 && strchr("r123456", tag[1])) { + dump += '\n'; pending_space = true; + } break; case 'i': if (tag == "iframe" || tag == "img" || tag == "isindex" || @@ -95,11 +101,14 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) if (tag == "keygen") pending_space = true; break; case 'l': - if (tag == "legend" || tag == "li" || tag == "listing") + if (tag == "legend" || tag == "li" || tag == "listing") { + dump += '\n'; pending_space = true; + } break; case 'm': if (tag == "meta") { + LOGDEB(("Found META\n")); map::const_iterator i, j; if ((i = p.find("content")) != p.end()) { if ((j = p.find("name")) != p.end()) { @@ -125,6 +134,26 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) throw true; } } + } else if ((j = p.find("http-equiv")) != p.end()) { + LOGDEB(("Found http-equiv\n")); + string hequiv = j->second; + lowercase_term(hequiv); + if (hequiv == "content-type") { + string value = i->second; + MimeHeaderValue p = parseMimeHeaderValue(value); + map::const_iterator k; + if ((k = p.params.find("charset")) != + p.params.end()) { + doccharset = k->second; + if (doccharset != ocharset) { + LOGDEB1(("Doc specified charset '%s' " + "differs from announced '%s'\n", + doccharset.c_str(), + ocharset.c_str())); + throw true; + } + } + } } } break; @@ -136,8 +165,10 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) if (tag == "ol" || tag == "option") pending_space = true; break; case 'p': - if (tag == "p" || tag == "pre" || tag == "plaintext") + if (tag == "p" || tag == "pre" || tag == "plaintext") { + dump += '\n'; pending_space = true; + } break; case 'q': if (tag == "q") pending_space = true; diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h index 0fa1dd1e..6d5536de 100644 --- a/src/internfile/myhtmlparse.h +++ b/src/internfile/myhtmlparse.h @@ -1,4 +1,3 @@ -======= /* myhtmlparse.h: subclass of HtmlParser for extracting text * * ----START-LICENCE---- @@ -35,6 +34,9 @@ class MyHtmlParser : public HtmlParser { bool in_style_tag; bool pending_space; string title, sample, keywords, dump; + string ocharset; // This is the charset our user thinks the doc was + string charset; // This is the charset it was supposedly converted to + string doccharset; // Set this to value of charset parameter in header bool indexing_allowed; void process_text(const string &text); void opening_tag(const string &tag, const map &p); diff --git a/src/lib/Makefile b/src/lib/Makefile index 975666f9..e1a437e2 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -9,14 +9,14 @@ all: $(LIBS) OBJS = conftree.o csguess.o debuglog.o \ fstreewalk.o html.o htmlparse.o \ - mimehandler.o mimeparse.o mimetype.o pathut.o \ + mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \ rclconfig.o rcldb.o readfile.o \ textsplit.o transcode.o \ unacpp.o unac.o SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \ ../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \ ../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \ - ../utils/pathut.cpp \ + ../common/myhtmlparse.cpp ../utils/pathut.cpp \ ../common/rclconfig.cpp ../common/rcldb.cpp ../utils/readfile.cpp \ ../common/textsplit.cpp ../utils/transcode.cpp \ ../common/unacpp.cpp ../unac/unac.c @@ -46,6 +46,8 @@ mimeparse.o : ../utils/mimeparse.cpp $(CXX) $(CXXFLAGS) -c $< mimetype.o : ../index/mimetype.cpp $(CXX) $(CXXFLAGS) -c $< +myhtmlparse.o : ../common/myhtmlparse.cpp + $(CXX) $(CXXFLAGS) -c $< pathut.o : ../utils/pathut.cpp $(CXX) $(CXXFLAGS) -c $< rclconfig.o : ../common/rclconfig.cpp diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 7e02bf32..890b6703 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.10 2005-01-28 08:41:40 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.11 2005-01-28 09:37:37 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -85,18 +85,20 @@ bool Rcl::Db::open(const string& dir, OpenMode mode) try { switch (mode) { case DbUpd: - ndb->wdb = Xapian::Auto::open(dir, Xapian::DB_CREATE_OR_OPEN); + ndb->wdb = + Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OPEN); ndb->updated.resize(ndb->wdb.get_lastdocid() + 1); ndb->iswritable = true; break; case DbTrunc: - ndb->wdb = Xapian::Auto::open(dir, Xapian::DB_CREATE_OR_OVERWRITE); + ndb->wdb = + Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OVERWRITE); ndb->iswritable = true; break; case DbRO: default: ndb->iswritable = false; - ndb->db = Xapian::Auto::open(dir, Xapian::DB_OPEN); + ndb->db = Xapian::Database(dir); break; } ndb->isopen = true;