From 6d35f5430cb5e1776578299b6d25b810eec5c53b Mon Sep 17 00:00:00 2001
From: dockes <none@none>
Date: Fri, 28 Jan 2005 09:37:37 +0000
Subject: [PATCH] Merge modifications from xapian/omega 0.8.5

---
 src/internfile/mh_html.cpp     | 129 +--------------------------------
 src/internfile/myhtmlparse.cpp |  45 ++++++++++--
 src/internfile/myhtmlparse.h   |   4 +-
 src/lib/Makefile               |   6 +-
 src/rcldb/rcldb.cpp            |  10 ++-
 5 files changed, 53 insertions(+), 141 deletions(-)
diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp
index 374d43ed..3a6076b1 100644
--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@@ -24,144 +24,19 @@
 
 // This file has code from omindex + an adaptor function for recoll at the end
 
-#include "htmlparse.h"
 #include "mimehandler.h"
 #include "debuglog.h"
 #include "csguess.h"
 #include "readfile.h"
 #include "transcode.h"
 #include "mimeparse.h"
-
-class MyHtmlParser : public HtmlParser {
- public:
-    bool in_script_tag;
-    bool in_style_tag;
-    string title, sample, keywords, dump;
-    string ocharset; // This is the charset our user thinks the doc was
-    string charset; // This is the charset it was supposedly converted to
-    string doccharset; // Set this to value of charset parameter in header
-    bool indexing_allowed;
-    void process_text(const string &text);
-    void opening_tag(const string &tag, const map<string,string> &p);
-    void closing_tag(const string &tag);
-    MyHtmlParser() :
-	in_script_tag(false),
-	in_style_tag(false),
-	indexing_allowed(true) { }
-};
-
-void
-MyHtmlParser::process_text(const string &text)
-{
-    // some tags are meaningful mid-word so this is simplistic at best...
-
-    if (!in_script_tag && !in_style_tag) {
-	string::size_type firstchar = text.find_first_not_of(" \t\n\r");
-	if (firstchar != string::npos) {
-	    dump += text.substr(firstchar);
-	    dump += " ";
-	}
-    }
-}
-
-// lets hope that the charset includes ascii values...
-static inline void
-lowercase_term(string &term)
-{
-    string::iterator i = term.begin();
-    while (i != term.end()) {
-	if (*i >= 'A' && *i <= 'Z')
-	    *i = *i + 'a' - 'A';
-        i++;
-    }
-}
+#include "myhtmlparse.h"
+#include "indextext.h"
 
 #include <iostream>
 using namespace std;
 
 
-void
-MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
-{
-#if 0
-    cout << "TAG: " << tag << ": " << endl;
-    map<string, string>::const_iterator x;
-    for (x = p.begin(); x != p.end(); x++) {
-	cout << "  " << x->first << " -> '" << x->second << "'" << endl;
-    }
-#endif
-    
-    if (tag == "meta") {
-	map<string, string>::const_iterator i, j;
-	if ((i = p.find("content")) != p.end()) {
-	    if ((j = p.find("name")) != p.end()) {
-		string name = j->second;
-		lowercase_term(name);
-		if (name == "description") {
-		    if (sample.empty()) {
-			sample = i->second;
-			decode_entities(sample);
-		    }
-		} else if (name == "keywords") {
-		    if (!keywords.empty()) keywords += ' ';
-		    string tmp = i->second;
-		    decode_entities(tmp);
-		    keywords += tmp;
-		} else if (name == "robots") {
-		    string val = i->second;
-		    decode_entities(val);
-		    lowercase_term(val);
-		    if (val.find("none") != string::npos ||
-			val.find("noindex") != string::npos) {
-			indexing_allowed = false;
-			throw true;
-		    }
-		}
-	    } else if ((j = p.find("http-equiv")) != p.end()) {
-		string hequiv = j->second;
-		lowercase_term(hequiv);
-		if (hequiv == "content-type") {
-		    string value = i->second;
-		    MimeHeaderValue p = parseMimeHeaderValue(value);
-		    map<string, string>::const_iterator k;
-		    if ((k = p.params.find("charset")) != p.params.end()) {
-			doccharset = k->second;
-			if (doccharset != ocharset) {
-			    LOGDEB1(("Doc specified charset '%s' "
-				     "differs from announced '%s'\n",
-				     doccharset.c_str(), ocharset.c_str()));
-			    throw true;
-			}
-		    }
-		}
-	    }
-	}
-    } else if (tag == "p" || tag == "br" || tag == "li") {
-	dump += "\n";
-    } else if (tag == "script") {
-	in_script_tag = true;
-    } else if (tag == "style") {
-	in_style_tag = true;
-    } else if (tag == "body") {
-	dump = "";
-    }
-}
-
-void
-MyHtmlParser::closing_tag(const string &tag)
-{
-    if (tag == "title") {
-	title = dump;
-	dump = "";
-    } else if (tag == "script") {
-	in_script_tag = false;
-    } else if (tag == "style") {
-	in_style_tag = false;
-    } else if (tag == "body") {
-	throw true;
-    }
-}
-
 bool textHtmlToDoc(RclConfig *conf, const string &fn, 
 			 const string &mtype, Rcl::Doc &docout)
 {
diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
index 2594d5a3..db087822 100644
--- a/src/internfile/myhtmlparse.cpp
+++ b/src/internfile/myhtmlparse.cpp
@@ -25,6 +25,8 @@
 
 #include "indextext.h" // for lowercase_term()
 
+#include "mimeparse.h"
+
 void
 MyHtmlParser::process_text(const string &text)
 {
@@ -50,12 +52,11 @@ void
 MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 {
 #if 0
-    cout << "<" << tag;
+    cout << "TAG: " << tag << ": " << endl;
     map<string, string>::const_iterator x;
     for (x = p.begin(); x != p.end(); x++) {
-	cout << " " << x->first << "=\"" << x->second << "\"";
+	cout << "  " << x->first << " -> '" << x->second << "'" << endl;
     }
-    cout << ">\n";
 #endif
     if (tag.empty()) return;
     switch (tag[0]) {
@@ -67,7 +68,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 		dump = "";
 		break;
 	    }
-	    if (tag == "blockquote" || tag == "br") pending_space = true;
+	    if (tag == "blockquote" || tag == "br") {
+		dump += '\n';
+		pending_space = true;
+	    }
 	    break;
 	case 'c':
 	    if (tag == "center") pending_space = true;
@@ -84,8 +88,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 	    break;
 	case 'h':
 	    // hr, and h1, ..., h6
-	    if (tag.length() == 2 && strchr("r123456", tag[1]))
+	    if (tag.length() == 2 && strchr("r123456", tag[1])) {
+		dump += '\n';
 		pending_space = true;
+	    }
 	    break;
 	case 'i':
 	    if (tag == "iframe" || tag == "img" || tag == "isindex" ||
@@ -95,11 +101,14 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 	    if (tag == "keygen") pending_space = true;
 	    break;
 	case 'l':
-	    if (tag == "legend" || tag == "li" || tag == "listing")
+	    if (tag == "legend" || tag == "li" || tag == "listing") {
+		dump += '\n';
 		pending_space = true;
+	    }
 	    break;
 	case 'm':
 	    if (tag == "meta") {
+		    LOGDEB(("Found META\n"));
 		map<string, string>::const_iterator i, j;
 		if ((i = p.find("content")) != p.end()) {
 		    if ((j = p.find("name")) != p.end()) {
@@ -125,6 +134,26 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 				throw true;
 			    }
 			}
+		    } else if ((j = p.find("http-equiv")) != p.end()) {
+			LOGDEB(("Found http-equiv\n"));
+			string hequiv = j->second;
+			lowercase_term(hequiv);
+			if (hequiv == "content-type") {
+			    string value = i->second;
+			    MimeHeaderValue p = parseMimeHeaderValue(value);
+			    map<string, string>::const_iterator k;
+			    if ((k = p.params.find("charset")) != 
+				p.params.end()) {
+				doccharset = k->second;
+				if (doccharset != ocharset) {
+				    LOGDEB1(("Doc specified charset '%s' "
+					     "differs from announced '%s'\n",
+					     doccharset.c_str(), 
+					     ocharset.c_str()));
+				    throw true;
+				}
+			    }
+			}
 		    }
 		}
 		break;
@@ -136,8 +165,10 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
 	    if (tag == "ol" || tag == "option") pending_space = true;
 	    break;
 	case 'p':
-	    if (tag == "p" || tag == "pre" || tag == "plaintext")
+	    if (tag == "p" || tag == "pre" || tag == "plaintext") {
+		dump += '\n';
 		pending_space = true;
+	    }
 	    break;
 	case 'q':
 	    if (tag == "q") pending_space = true;
diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h
index 0fa1dd1e..6d5536de 100644
--- a/src/internfile/myhtmlparse.h
+++ b/src/internfile/myhtmlparse.h
@@ -1,4 +1,3 @@
-=======
 /* myhtmlparse.h: subclass of HtmlParser for extracting text
  *
  * ----START-LICENCE----
@@ -35,6 +34,9 @@ class MyHtmlParser : public HtmlParser {
 	bool in_style_tag;
 	bool pending_space;
     	string title, sample, keywords, dump;
+        string ocharset; // This is the charset our user thinks the doc was
+        string charset; // This is the charset it was supposedly converted to
+        string doccharset; // Set this to value of charset parameter in header
 	bool indexing_allowed;
 	void process_text(const string &text);
 	void opening_tag(const string &tag, const map<string,string> &p);
diff --git a/src/lib/Makefile b/src/lib/Makefile
index 975666f9..e1a437e2 100644
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@@ -9,14 +9,14 @@ all: $(LIBS)
 
 OBJS = conftree.o csguess.o debuglog.o \
      fstreewalk.o html.o htmlparse.o \
-     mimehandler.o mimeparse.o mimetype.o pathut.o \
+     mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
      rclconfig.o rcldb.o readfile.o \
      textsplit.o transcode.o \
      unacpp.o unac.o
 SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
      ../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
      ../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
-     ../utils/pathut.cpp \
+     ../common/myhtmlparse.cpp ../utils/pathut.cpp \
      ../common/rclconfig.cpp ../common/rcldb.cpp ../utils/readfile.cpp \
      ../common/textsplit.cpp ../utils/transcode.cpp \
      ../common/unacpp.cpp ../unac/unac.c
@@ -46,6 +46,8 @@ mimeparse.o : ../utils/mimeparse.cpp
 	$(CXX) $(CXXFLAGS) -c $<
 mimetype.o : ../index/mimetype.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
+myhtmlparse.o : ../common/myhtmlparse.cpp 
+	$(CXX) $(CXXFLAGS) -c $<
 pathut.o : ../utils/pathut.cpp 
 	$(CXX) $(CXXFLAGS) -c $<
 rclconfig.o : ../common/rclconfig.cpp 
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 7e02bf32..890b6703 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.10 2005-01-28 08:41:40 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.11 2005-01-28 09:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 
 #include <sys/stat.h>
@@ -85,18 +85,20 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
     try {
 	switch (mode) {
 	case DbUpd:
-	    ndb->wdb = Xapian::Auto::open(dir, Xapian::DB_CREATE_OR_OPEN);
+	    ndb->wdb = 
+		Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OPEN);
 	    ndb->updated.resize(ndb->wdb.get_lastdocid() + 1);
 	    ndb->iswritable = true;
 	    break;
 	case DbTrunc:
-	    ndb->wdb = Xapian::Auto::open(dir, Xapian::DB_CREATE_OR_OVERWRITE);
+	    ndb->wdb = 
+		Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OVERWRITE);
 	    ndb->iswritable = true;
 	    break;
 	case DbRO:
 	default:
 	    ndb->iswritable = false;
-	    ndb->db = Xapian::Auto::open(dir, Xapian::DB_OPEN);
+	    ndb->db = Xapian::Database(dir);
 	    break;
 	}
 	ndb->isopen = true;