From 0b1827694770ed0e1d58f07bf4426f5c36a0dc14 Mon Sep 17 00:00:00 2001 From: dockes Date: Wed, 26 Jan 2005 11:47:27 +0000 Subject: [PATCH] ckpt --- src/internfile/mh_html.cpp | 227 ++++++++++++++++++++++++++++ src/internfile/mimehandler.cpp | 3 +- src/internfile/mimehandler.h | 5 +- src/lib/Makefile | 15 +- src/qtgui/recollmain.ui | 268 +++++++++++++++++++++------------ src/qtgui/recollmain.ui.h | 132 +++++++++++++--- src/query/qtry.cpp | 22 +-- src/rcldb/rcldb.cpp | 16 +- src/utils/Makefile | 10 +- 9 files changed, 562 insertions(+), 136 deletions(-) create mode 100644 src/internfile/mh_html.cpp diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp new file mode 100644 index 00000000..9b98e4d1 --- /dev/null +++ b/src/internfile/mh_html.cpp @@ -0,0 +1,227 @@ +/* htmlparse.cc: simple HTML parser for omega indexer + * + * ----START-LICENCE---- + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2001 Ananova Ltd + * Copyright 2002 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * -----END-LICENCE----- + */ + +// This file has code from omindex + an adaptor function for recoll at the end + +#include "htmlparse.h" +#include "mimehandler.h" +#include "debuglog.h" +#include "csguess.h" +#include "readfile.h" +#include "transcode.h" +#include "mimeparse.h" + +class MyHtmlParser : public HtmlParser { + public: + bool in_script_tag; + bool in_style_tag; + string title, sample, keywords, dump; + string charset; // This is the charset our user thinks the doc is in + string doccharset; // Set this to value of charset parameter in header + bool indexing_allowed; + void process_text(const string &text); + void opening_tag(const string &tag, const map &p); + void closing_tag(const string &tag); + MyHtmlParser() : + in_script_tag(false), + in_style_tag(false), + indexing_allowed(true) { } +}; + +void +MyHtmlParser::process_text(const string &text) +{ + // some tags are meaningful mid-word so this is simplistic at best... + + if (!in_script_tag && !in_style_tag) { + string::size_type firstchar = text.find_first_not_of(" \t\n\r"); + if (firstchar != string::npos) { + dump += text.substr(firstchar); + dump += " "; + } + } +} + +// lets hope that the charset includes ascii values... +static inline void +lowercase_term(string &term) +{ + string::iterator i = term.begin(); + while (i != term.end()) { + if (*i >= 'A' && *i <= 'Z') + *i = *i + 'a' - 'A'; + i++; + } +} + +#include +using namespace std; + + +void +MyHtmlParser::opening_tag(const string &tag, const map &p) +{ +#if 0 + cout << "TAG: " << tag << ": " << endl; + map::const_iterator x; + for (x = p.begin(); x != p.end(); x++) { + cout << " " << x->first << " -> '" << x->second << "'" << endl; + } +#endif + + if (tag == "meta") { + map::const_iterator i, j; + if ((i = p.find("content")) != p.end()) { + if ((j = p.find("name")) != p.end()) { + string name = j->second; + lowercase_term(name); + if (name == "description") { + if (sample.empty()) { + sample = i->second; + decode_entities(sample); + } + } else if (name == "keywords") { + if (!keywords.empty()) keywords += ' '; + string tmp = i->second; + decode_entities(tmp); + keywords += tmp; + } else if (name == "robots") { + string val = i->second; + decode_entities(val); + lowercase_term(val); + if (val.find("none") != string::npos || + val.find("noindex") != string::npos) { + indexing_allowed = false; + throw true; + } + } + } else if ((j = p.find("http-equiv")) != p.end()) { + string hequiv = j->second; + lowercase_term(hequiv); + if (hequiv == "content-type") { + string value = i->second; + MimeHeaderValue p = parseMimeHeaderValue(value); + map::const_iterator k; + if ((k = p.params.find("charset")) != p.params.end()) { + doccharset = k->second; + if (doccharset != charset) + throw true; + } + } + } + } + } else if (tag == "script") { + in_script_tag = true; + } else if (tag == "style") { + in_style_tag = true; + } else if (tag == "body") { + dump = ""; + } +} + +void +MyHtmlParser::closing_tag(const string &tag) +{ + if (tag == "title") { + title = dump; + dump = ""; + } else if (tag == "script") { + in_script_tag = false; + } else if (tag == "style") { + in_style_tag = false; + } else if (tag == "body") { + throw true; + } +} + +bool textHtmlToDoc(RclConfig *conf, const string &fn, + const string &mtype, Rcl::Doc &docout) +{ + LOGDEB(("textHtmlToDoc: %s\n", fn.c_str())); + string otext; + if (!file_to_string(fn, otext)) { + LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str())); + return false; + } + + // Character set handling: + + // - We first try to convert from the default configured charset + // (which may depend of the current directory) to utf-8. If this + // fails, we keep the original text + // - During parsing, if we find a charset parameter, and it differs from + // what we started with, we abort and restart with the parameter value + // instead of the configuration one. + string charset; + if (conf->guesscharset) { + charset = csguess(otext, conf->defcharset); + } else + charset = conf->defcharset; + + LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", + charset.c_str())); + + MyHtmlParser pres; + for (int pass = 0; pass < 2; pass++) { + string transcoded; + LOGDEB(("textHtmlToDoc: transcode from %s to %s\n", + charset.c_str(), "UTF-8")); + + MyHtmlParser p; + // Try transcoding. If it fails, use original text. + if (!transcode(otext, transcoded, charset, "UTF-8")) { + LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n", + charset.c_str())); + transcoded = otext; + // We don't know the charset, at all + p.charset = charset = ""; + } else { + // charset has the putative source charset, transcoded is now + // in utf-8 + p.charset = "utf-8"; + } + + try { + p.parse_html(transcoded); + } catch (bool) { + pres = p; + if (!pres.doccharset.empty() && pres.doccharset != charset) { + LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s'," + "reparse\n", charset.c_str(), + pres.doccharset.c_str())); + charset = pres.doccharset; + } else + break; + } + } + + Rcl::Doc out; + out.origcharset = charset; + out.text = pres.dump; + out.title = pres.title; + out.keywords = pres.keywords; + out.abstract = pres.sample; + docout = out; + return true; +} diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index 713819df..547ecec6 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.1 2005-01-25 14:37:57 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.2 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -52,6 +52,7 @@ class IHandler_Init { public: IHandler_Init() { ihandlers["text/plain"] = textPlainToDoc; + ihandlers["text/html"] = textHtmlToDoc; // Add new associations here when needed } }; diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index 2578e8d8..4cedb41c 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -1,6 +1,6 @@ #ifndef _MIMEHANDLER_H_INCLUDED_ #define _MIMEHANDLER_H_INCLUDED_ -/* @(#$Id: mimehandler.h,v 1.1 2005-01-25 14:37:57 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mimehandler.h,v 1.2 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -14,4 +14,7 @@ typedef bool (*MimeHandlerFunc)(RclConfig *, const std::string &, extern MimeHandlerFunc getMimeHandler(const std::string &mtype, ConfTree *mhandlers); +extern bool textHtmlToDoc(RclConfig *conf, const string &fn, + const string &mtype, Rcl::Doc &docout); + #endif /* _MIMEHANDLER_H_INCLUDED_ */ diff --git a/src/lib/Makefile b/src/lib/Makefile index c98709ba..975666f9 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -8,14 +8,15 @@ LIBS = librcl.a all: $(LIBS) OBJS = conftree.o csguess.o debuglog.o \ - fstreewalk.o \ - mimehandler.o mimetype.o pathut.o \ + fstreewalk.o html.o htmlparse.o \ + mimehandler.o mimeparse.o mimetype.o pathut.o \ rclconfig.o rcldb.o readfile.o \ textsplit.o transcode.o \ unacpp.o unac.o SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \ - ../utils/fstreewalk.cpp \ - ../common/mimehandler.cpp ../index/mimetype.cpp ../utils/pathut.cpp \ + ../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \ + ../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \ + ../utils/pathut.cpp \ ../common/rclconfig.cpp ../common/rcldb.cpp ../utils/readfile.cpp \ ../common/textsplit.cpp ../utils/transcode.cpp \ ../common/unacpp.cpp ../unac/unac.c @@ -35,8 +36,14 @@ debuglog.o : ../utils/debuglog.cpp $(CXX) $(CXXFLAGS) -c $< fstreewalk.o : ../utils/fstreewalk.cpp $(CXX) $(CXXFLAGS) -c $< +html.o : ../common/html.cpp + $(CXX) $(CXXFLAGS) -c $< +htmlparse.o : ../common/htmlparse.cpp + $(CXX) $(CXXFLAGS) -c $< mimehandler.o : ../common/mimehandler.cpp $(CXX) $(CXXFLAGS) -c $< +mimeparse.o : ../utils/mimeparse.cpp + $(CXX) $(CXXFLAGS) -c $< mimetype.o : ../index/mimetype.cpp $(CXX) $(CXXFLAGS) -c $< pathut.o : ../utils/pathut.cpp diff --git a/src/qtgui/recollmain.ui b/src/qtgui/recollmain.ui index 96e29b15..4edb92b6 100644 --- a/src/qtgui/recollmain.ui +++ b/src/qtgui/recollmain.ui @@ -8,106 +8,177 @@ 0 0 - 774 - 619 + 782 + 622 + + + 7 + 7 + 0 + 0 + + recoll - + - layout7 + unnamed - - - 11 - 11 - 752 - 41 - - - + - unnamed + layout3 - + - queryText + unnamed - - LineEditPanel - - - Sunken - - - - - Search - - - pushButton1 - - - - - spacer1 - - - Horizontal - - - Expanding - - - - 40 - 20 - - - - - - - - splitter9 - - - - 11 - 58 - 752 - 491 - - - - Horizontal - - - - resTextEdit - + + + layout2 + + + + unnamed + + + + queryText + + + LineEditPanel + + + Sunken + + + + + Search + + + Search + + + + + spacer1 + + + Horizontal + + + Expanding + + + + 329 + 20 + + + + + + + + splitter9 + + + + 7 + 7 + 1 + 1 + + + + + 0 + 0 + + + + Horizontal + + + + resTextEdit + + + + 7 + 7 + 2 + 0 + + + + RichText + + + true + + + + + splitter8 + + + + 7 + 7 + 3 + 0 + + + + Vertical + + + + previewTextEdit + + + + 7 + 7 + 0 + 2 + + + + RichText + + + true + + + + + metaTextEdit + + + + 7 + 7 + 0 + 1 + + + + RichText + + + true + + + + + - - - splitter8 - - - Vertical - - - - textEdit12 - - - - - textEdit13 - - - - + @@ -216,18 +287,24 @@ RecollMain resTextEdit_clicked(int,int) - - resTextEdit - returnPressed() - RecollMain - resTextEdit_returnPressed() - fileExitAction activated() RecollMain fileExit() + + queryText + returnPressed() + RecollMain + queryText_returnPressed() + + + Search + clicked() + RecollMain + Search_clicked() + recollmain.ui.h @@ -238,7 +315,8 @@ helpContents() helpAbout() resTextEdit_clicked( int par, int car ) - resTextEdit_returnPressed() + queryText_returnPressed() + Search_clicked() diff --git a/src/qtgui/recollmain.ui.h b/src/qtgui/recollmain.ui.h index 90b56b36..5bc46a61 100644 --- a/src/qtgui/recollmain.ui.h +++ b/src/qtgui/recollmain.ui.h @@ -32,30 +32,126 @@ void RecollMain::helpAbout() { } +#include +#include "rcldb.h" +#include "rclconfig.h" +#include "debuglog.h" +#include "mimehandler.h" + +extern RclConfig *rclconfig; +extern Rcl::Db *rcldb; + +static string plaintorich(const string &in) +{ + string out = "

"; + for (unsigned int i = 0; i < in.length() ; i++) { + if (in[i] == '\n') { + out += "
"; + } else { + out += in[i]; + } + if (i == 10) { + out += ""; + } + if (i == 20) { + out += ""; + } + + } + return out; +} void RecollMain::resTextEdit_clicked( int par, int car ) { fprintf(stderr, "Clicked at paragraph %d, char %d\n", par, car); -} + Rcl::Doc doc; + doc.erase(); + if (rcldb->getDoc(par, doc)) { + + // Go to the file system to retrieve / convert the document text + // for preview: -#include "qfontdialog.h" + // Look for appropriate handler + MimeHandlerFunc fun = + getMimeHandler(doc.mimetype, rclconfig->getMimeConf()); + if (!fun) { + QMessageBox::warning(0, "Recoll", + QString("No mime handler for mime type ") + + doc.mimetype.c_str()); + return; + } -#define BS 200000 -void RecollMain::resTextEdit_returnPressed() -{ - fprintf(stderr, "ReturnPressed()\n"); - resTextEdit->setFont( QFontDialog::getFont( 0, resTextEdit->font() ) ); - const char *fname = "utf8.txt"; - FILE *fp = fopen(fname, "r"); - if (fp) { - char buf[BS]; - memset(buf,0, sizeof(buf)); - int n = fread(buf, 1, BS-1, fp); - fclose(fp); - QString str = QString::fromUtf8(buf, n); - resTextEdit->setTextFormat(RichText); - resTextEdit->setText(str); + string fn = doc.url.substr(6, string::npos); + Rcl::Doc fdoc; + if (!fun(rclconfig, fn, doc.mimetype, fdoc)) { + QMessageBox::warning(0, "Recoll", + QString("Failed to convert document for preview!\n") + + fn.c_str() + " mimetype " + + doc.mimetype.c_str()); + return; + } + + string rich = plaintorich(fdoc.text); + +#if 0 + //Highlighting; pass a list of (search term, style name) to plaintorich + // and create the corresponding styles with different colors here + // We need to : + // - Break the query into terms : wait for the query analyzer + // - Break the text into words. This should use a version of + // textsplit with an option to keep the punctuation (see how to do + // this). We do want the same splitter code to be used here and + // when indexing. + QStyleSheetItem *item = + new QStyleSheetItem( previewTextEdit->styleSheet(), "mytag" ); + item->setColor("red"); + item->setFontWeight(QFont::Bold); +#endif + QString str = QString::fromUtf8(rich.c_str(), rich.length()); + + previewTextEdit->setTextFormat(RichText); + previewTextEdit->setText(str); } - +} + +void RecollMain::queryText_returnPressed() +{ + LOGDEB(("RecollMain::queryText_returnPressed()\n")); + resTextEdit->clear(); + previewTextEdit->clear(); + + string rawq = queryText->text(); + rcldb->setQuery(rawq); + Rcl::Doc doc; + + // Insert results if any in result list window + QString result; + resTextEdit->append(""); + for (int i = 0;; i++) { + doc.erase(); + if (!rcldb->getDoc(i, doc)) + break; + LOGDEB(("Url: %s\n", doc.url.c_str())); + LOGDEB(("Mimetype: \n", doc.mimetype.c_str())); + LOGDEB(("Mtime: \n", doc.mtime.c_str())); + LOGDEB(("Origcharset: \n", doc.origcharset.c_str())); + LOGDEB(("Title: \n", doc.title.c_str())); + LOGDEB(("Text: \n", doc.text.c_str())); + LOGDEB(("Keywords: \n", doc.keywords.c_str())); + LOGDEB(("Abstract: \n", doc.abstract.c_str())); + + result = "

" + doc.url + "

"; + resTextEdit->append(result); + } + resTextEdit->append("
"); + + // Display preview for 1st doc in list + resTextEdit_clicked(0, 0); +} + + +void RecollMain::Search_clicked() +{ + queryText_returnPressed(); } diff --git a/src/query/qtry.cpp b/src/query/qtry.cpp index 971dc0d0..994b3169 100644 --- a/src/query/qtry.cpp +++ b/src/query/qtry.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: qtry.cpp,v 1.2 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: qtry.cpp,v 1.3 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes"; #endif // Tests with the query interface @@ -62,20 +62,20 @@ int main(int argc, char **argv) if (argc < 1) Usage(); - RclConfig *config = new RclConfig; + RclConfig *rclconfig = new RclConfig; - if (!config->ok()) + if (!rclconfig->ok()) cerr << "Config could not be built" << endl; string dbdir; - if (config->getConfParam(string("dbdir"), dbdir) == 0) { + if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) { cerr << "No database directory in configuration" << endl; exit(1); } - Rcl::Db *db = new Rcl::Db; + Rcl::Db *rcldb = new Rcl::Db; - if (!db->open(dbdir, Rcl::Db::DbRO)) { + if (!rcldb->open(dbdir, Rcl::Db::DbRO)) { fprintf(stderr, "Could not open database\n"); exit(1); } @@ -84,12 +84,12 @@ int main(int argc, char **argv) string query; while (argc--) query += string(*argv++) + " " ; - db->setQuery(query); + rcldb->setQuery(query); int i = 0; Rcl::Doc doc; for (i=0;;i++) { doc.erase(); - if (!db->getDoc(i, doc)) + if (!rcldb->getDoc(i, doc)) break; cout << "Url: " << doc.url << endl; @@ -107,7 +107,7 @@ int main(int argc, char **argv) // Look for appropriate handler MimeHandlerFunc fun = getMimeHandler(doc.mimetype, - config->getMimeConf()); + rclconfig->getMimeConf()); if (!fun) { cout << "No mime handler !" << endl; continue; @@ -116,7 +116,7 @@ int main(int argc, char **argv) cout << "Filename: " << fn << endl; Rcl::Doc fdoc; - if (!fun(config, fn, doc.mimetype, fdoc)) { + if (!fun(rclconfig, fn, doc.mimetype, fdoc)) { cout << "Failed to convert/preview document!" << endl; continue; } @@ -125,7 +125,7 @@ int main(int argc, char **argv) transcode(fdoc.text, printable, "UTF-8", outencoding); cout << printable << endl; } - delete db; + delete rcldb; cerr << "Exiting" << endl; exit(0); } diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index a6d1a3f0..d6466d75 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.7 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.8 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -201,6 +201,7 @@ bool dumb_string(const string &in, string &out) bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc) { + LOGDEB(("Rcl::Db::add: fn %s\n", fn.c_str())); if (pdata == 0) return false; Native *ndb = (Native *)pdata; @@ -226,24 +227,29 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc) string noacc; if (!unac_cpp(doc.title, noacc)) { + LOGERR(("Rcl::Db::add: unac failed\n")); return false; } splitter.text_to_words(noacc); + LOGDEB(("Rcl::Db::add: doc split\n")); splitData.basepos += splitData.curpos + 100; if (!dumb_string(doc.text, noacc)) { + LOGERR(("Rcl::Db::add: dum_string failed\n")); return false; } splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; if (!dumb_string(doc.keywords, noacc)) { + LOGERR(("Rcl::Db::add: dum_string failed\n")); return false; } splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; if (!dumb_string(doc.abstract, noacc)) { + LOGERR(("Rcl::Db::add: dum_string failed\n")); return false; } splitter.text_to_words(noacc); @@ -263,20 +269,20 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc) #if 0 if (did < updated.size()) { updated[did] = true; - LOGDEB1(("%s updated\n", fnc)); + LOGDEB(("%s updated\n", fnc)); } else { - LOGDEB1(("%s added\n", fnc)); + LOGDEB(("%s added\n", fnc)); } #endif } catch (...) { // FIXME: is this ever actually needed? ndb->wdb.add_document(newdocument); - LOGDEB1(("%s added (failed re-seek for duplicate).\n", fnc)); + LOGDEB(("%s added (failed re-seek for duplicate).\n", fnc)); } } else { try { ndb->wdb.add_document(newdocument); - LOGDEB1(("%s added\n", fnc)); + LOGDEB(("%s added\n", fnc)); } catch (...) { LOGERR(("%s : Got exception while adding doc\n", fnc)); return false; diff --git a/src/utils/Makefile b/src/utils/Makefile index 1c0ae9bf..0697fd95 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -3,7 +3,7 @@ CXXFLAGS = -I. BIGLIB = ../lib/librcl.a -PROGS = trfstreewalk trpathut execmd transcode +PROGS = trfstreewalk trpathut execmd transcode trmimeparse all: $(PROGS) FSTREEWALK_OBJS= trfstreewalk.o fstreewalk.o pathut.o @@ -30,5 +30,13 @@ transcode : $(TRANSCODE_OBJS) trtranscode.o : ../utils/transcode.cpp $(CXX) $(CXXFLAGS) -DTEST_TRANSCODE -c -o trtranscode.o \ transcode.cpp + +MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB) +mimeparse : $(MIMEPARSE_OBJS) + $(CXX) $(CXXFLAGS) -o mimeparse $(MIMEPARSE_OBJS) \ + -L/usr/local/lib -liconv +trmimeparse.o : ../utils/mimeparse.cpp + $(CXX) $(CXXFLAGS) -DTEST_MIMEPARSE -c -o trmimeparse.o \ + mimeparse.cpp clean: rm -f *.o $(PROGS)