ckpt
This commit is contained in:
parent
c14b34647a
commit
0b18276947
227
src/internfile/mh_html.cpp
Normal file
227
src/internfile/mh_html.cpp
Normal file
@ -0,0 +1,227 @@
|
||||
/* htmlparse.cc: simple HTML parser for omega indexer
|
||||
*
|
||||
* ----START-LICENCE----
|
||||
* Copyright 1999,2000,2001 BrightStation PLC
|
||||
* Copyright 2001 Ananova Ltd
|
||||
* Copyright 2002 Olly Betts
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
||||
* USA
|
||||
* -----END-LICENCE-----
|
||||
*/
|
||||
|
||||
// This file has code from omindex + an adaptor function for recoll at the end
|
||||
|
||||
#include "htmlparse.h"
|
||||
#include "mimehandler.h"
|
||||
#include "debuglog.h"
|
||||
#include "csguess.h"
|
||||
#include "readfile.h"
|
||||
#include "transcode.h"
|
||||
#include "mimeparse.h"
|
||||
|
||||
class MyHtmlParser : public HtmlParser {
|
||||
public:
|
||||
bool in_script_tag;
|
||||
bool in_style_tag;
|
||||
string title, sample, keywords, dump;
|
||||
string charset; // This is the charset our user thinks the doc is in
|
||||
string doccharset; // Set this to value of charset parameter in header
|
||||
bool indexing_allowed;
|
||||
void process_text(const string &text);
|
||||
void opening_tag(const string &tag, const map<string,string> &p);
|
||||
void closing_tag(const string &tag);
|
||||
MyHtmlParser() :
|
||||
in_script_tag(false),
|
||||
in_style_tag(false),
|
||||
indexing_allowed(true) { }
|
||||
};
|
||||
|
||||
void
|
||||
MyHtmlParser::process_text(const string &text)
|
||||
{
|
||||
// some tags are meaningful mid-word so this is simplistic at best...
|
||||
|
||||
if (!in_script_tag && !in_style_tag) {
|
||||
string::size_type firstchar = text.find_first_not_of(" \t\n\r");
|
||||
if (firstchar != string::npos) {
|
||||
dump += text.substr(firstchar);
|
||||
dump += " ";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// lets hope that the charset includes ascii values...
|
||||
/**
 * Lowercase the ASCII capital letters 'A'..'Z' of 'term' in place.
 * All other bytes are left untouched.
 */
static inline void
lowercase_term(std::string &term)
{
    for (std::string::size_type pos = 0; pos < term.size(); ++pos) {
        const char c = term[pos];
        if ('A' <= c && c <= 'Z')
            term[pos] = c + ('a' - 'A');
    }
}
|
||||
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
|
||||
void
|
||||
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
{
|
||||
#if 0
|
||||
cout << "TAG: " << tag << ": " << endl;
|
||||
map<string, string>::const_iterator x;
|
||||
for (x = p.begin(); x != p.end(); x++) {
|
||||
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tag == "meta") {
|
||||
map<string, string>::const_iterator i, j;
|
||||
if ((i = p.find("content")) != p.end()) {
|
||||
if ((j = p.find("name")) != p.end()) {
|
||||
string name = j->second;
|
||||
lowercase_term(name);
|
||||
if (name == "description") {
|
||||
if (sample.empty()) {
|
||||
sample = i->second;
|
||||
decode_entities(sample);
|
||||
}
|
||||
} else if (name == "keywords") {
|
||||
if (!keywords.empty()) keywords += ' ';
|
||||
string tmp = i->second;
|
||||
decode_entities(tmp);
|
||||
keywords += tmp;
|
||||
} else if (name == "robots") {
|
||||
string val = i->second;
|
||||
decode_entities(val);
|
||||
lowercase_term(val);
|
||||
if (val.find("none") != string::npos ||
|
||||
val.find("noindex") != string::npos) {
|
||||
indexing_allowed = false;
|
||||
throw true;
|
||||
}
|
||||
}
|
||||
} else if ((j = p.find("http-equiv")) != p.end()) {
|
||||
string hequiv = j->second;
|
||||
lowercase_term(hequiv);
|
||||
if (hequiv == "content-type") {
|
||||
string value = i->second;
|
||||
MimeHeaderValue p = parseMimeHeaderValue(value);
|
||||
map<string, string>::const_iterator k;
|
||||
if ((k = p.params.find("charset")) != p.params.end()) {
|
||||
doccharset = k->second;
|
||||
if (doccharset != charset)
|
||||
throw true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (tag == "script") {
|
||||
in_script_tag = true;
|
||||
} else if (tag == "style") {
|
||||
in_style_tag = true;
|
||||
} else if (tag == "body") {
|
||||
dump = "";
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
MyHtmlParser::closing_tag(const string &tag)
|
||||
{
|
||||
if (tag == "title") {
|
||||
title = dump;
|
||||
dump = "";
|
||||
} else if (tag == "script") {
|
||||
in_script_tag = false;
|
||||
} else if (tag == "style") {
|
||||
in_style_tag = false;
|
||||
} else if (tag == "body") {
|
||||
throw true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Mime handler for text/html: read the file, convert it to UTF-8, and
 * extract text, title, keywords and abstract into an Rcl::Doc.
 *
 * @param conf   configuration, supplies defcharset and guesscharset.
 * @param fn     path of the file to read.
 * @param mtype  mime type, not used here.
 * @param docout receives the result. It is only assigned on success.
 * @return false if the file could not be read, else true.
 *
 * NOTE(review): MyHtmlParser::indexing_allowed (robots meta tag) is not
 * looked at here, the extracted data is returned whatever its value.
 */
bool textHtmlToDoc(RclConfig *conf, const string &fn,
                   const string &mtype, Rcl::Doc &docout)
{
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
    string otext;
    if (!file_to_string(fn, otext)) {
        LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
        return false;
    }

    // Character set handling:

    // - We first try to convert from the default configured charset
    //   (which may depend on the current directory) to utf-8. If this
    //   fails, we keep the original text
    // - During parsing, if we find a charset parameter, and it differs from
    //   what we started with, we abort and restart with the parameter value
    //   instead of the configuration one.
    string charset;
    if (conf->guesscharset) {
        charset = csguess(otext, conf->defcharset);
    } else
        charset = conf->defcharset;

    LOGDEB(("textHtmlToDoc: charset before parsing: %s\n",
            charset.c_str()));

    MyHtmlParser pres;
    for (int pass = 0; pass < 2; pass++) {
        string transcoded;
        LOGDEB(("textHtmlToDoc: transcode from %s to %s\n",
                charset.c_str(), "UTF-8"));

        MyHtmlParser p;
        // Try transcoding. If it fails, use original text.
        if (!transcode(otext, transcoded, charset, "UTF-8")) {
            LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
                    charset.c_str()));
            transcoded = otext;
            // We don't know the charset, at all
            p.charset = charset = "";
        } else {
            // transcoded is now in utf-8. The parser compares the charset
            // declared by the document with p.charset, so this has to be
            // the putative *source* charset: a document which declares
            // the charset we just used must not interrupt the parse.
            p.charset = charset;
        }

        try {
            p.parse_html(transcoded);
        } catch (bool) {
            // The handlers throw to stop parsing early (end of body,
            // robots exclusion, charset mismatch). Not an error.
        }

        // Keep the result whether or not the parse was interrupted: a
        // document with no closing body tag is parsed up to its end
        // without any exception being thrown.
        pres = p;

        if (pres.doccharset.empty() || pres.doccharset == charset)
            break;

        LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
                "reparse\n", charset.c_str(),
                pres.doccharset.c_str()));
        charset = pres.doccharset;
    }

    Rcl::Doc out;
    out.origcharset = charset;
    out.text = pres.dump;
    out.title = pres.title;
    out.keywords = pres.keywords;
    out.abstract = pres.sample;
    docout = out;
    return true;
}
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.1 2005-01-25 14:37:57 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.2 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <iostream>
|
||||
@ -52,6 +52,7 @@ class IHandler_Init {
|
||||
public:
|
||||
IHandler_Init() {
|
||||
ihandlers["text/plain"] = textPlainToDoc;
|
||||
ihandlers["text/html"] = textHtmlToDoc;
|
||||
// Add new associations here when needed
|
||||
}
|
||||
};
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _MIMEHANDLER_H_INCLUDED_
|
||||
#define _MIMEHANDLER_H_INCLUDED_
|
||||
/* @(#$Id: mimehandler.h,v 1.1 2005-01-25 14:37:57 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: mimehandler.h,v 1.2 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
|
||||
@ -14,4 +14,7 @@ typedef bool (*MimeHandlerFunc)(RclConfig *, const std::string &,
|
||||
extern MimeHandlerFunc getMimeHandler(const std::string &mtype,
|
||||
ConfTree *mhandlers);
|
||||
|
||||
extern bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout);
|
||||
|
||||
#endif /* _MIMEHANDLER_H_INCLUDED_ */
|
||||
|
||||
@ -8,14 +8,15 @@ LIBS = librcl.a
|
||||
all: $(LIBS)
|
||||
|
||||
OBJS = conftree.o csguess.o debuglog.o \
|
||||
fstreewalk.o \
|
||||
mimehandler.o mimetype.o pathut.o \
|
||||
fstreewalk.o html.o htmlparse.o \
|
||||
mimehandler.o mimeparse.o mimetype.o pathut.o \
|
||||
rclconfig.o rcldb.o readfile.o \
|
||||
textsplit.o transcode.o \
|
||||
unacpp.o unac.o
|
||||
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
|
||||
../utils/fstreewalk.cpp \
|
||||
../common/mimehandler.cpp ../index/mimetype.cpp ../utils/pathut.cpp \
|
||||
../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
|
||||
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
|
||||
../utils/pathut.cpp \
|
||||
../common/rclconfig.cpp ../common/rcldb.cpp ../utils/readfile.cpp \
|
||||
../common/textsplit.cpp ../utils/transcode.cpp \
|
||||
../common/unacpp.cpp ../unac/unac.c
|
||||
@ -35,8 +36,14 @@ debuglog.o : ../utils/debuglog.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
fstreewalk.o : ../utils/fstreewalk.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
html.o : ../common/html.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
htmlparse.o : ../common/htmlparse.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
mimehandler.o : ../common/mimehandler.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
mimeparse.o : ../utils/mimeparse.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
mimetype.o : ../index/mimetype.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
pathut.o : ../utils/pathut.cpp
|
||||
|
||||
@ -8,106 +8,177 @@
|
||||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>774</width>
|
||||
<height>619</height>
|
||||
<width>782</width>
|
||||
<height>622</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy>
|
||||
<hsizetype>7</hsizetype>
|
||||
<vsizetype>7</vsizetype>
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="caption">
|
||||
<string>recoll</string>
|
||||
</property>
|
||||
<widget class="QLayoutWidget">
|
||||
<hbox>
|
||||
<property name="name">
|
||||
<cstring>layout7</cstring>
|
||||
<cstring>unnamed</cstring>
|
||||
</property>
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>11</x>
|
||||
<y>11</y>
|
||||
<width>752</width>
|
||||
<height>41</height>
|
||||
</rect>
|
||||
</property>
|
||||
<hbox>
|
||||
<widget class="QLayoutWidget">
|
||||
<property name="name">
|
||||
<cstring>unnamed</cstring>
|
||||
<cstring>layout3</cstring>
|
||||
</property>
|
||||
<widget class="QLineEdit">
|
||||
<vbox>
|
||||
<property name="name">
|
||||
<cstring>queryText</cstring>
|
||||
<cstring>unnamed</cstring>
|
||||
</property>
|
||||
<property name="frameShape">
|
||||
<enum>LineEditPanel</enum>
|
||||
</property>
|
||||
<property name="frameShadow">
|
||||
<enum>Sunken</enum>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QPushButton">
|
||||
<property name="name">
|
||||
<cstring>Search</cstring>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>pushButton1</string>
|
||||
</property>
|
||||
</widget>
|
||||
<spacer>
|
||||
<property name="name">
|
||||
<cstring>spacer1</cstring>
|
||||
</property>
|
||||
<property name="orientation">
|
||||
<enum>Horizontal</enum>
|
||||
</property>
|
||||
<property name="sizeType">
|
||||
<enum>Expanding</enum>
|
||||
</property>
|
||||
<property name="sizeHint">
|
||||
<size>
|
||||
<width>40</width>
|
||||
<height>20</height>
|
||||
</size>
|
||||
</property>
|
||||
</spacer>
|
||||
</hbox>
|
||||
</widget>
|
||||
<widget class="QSplitter">
|
||||
<property name="name">
|
||||
<cstring>splitter9</cstring>
|
||||
</property>
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>11</x>
|
||||
<y>58</y>
|
||||
<width>752</width>
|
||||
<height>491</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="orientation">
|
||||
<enum>Horizontal</enum>
|
||||
</property>
|
||||
<widget class="QTextEdit">
|
||||
<property name="name">
|
||||
<cstring>resTextEdit</cstring>
|
||||
</property>
|
||||
<widget class="QLayoutWidget">
|
||||
<property name="name">
|
||||
<cstring>layout2</cstring>
|
||||
</property>
|
||||
<hbox>
|
||||
<property name="name">
|
||||
<cstring>unnamed</cstring>
|
||||
</property>
|
||||
<widget class="QLineEdit">
|
||||
<property name="name">
|
||||
<cstring>queryText</cstring>
|
||||
</property>
|
||||
<property name="frameShape">
|
||||
<enum>LineEditPanel</enum>
|
||||
</property>
|
||||
<property name="frameShadow">
|
||||
<enum>Sunken</enum>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QPushButton">
|
||||
<property name="name">
|
||||
<cstring>Search</cstring>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Search</string>
|
||||
</property>
|
||||
</widget>
|
||||
<spacer>
|
||||
<property name="name">
|
||||
<cstring>spacer1</cstring>
|
||||
</property>
|
||||
<property name="orientation">
|
||||
<enum>Horizontal</enum>
|
||||
</property>
|
||||
<property name="sizeType">
|
||||
<enum>Expanding</enum>
|
||||
</property>
|
||||
<property name="sizeHint">
|
||||
<size>
|
||||
<width>329</width>
|
||||
<height>20</height>
|
||||
</size>
|
||||
</property>
|
||||
</spacer>
|
||||
</hbox>
|
||||
</widget>
|
||||
<widget class="QSplitter">
|
||||
<property name="name">
|
||||
<cstring>splitter9</cstring>
|
||||
</property>
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy>
|
||||
<hsizetype>7</hsizetype>
|
||||
<vsizetype>7</vsizetype>
|
||||
<horstretch>1</horstretch>
|
||||
<verstretch>1</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="minimumSize">
|
||||
<size>
|
||||
<width>0</width>
|
||||
<height>0</height>
|
||||
</size>
|
||||
</property>
|
||||
<property name="orientation">
|
||||
<enum>Horizontal</enum>
|
||||
</property>
|
||||
<widget class="QTextEdit">
|
||||
<property name="name">
|
||||
<cstring>resTextEdit</cstring>
|
||||
</property>
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy>
|
||||
<hsizetype>7</hsizetype>
|
||||
<vsizetype>7</vsizetype>
|
||||
<horstretch>2</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="textFormat">
|
||||
<enum>RichText</enum>
|
||||
</property>
|
||||
<property name="readOnly">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QSplitter">
|
||||
<property name="name">
|
||||
<cstring>splitter8</cstring>
|
||||
</property>
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy>
|
||||
<hsizetype>7</hsizetype>
|
||||
<vsizetype>7</vsizetype>
|
||||
<horstretch>3</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="orientation">
|
||||
<enum>Vertical</enum>
|
||||
</property>
|
||||
<widget class="QTextEdit">
|
||||
<property name="name">
|
||||
<cstring>previewTextEdit</cstring>
|
||||
</property>
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy>
|
||||
<hsizetype>7</hsizetype>
|
||||
<vsizetype>7</vsizetype>
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>2</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="textFormat">
|
||||
<enum>RichText</enum>
|
||||
</property>
|
||||
<property name="readOnly">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QTextEdit">
|
||||
<property name="name">
|
||||
<cstring>metaTextEdit</cstring>
|
||||
</property>
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy>
|
||||
<hsizetype>7</hsizetype>
|
||||
<vsizetype>7</vsizetype>
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>1</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="textFormat">
|
||||
<enum>RichText</enum>
|
||||
</property>
|
||||
<property name="readOnly">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
</widget>
|
||||
</vbox>
|
||||
</widget>
|
||||
<widget class="QSplitter">
|
||||
<property name="name">
|
||||
<cstring>splitter8</cstring>
|
||||
</property>
|
||||
<property name="orientation">
|
||||
<enum>Vertical</enum>
|
||||
</property>
|
||||
<widget class="QTextEdit">
|
||||
<property name="name">
|
||||
<cstring>textEdit12</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QTextEdit">
|
||||
<property name="name">
|
||||
<cstring>textEdit13</cstring>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
</widget>
|
||||
</hbox>
|
||||
</widget>
|
||||
<menubar>
|
||||
<property name="name">
|
||||
@ -216,18 +287,24 @@
|
||||
<receiver>RecollMain</receiver>
|
||||
<slot>resTextEdit_clicked(int,int)</slot>
|
||||
</connection>
|
||||
<connection>
|
||||
<sender>resTextEdit</sender>
|
||||
<signal>returnPressed()</signal>
|
||||
<receiver>RecollMain</receiver>
|
||||
<slot>resTextEdit_returnPressed()</slot>
|
||||
</connection>
|
||||
<connection>
|
||||
<sender>fileExitAction</sender>
|
||||
<signal>activated()</signal>
|
||||
<receiver>RecollMain</receiver>
|
||||
<slot>fileExit()</slot>
|
||||
</connection>
|
||||
<connection>
|
||||
<sender>queryText</sender>
|
||||
<signal>returnPressed()</signal>
|
||||
<receiver>RecollMain</receiver>
|
||||
<slot>queryText_returnPressed()</slot>
|
||||
</connection>
|
||||
<connection>
|
||||
<sender>Search</sender>
|
||||
<signal>clicked()</signal>
|
||||
<receiver>RecollMain</receiver>
|
||||
<slot>Search_clicked()</slot>
|
||||
</connection>
|
||||
</connections>
|
||||
<includes>
|
||||
<include location="local" impldecl="in implementation">recollmain.ui.h</include>
|
||||
@ -238,7 +315,8 @@
|
||||
<slot>helpContents()</slot>
|
||||
<slot>helpAbout()</slot>
|
||||
<slot>resTextEdit_clicked( int par, int car )</slot>
|
||||
<slot>resTextEdit_returnPressed()</slot>
|
||||
<slot>queryText_returnPressed()</slot>
|
||||
<slot>Search_clicked()</slot>
|
||||
</slots>
|
||||
<pixmapinproject/>
|
||||
<layoutdefaults spacing="6" margin="11"/>
|
||||
|
||||
@ -32,30 +32,126 @@ void RecollMain::helpAbout()
|
||||
{
|
||||
|
||||
}
|
||||
#include <qmessagebox.h>
|
||||
|
||||
#include "rcldb.h"
|
||||
#include "rclconfig.h"
|
||||
#include "debuglog.h"
|
||||
#include "mimehandler.h"
|
||||
|
||||
extern RclConfig *rclconfig;
|
||||
extern Rcl::Db *rcldb;
|
||||
|
||||
/**
 * Turn plain text into a Qt rich text document suitable for display in a
 * QTextEdit using the RichText format.
 *
 * - newlines become <br>,
 * - '<', '>' and '&' are replaced by entities, else the rich text parser
 *   would take them for markup and text would go missing from the preview,
 * - the <p>, <body> and <qt> elements are closed at the end.
 *
 * The former hard-coded insertion of <mytag> / </mytag> after bytes 10 and
 * 20 (a placeholder for the future search term highlighting) is gone: it
 * could land in the middle of a multi-byte UTF-8 sequence, and the tag was
 * left open for inputs of 11 to 20 bytes.
 *
 * @param in plain text.
 * @return the rich text document.
 */
static std::string plaintorich(const std::string &in)
{
    std::string out = "<qt><head><title></title></head><body><p>";
    for (std::string::size_type i = 0; i < in.length(); ++i) {
        switch (in[i]) {
        case '\n':
            out += "<br>";
            break;
        case '<':
            out += "&lt;";
            break;
        case '>':
            out += "&gt;";
            break;
        case '&':
            out += "&amp;";
            break;
        default:
            out += in[i];
            break;
        }
    }
    out += "</p></body></qt>";
    return out;
}
|
||||
|
||||
void RecollMain::resTextEdit_clicked( int par, int car )
|
||||
{
|
||||
fprintf(stderr, "Clicked at paragraph %d, char %d\n", par, car);
|
||||
}
|
||||
Rcl::Doc doc;
|
||||
doc.erase();
|
||||
if (rcldb->getDoc(par, doc)) {
|
||||
|
||||
// Go to the file system to retrieve / convert the document text
|
||||
// for preview:
|
||||
|
||||
#include "qfontdialog.h"
|
||||
// Look for appropriate handler
|
||||
MimeHandlerFunc fun =
|
||||
getMimeHandler(doc.mimetype, rclconfig->getMimeConf());
|
||||
if (!fun) {
|
||||
QMessageBox::warning(0, "Recoll",
|
||||
QString("No mime handler for mime type ") +
|
||||
doc.mimetype.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
#define BS 200000
|
||||
void RecollMain::resTextEdit_returnPressed()
|
||||
{
|
||||
fprintf(stderr, "ReturnPressed()\n");
|
||||
resTextEdit->setFont( QFontDialog::getFont( 0, resTextEdit->font() ) );
|
||||
const char *fname = "utf8.txt";
|
||||
FILE *fp = fopen(fname, "r");
|
||||
if (fp) {
|
||||
char buf[BS];
|
||||
memset(buf,0, sizeof(buf));
|
||||
int n = fread(buf, 1, BS-1, fp);
|
||||
fclose(fp);
|
||||
QString str = QString::fromUtf8(buf, n);
|
||||
resTextEdit->setTextFormat(RichText);
|
||||
resTextEdit->setText(str);
|
||||
string fn = doc.url.substr(6, string::npos);
|
||||
Rcl::Doc fdoc;
|
||||
if (!fun(rclconfig, fn, doc.mimetype, fdoc)) {
|
||||
QMessageBox::warning(0, "Recoll",
|
||||
QString("Failed to convert document for preview!\n") +
|
||||
fn.c_str() + " mimetype " +
|
||||
doc.mimetype.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
string rich = plaintorich(fdoc.text);
|
||||
|
||||
#if 0
|
||||
//Highlighting; pass a list of (search term, style name) to plaintorich
|
||||
// and create the corresponding styles with different colors here
|
||||
// We need to :
|
||||
// - Break the query into terms : wait for the query analyzer
|
||||
// - Break the text into words. This should use a version of
|
||||
// textsplit with an option to keep the punctuation (see how to do
|
||||
// this). We do want the same splitter code to be used here and
|
||||
// when indexing.
|
||||
QStyleSheetItem *item =
|
||||
new QStyleSheetItem( previewTextEdit->styleSheet(), "mytag" );
|
||||
item->setColor("red");
|
||||
item->setFontWeight(QFont::Bold);
|
||||
#endif
|
||||
QString str = QString::fromUtf8(rich.c_str(), rich.length());
|
||||
|
||||
previewTextEdit->setTextFormat(RichText);
|
||||
previewTextEdit->setText(str);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void RecollMain::queryText_returnPressed()
|
||||
{
|
||||
LOGDEB(("RecollMain::queryText_returnPressed()\n"));
|
||||
resTextEdit->clear();
|
||||
previewTextEdit->clear();
|
||||
|
||||
string rawq = queryText->text();
|
||||
rcldb->setQuery(rawq);
|
||||
Rcl::Doc doc;
|
||||
|
||||
// Insert results if any in result list window
|
||||
QString result;
|
||||
resTextEdit->append("<qt><head></head><body>");
|
||||
for (int i = 0;; i++) {
|
||||
doc.erase();
|
||||
if (!rcldb->getDoc(i, doc))
|
||||
break;
|
||||
LOGDEB(("Url: %s\n", doc.url.c_str()));
|
||||
LOGDEB(("Mimetype: \n", doc.mimetype.c_str()));
|
||||
LOGDEB(("Mtime: \n", doc.mtime.c_str()));
|
||||
LOGDEB(("Origcharset: \n", doc.origcharset.c_str()));
|
||||
LOGDEB(("Title: \n", doc.title.c_str()));
|
||||
LOGDEB(("Text: \n", doc.text.c_str()));
|
||||
LOGDEB(("Keywords: \n", doc.keywords.c_str()));
|
||||
LOGDEB(("Abstract: \n", doc.abstract.c_str()));
|
||||
|
||||
result = "<p>" + doc.url + "</p>";
|
||||
resTextEdit->append(result);
|
||||
}
|
||||
resTextEdit->append("</body></qt>");
|
||||
|
||||
// Display preview for 1st doc in list
|
||||
resTextEdit_clicked(0, 0);
|
||||
}
|
||||
|
||||
|
||||
void RecollMain::Search_clicked()
|
||||
{
|
||||
queryText_returnPressed();
|
||||
}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: qtry.cpp,v 1.2 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: qtry.cpp,v 1.3 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
// Tests with the query interface
|
||||
@ -62,20 +62,20 @@ int main(int argc, char **argv)
|
||||
if (argc < 1)
|
||||
Usage();
|
||||
|
||||
RclConfig *config = new RclConfig;
|
||||
RclConfig *rclconfig = new RclConfig;
|
||||
|
||||
if (!config->ok())
|
||||
if (!rclconfig->ok())
|
||||
cerr << "Config could not be built" << endl;
|
||||
|
||||
string dbdir;
|
||||
if (config->getConfParam(string("dbdir"), dbdir) == 0) {
|
||||
if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) {
|
||||
cerr << "No database directory in configuration" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
Rcl::Db *db = new Rcl::Db;
|
||||
Rcl::Db *rcldb = new Rcl::Db;
|
||||
|
||||
if (!db->open(dbdir, Rcl::Db::DbRO)) {
|
||||
if (!rcldb->open(dbdir, Rcl::Db::DbRO)) {
|
||||
fprintf(stderr, "Could not open database\n");
|
||||
exit(1);
|
||||
}
|
||||
@ -84,12 +84,12 @@ int main(int argc, char **argv)
|
||||
string query;
|
||||
while (argc--)
|
||||
query += string(*argv++) + " " ;
|
||||
db->setQuery(query);
|
||||
rcldb->setQuery(query);
|
||||
int i = 0;
|
||||
Rcl::Doc doc;
|
||||
for (i=0;;i++) {
|
||||
doc.erase();
|
||||
if (!db->getDoc(i, doc))
|
||||
if (!rcldb->getDoc(i, doc))
|
||||
break;
|
||||
|
||||
cout << "Url: " << doc.url << endl;
|
||||
@ -107,7 +107,7 @@ int main(int argc, char **argv)
|
||||
|
||||
// Look for appropriate handler
|
||||
MimeHandlerFunc fun = getMimeHandler(doc.mimetype,
|
||||
config->getMimeConf());
|
||||
rclconfig->getMimeConf());
|
||||
if (!fun) {
|
||||
cout << "No mime handler !" << endl;
|
||||
continue;
|
||||
@ -116,7 +116,7 @@ int main(int argc, char **argv)
|
||||
cout << "Filename: " << fn << endl;
|
||||
|
||||
Rcl::Doc fdoc;
|
||||
if (!fun(config, fn, doc.mimetype, fdoc)) {
|
||||
if (!fun(rclconfig, fn, doc.mimetype, fdoc)) {
|
||||
cout << "Failed to convert/preview document!" << endl;
|
||||
continue;
|
||||
}
|
||||
@ -125,7 +125,7 @@ int main(int argc, char **argv)
|
||||
transcode(fdoc.text, printable, "UTF-8", outencoding);
|
||||
cout << printable << endl;
|
||||
}
|
||||
delete db;
|
||||
delete rcldb;
|
||||
cerr << "Exiting" << endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.7 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.8 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <sys/stat.h>
|
||||
@ -201,6 +201,7 @@ bool dumb_string(const string &in, string &out)
|
||||
|
||||
bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
|
||||
{
|
||||
LOGDEB(("Rcl::Db::add: fn %s\n", fn.c_str()));
|
||||
if (pdata == 0)
|
||||
return false;
|
||||
Native *ndb = (Native *)pdata;
|
||||
@ -226,24 +227,29 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
|
||||
|
||||
string noacc;
|
||||
if (!unac_cpp(doc.title, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: unac failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
|
||||
LOGDEB(("Rcl::Db::add: doc split\n"));
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
if (!dumb_string(doc.text, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: dum_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
if (!dumb_string(doc.keywords, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: dum_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
if (!dumb_string(doc.abstract, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: dum_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
@ -263,20 +269,20 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
|
||||
#if 0
|
||||
if (did < updated.size()) {
|
||||
updated[did] = true;
|
||||
LOGDEB1(("%s updated\n", fnc));
|
||||
LOGDEB(("%s updated\n", fnc));
|
||||
} else {
|
||||
LOGDEB1(("%s added\n", fnc));
|
||||
LOGDEB(("%s added\n", fnc));
|
||||
}
|
||||
#endif
|
||||
} catch (...) {
|
||||
// FIXME: is this ever actually needed?
|
||||
ndb->wdb.add_document(newdocument);
|
||||
LOGDEB1(("%s added (failed re-seek for duplicate).\n", fnc));
|
||||
LOGDEB(("%s added (failed re-seek for duplicate).\n", fnc));
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
ndb->wdb.add_document(newdocument);
|
||||
LOGDEB1(("%s added\n", fnc));
|
||||
LOGDEB(("%s added\n", fnc));
|
||||
} catch (...) {
|
||||
LOGERR(("%s : Got exception while adding doc\n", fnc));
|
||||
return false;
|
||||
|
||||
@ -3,7 +3,7 @@ CXXFLAGS = -I.
|
||||
|
||||
BIGLIB = ../lib/librcl.a
|
||||
|
||||
PROGS = trfstreewalk trpathut execmd transcode
|
||||
PROGS = trfstreewalk trpathut execmd transcode trmimeparse
|
||||
all: $(PROGS)
|
||||
|
||||
FSTREEWALK_OBJS= trfstreewalk.o fstreewalk.o pathut.o
|
||||
@ -30,5 +30,13 @@ transcode : $(TRANSCODE_OBJS)
|
||||
trtranscode.o : ../utils/transcode.cpp
|
||||
$(CXX) $(CXXFLAGS) -DTEST_TRANSCODE -c -o trtranscode.o \
|
||||
transcode.cpp
|
||||
|
||||
MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB)
|
||||
mimeparse : $(MIMEPARSE_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o mimeparse $(MIMEPARSE_OBJS) \
|
||||
-L/usr/local/lib -liconv
|
||||
trmimeparse.o : ../utils/mimeparse.cpp
|
||||
$(CXX) $(CXXFLAGS) -DTEST_MIMEPARSE -c -o trmimeparse.o \
|
||||
mimeparse.cpp
|
||||
clean:
|
||||
rm -f *.o $(PROGS)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user