This commit is contained in:
dockes 2005-01-26 11:47:27 +00:00
parent c14b34647a
commit 0b18276947
9 changed files with 562 additions and 136 deletions

227
src/internfile/mh_html.cpp Normal file
View File

@ -0,0 +1,227 @@
/* htmlparse.cc: simple HTML parser for omega indexer
*
* ----START-LICENCE----
* Copyright 1999,2000,2001 BrightStation PLC
* Copyright 2001 Ananova Ltd
* Copyright 2002 Olly Betts
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
* -----END-LICENCE-----
*/
// This file has code from omindex + an adaptor function for recoll at the end
#include "htmlparse.h"
#include "mimehandler.h"
#include "debuglog.h"
#include "csguess.h"
#include "readfile.h"
#include "transcode.h"
#include "mimeparse.h"
// HtmlParser specialization used by textHtmlToDoc(): the three callbacks
// declared below accumulate the document text, title, keywords and
// description into public members which the caller reads back after
// parse_html().
// NOTE(review): the callbacks are presumably virtual in HtmlParser and
// invoked from parse_html() -- confirm against htmlparse.h.
class MyHtmlParser : public HtmlParser {
public:
// Set while inside a <script> element: process_text() then ignores text
bool in_script_tag;
// Set while inside a <style> element: process_text() then ignores text
bool in_style_tag;
// title: copy of 'dump' taken when </title> is seen
// sample: content of the first <meta name="description">
// keywords: space-separated contents of all <meta name="keywords">
// dump: accumulated text; emptied at </title> and at <body>
string title, sample, keywords, dump;
// This is the charset our user thinks the doc is in. Set by the caller
// before parsing; opening_tag() compares it with doccharset and aborts the
// parse (throw true) when they differ.
string charset;
// Value of the charset parameter found in a
// <meta http-equiv="content-type"> tag, empty if none was seen
string doccharset;
// Becomes false when a <meta name="robots"> tag contains "none" or
// "noindex"
bool indexing_allowed;
// Callback for a chunk of text
void process_text(const string &text);
// Callback for a start tag; p holds the tag attributes (name -> value)
void opening_tag(const string &tag, const map<string,string> &p);
// Callback for an end tag
void closing_tag(const string &tag);
MyHtmlParser() :
in_script_tag(false),
in_style_tag(false),
indexing_allowed(true) { }
};
void
MyHtmlParser::process_text(const string &text)
{
// some tags are meaningful mid-word so this is simplistic at best...
if (!in_script_tag && !in_style_tag) {
string::size_type firstchar = text.find_first_not_of(" \t\n\r");
if (firstchar != string::npos) {
dump += text.substr(firstchar);
dump += " ";
}
}
}
// Lower-case the ASCII letters 'A'..'Z' of term, in place. All other bytes
// are left untouched, so this is only correct if the charset of the text is
// a superset of ASCII (let's hope so...)
static inline void
lowercase_term(string &term)
{
    for (string::size_type pos = 0; pos < term.size(); ++pos) {
        const char c = term[pos];
        if ('A' <= c && c <= 'Z')
            term[pos] = c + 'a' - 'A';
    }
}
#include <iostream>
using namespace std;
void
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
{
#if 0
cout << "TAG: " << tag << ": " << endl;
map<string, string>::const_iterator x;
for (x = p.begin(); x != p.end(); x++) {
cout << " " << x->first << " -> '" << x->second << "'" << endl;
}
#endif
if (tag == "meta") {
map<string, string>::const_iterator i, j;
if ((i = p.find("content")) != p.end()) {
if ((j = p.find("name")) != p.end()) {
string name = j->second;
lowercase_term(name);
if (name == "description") {
if (sample.empty()) {
sample = i->second;
decode_entities(sample);
}
} else if (name == "keywords") {
if (!keywords.empty()) keywords += ' ';
string tmp = i->second;
decode_entities(tmp);
keywords += tmp;
} else if (name == "robots") {
string val = i->second;
decode_entities(val);
lowercase_term(val);
if (val.find("none") != string::npos ||
val.find("noindex") != string::npos) {
indexing_allowed = false;
throw true;
}
}
} else if ((j = p.find("http-equiv")) != p.end()) {
string hequiv = j->second;
lowercase_term(hequiv);
if (hequiv == "content-type") {
string value = i->second;
MimeHeaderValue p = parseMimeHeaderValue(value);
map<string, string>::const_iterator k;
if ((k = p.params.find("charset")) != p.params.end()) {
doccharset = k->second;
if (doccharset != charset)
throw true;
}
}
}
}
} else if (tag == "script") {
in_script_tag = true;
} else if (tag == "style") {
in_style_tag = true;
} else if (tag == "body") {
dump = "";
}
}
// End-tag callback.
//
// - </title>: the text accumulated so far becomes the title, and the
//   accumulator is emptied.
// - </script> / </style>: text is accepted again by process_text().
// - </body>: the parse is aborted by throwing a bool (true).
void
MyHtmlParser::closing_tag(const string &tag)
{
    if (tag == "body")
        throw true;

    if (tag == "script") {
        in_script_tag = false;
        return;
    }
    if (tag == "style") {
        in_style_tag = false;
        return;
    }
    if (tag == "title") {
        title = dump;
        dump = "";
    }
}
// Mime handler for text/html: read an HTML file and extract its text, title,
// keywords and description into an Rcl::Doc.
//
// @param conf   configuration: defcharset and guesscharset are used to choose
//               the charset assumed for the first parsing pass
// @param fn     path of the file to read
// @param mtype  mime type (not used here)
// @param docout receives the result (origcharset, text, title, keywords,
//               abstract) on success
// @return false if the file could not be read, true otherwise
bool textHtmlToDoc(RclConfig *conf, const string &fn,
                   const string &mtype, Rcl::Doc &docout)
{
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
    string otext;
    if (!file_to_string(fn, otext)) {
        LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
        return false;
    }

    // Character set handling:
    // - We first try to convert from the default configured charset
    //   (which may depend on the current directory) to utf-8. If this
    //   fails, we keep the original text
    // - During parsing, if we find a charset parameter, and it differs from
    //   what we started with, we abort and restart with the parameter value
    //   instead of the configuration one.
    string charset;
    if (conf->guesscharset) {
        charset = csguess(otext, conf->defcharset);
    } else
        charset = conf->defcharset;

    LOGDEB(("textHtmlToDoc: charset before parsing: %s\n",
            charset.c_str()));

    MyHtmlParser pres;
    for (int pass = 0; pass < 2; pass++) {
        string transcoded;
        LOGDEB(("textHtmlToDoc: transcode from %s to %s\n",
                charset.c_str(), "UTF-8"));

        MyHtmlParser p;
        // The parser aborts when the document declares a charset different
        // from p.charset, so p.charset must be the *source* charset assumed
        // for this pass. Using the transcoding target ("utf-8") would abort
        // the parse at the meta tag for almost every document declaring a
        // charset, and lose the body text.
        const string assumed = charset;
        p.charset = assumed;

        // Try transcoding. If it fails, use original text.
        if (!transcode(otext, transcoded, charset, "UTF-8")) {
            LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
                    charset.c_str()));
            transcoded = otext;
            // We don't know the charset, at all
            charset = "";
        }

        try {
            p.parse_html(transcoded);
        } catch (bool) {
            // The parser callbacks throw to stop early (end of body, charset
            // mismatch, robots exclusion): not an error.
        }
        // Keep the result whether the parse stopped early or ran to the end
        // of the input (e.g. document without a closing body tag).
        pres = p;

        // Reparsing is only useful if the document declared a charset
        // different from the one which was assumed for this pass.
        if (pres.doccharset.empty() || pres.doccharset == assumed)
            break;
        LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
                "reparse\n", assumed.c_str(),
                pres.doccharset.c_str()));
        charset = pres.doccharset;
    }

    Rcl::Doc out;
    out.origcharset = charset;
    out.text = pres.dump;
    out.title = pres.title;
    out.keywords = pres.keywords;
    out.abstract = pres.sample;
    docout = out;
    return true;
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.1 2005-01-25 14:37:57 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.2 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <iostream>
@ -52,6 +52,7 @@ class IHandler_Init {
public:
IHandler_Init() {
ihandlers["text/plain"] = textPlainToDoc;
ihandlers["text/html"] = textHtmlToDoc;
// Add new associations here when needed
}
};

View File

@ -1,6 +1,6 @@
#ifndef _MIMEHANDLER_H_INCLUDED_
#define _MIMEHANDLER_H_INCLUDED_
/* @(#$Id: mimehandler.h,v 1.1 2005-01-25 14:37:57 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: mimehandler.h,v 1.2 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
@ -14,4 +14,7 @@ typedef bool (*MimeHandlerFunc)(RclConfig *, const std::string &,
extern MimeHandlerFunc getMimeHandler(const std::string &mtype,
ConfTree *mhandlers);
extern bool textHtmlToDoc(RclConfig *conf, const string &fn,
const string &mtype, Rcl::Doc &docout);
#endif /* _MIMEHANDLER_H_INCLUDED_ */

View File

@ -8,14 +8,15 @@ LIBS = librcl.a
all: $(LIBS)
OBJS = conftree.o csguess.o debuglog.o \
fstreewalk.o \
mimehandler.o mimetype.o pathut.o \
fstreewalk.o html.o htmlparse.o \
mimehandler.o mimeparse.o mimetype.o pathut.o \
rclconfig.o rcldb.o readfile.o \
textsplit.o transcode.o \
unacpp.o unac.o
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
../utils/fstreewalk.cpp \
../common/mimehandler.cpp ../index/mimetype.cpp ../utils/pathut.cpp \
../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
../utils/pathut.cpp \
../common/rclconfig.cpp ../common/rcldb.cpp ../utils/readfile.cpp \
../common/textsplit.cpp ../utils/transcode.cpp \
../common/unacpp.cpp ../unac/unac.c
@ -35,8 +36,14 @@ debuglog.o : ../utils/debuglog.cpp
$(CXX) $(CXXFLAGS) -c $<
fstreewalk.o : ../utils/fstreewalk.cpp
$(CXX) $(CXXFLAGS) -c $<
html.o : ../common/html.cpp
$(CXX) $(CXXFLAGS) -c $<
htmlparse.o : ../common/htmlparse.cpp
$(CXX) $(CXXFLAGS) -c $<
mimehandler.o : ../common/mimehandler.cpp
$(CXX) $(CXXFLAGS) -c $<
mimeparse.o : ../utils/mimeparse.cpp
$(CXX) $(CXXFLAGS) -c $<
mimetype.o : ../index/mimetype.cpp
$(CXX) $(CXXFLAGS) -c $<
pathut.o : ../utils/pathut.cpp

View File

@ -8,106 +8,177 @@
<rect>
<x>0</x>
<y>0</y>
<width>774</width>
<height>619</height>
<width>782</width>
<height>622</height>
</rect>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>7</hsizetype>
<vsizetype>7</vsizetype>
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="caption">
<string>recoll</string>
</property>
<widget class="QLayoutWidget">
<hbox>
<property name="name">
<cstring>layout7</cstring>
<cstring>unnamed</cstring>
</property>
<property name="geometry">
<rect>
<x>11</x>
<y>11</y>
<width>752</width>
<height>41</height>
</rect>
</property>
<hbox>
<widget class="QLayoutWidget">
<property name="name">
<cstring>unnamed</cstring>
<cstring>layout3</cstring>
</property>
<widget class="QLineEdit">
<vbox>
<property name="name">
<cstring>queryText</cstring>
<cstring>unnamed</cstring>
</property>
<property name="frameShape">
<enum>LineEditPanel</enum>
</property>
<property name="frameShadow">
<enum>Sunken</enum>
</property>
</widget>
<widget class="QPushButton">
<property name="name">
<cstring>Search</cstring>
</property>
<property name="text">
<string>pushButton1</string>
</property>
</widget>
<spacer>
<property name="name">
<cstring>spacer1</cstring>
</property>
<property name="orientation">
<enum>Horizontal</enum>
</property>
<property name="sizeType">
<enum>Expanding</enum>
</property>
<property name="sizeHint">
<size>
<width>40</width>
<height>20</height>
</size>
</property>
</spacer>
</hbox>
</widget>
<widget class="QSplitter">
<property name="name">
<cstring>splitter9</cstring>
</property>
<property name="geometry">
<rect>
<x>11</x>
<y>58</y>
<width>752</width>
<height>491</height>
</rect>
</property>
<property name="orientation">
<enum>Horizontal</enum>
</property>
<widget class="QTextEdit">
<property name="name">
<cstring>resTextEdit</cstring>
</property>
<widget class="QLayoutWidget">
<property name="name">
<cstring>layout2</cstring>
</property>
<hbox>
<property name="name">
<cstring>unnamed</cstring>
</property>
<widget class="QLineEdit">
<property name="name">
<cstring>queryText</cstring>
</property>
<property name="frameShape">
<enum>LineEditPanel</enum>
</property>
<property name="frameShadow">
<enum>Sunken</enum>
</property>
</widget>
<widget class="QPushButton">
<property name="name">
<cstring>Search</cstring>
</property>
<property name="text">
<string>Search</string>
</property>
</widget>
<spacer>
<property name="name">
<cstring>spacer1</cstring>
</property>
<property name="orientation">
<enum>Horizontal</enum>
</property>
<property name="sizeType">
<enum>Expanding</enum>
</property>
<property name="sizeHint">
<size>
<width>329</width>
<height>20</height>
</size>
</property>
</spacer>
</hbox>
</widget>
<widget class="QSplitter">
<property name="name">
<cstring>splitter9</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>7</hsizetype>
<vsizetype>7</vsizetype>
<horstretch>1</horstretch>
<verstretch>1</verstretch>
</sizepolicy>
</property>
<property name="minimumSize">
<size>
<width>0</width>
<height>0</height>
</size>
</property>
<property name="orientation">
<enum>Horizontal</enum>
</property>
<widget class="QTextEdit">
<property name="name">
<cstring>resTextEdit</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>7</hsizetype>
<vsizetype>7</vsizetype>
<horstretch>2</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="textFormat">
<enum>RichText</enum>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
<widget class="QSplitter">
<property name="name">
<cstring>splitter8</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>7</hsizetype>
<vsizetype>7</vsizetype>
<horstretch>3</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="orientation">
<enum>Vertical</enum>
</property>
<widget class="QTextEdit">
<property name="name">
<cstring>previewTextEdit</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>7</hsizetype>
<vsizetype>7</vsizetype>
<horstretch>0</horstretch>
<verstretch>2</verstretch>
</sizepolicy>
</property>
<property name="textFormat">
<enum>RichText</enum>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
<widget class="QTextEdit">
<property name="name">
<cstring>metaTextEdit</cstring>
</property>
<property name="sizePolicy">
<sizepolicy>
<hsizetype>7</hsizetype>
<vsizetype>7</vsizetype>
<horstretch>0</horstretch>
<verstretch>1</verstretch>
</sizepolicy>
</property>
<property name="textFormat">
<enum>RichText</enum>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
</widget>
</widget>
</vbox>
</widget>
<widget class="QSplitter">
<property name="name">
<cstring>splitter8</cstring>
</property>
<property name="orientation">
<enum>Vertical</enum>
</property>
<widget class="QTextEdit">
<property name="name">
<cstring>textEdit12</cstring>
</property>
</widget>
<widget class="QTextEdit">
<property name="name">
<cstring>textEdit13</cstring>
</property>
</widget>
</widget>
</widget>
</hbox>
</widget>
<menubar>
<property name="name">
@ -216,18 +287,24 @@
<receiver>RecollMain</receiver>
<slot>resTextEdit_clicked(int,int)</slot>
</connection>
<connection>
<sender>resTextEdit</sender>
<signal>returnPressed()</signal>
<receiver>RecollMain</receiver>
<slot>resTextEdit_returnPressed()</slot>
</connection>
<connection>
<sender>fileExitAction</sender>
<signal>activated()</signal>
<receiver>RecollMain</receiver>
<slot>fileExit()</slot>
</connection>
<connection>
<sender>queryText</sender>
<signal>returnPressed()</signal>
<receiver>RecollMain</receiver>
<slot>queryText_returnPressed()</slot>
</connection>
<connection>
<sender>Search</sender>
<signal>clicked()</signal>
<receiver>RecollMain</receiver>
<slot>Search_clicked()</slot>
</connection>
</connections>
<includes>
<include location="local" impldecl="in implementation">recollmain.ui.h</include>
@ -238,7 +315,8 @@
<slot>helpContents()</slot>
<slot>helpAbout()</slot>
<slot>resTextEdit_clicked( int par, int car )</slot>
<slot>resTextEdit_returnPressed()</slot>
<slot>queryText_returnPressed()</slot>
<slot>Search_clicked()</slot>
</slots>
<pixmapinproject/>
<layoutdefaults spacing="6" margin="11"/>

View File

@ -32,30 +32,126 @@ void RecollMain::helpAbout()
{
}
#include <qmessagebox.h>
#include "rcldb.h"
#include "rclconfig.h"
#include "debuglog.h"
#include "mimehandler.h"
extern RclConfig *rclconfig;
extern Rcl::Db *rcldb;
// Convert plain text to a Qt rich text document for display in the preview
// window.
//
// Newlines become <br>, and the characters which are significant in markup
// ('<', '>', '&') are replaced by entities so that document text can never
// be interpreted as tags. The returned document is well formed: every tag
// opened here is closed.
//
// A <mytag> element is wrapped around input characters 11 to 20 (0-based):
// this is a placeholder for search term highlighting. It is closed at the
// end of the text if the input is too short to reach the closing position.
//
// @param in plain text
// @return rich text document
static string plaintorich(const string &in)
{
    string out = "<qt><head><title></title></head><body><p>";
    bool mytag_open = false;
    for (string::size_type i = 0; i < in.length(); i++) {
        switch (in[i]) {
        case '\n': out += "<br>"; break;
        case '<': out += "&lt;"; break;
        case '>': out += "&gt;"; break;
        case '&': out += "&amp;"; break;
        default: out += in[i]; break;
        }
        if (i == 10) {
            out += "<mytag>";
            mytag_open = true;
        } else if (i == 20) {
            out += "</mytag>";
            mytag_open = false;
        }
    }
    if (mytag_open)
        out += "</mytag>";
    out += "</p></body></qt>";
    return out;
}
void RecollMain::resTextEdit_clicked( int par, int car )
{
fprintf(stderr, "Clicked at paragraph %d, char %d\n", par, car);
}
Rcl::Doc doc;
doc.erase();
if (rcldb->getDoc(par, doc)) {
// Go to the file system to retrieve / convert the document text
// for preview:
#include "qfontdialog.h"
// Look for appropriate handler
MimeHandlerFunc fun =
getMimeHandler(doc.mimetype, rclconfig->getMimeConf());
if (!fun) {
QMessageBox::warning(0, "Recoll",
QString("No mime handler for mime type ") +
doc.mimetype.c_str());
return;
}
#define BS 200000
void RecollMain::resTextEdit_returnPressed()
{
fprintf(stderr, "ReturnPressed()\n");
resTextEdit->setFont( QFontDialog::getFont( 0, resTextEdit->font() ) );
const char *fname = "utf8.txt";
FILE *fp = fopen(fname, "r");
if (fp) {
char buf[BS];
memset(buf,0, sizeof(buf));
int n = fread(buf, 1, BS-1, fp);
fclose(fp);
QString str = QString::fromUtf8(buf, n);
resTextEdit->setTextFormat(RichText);
resTextEdit->setText(str);
string fn = doc.url.substr(6, string::npos);
Rcl::Doc fdoc;
if (!fun(rclconfig, fn, doc.mimetype, fdoc)) {
QMessageBox::warning(0, "Recoll",
QString("Failed to convert document for preview!\n") +
fn.c_str() + " mimetype " +
doc.mimetype.c_str());
return;
}
string rich = plaintorich(fdoc.text);
#if 0
//Highlighting; pass a list of (search term, style name) to plaintorich
// and create the corresponding styles with different colors here
// We need to :
// - Break the query into terms : wait for the query analyzer
// - Break the text into words. This should use a version of
// textsplit with an option to keep the punctuation (see how to do
// this). We do want the same splitter code to be used here and
// when indexing.
QStyleSheetItem *item =
new QStyleSheetItem( previewTextEdit->styleSheet(), "mytag" );
item->setColor("red");
item->setFontWeight(QFont::Bold);
#endif
QString str = QString::fromUtf8(rich.c_str(), rich.length());
previewTextEdit->setTextFormat(RichText);
previewTextEdit->setText(str);
}
}
// Slot called when Return is pressed in the query field: run the query and
// show the results.
//
// Both the result list and the preview are cleared, the query text is handed
// to the database, and one paragraph holding the url is appended to the
// result list for every document returned. The preview for the first result
// is then displayed.
void RecollMain::queryText_returnPressed()
{
    LOGDEB(("RecollMain::queryText_returnPressed()\n"));

    resTextEdit->clear();
    previewTextEdit->clear();

    string rawq = queryText->text();
    rcldb->setQuery(rawq);
    Rcl::Doc doc;

    // Insert results if any in result list window
    QString result;
    resTextEdit->append("<qt><head></head><body>");
    for (int i = 0;; i++) {
        doc.erase();
        // getDoc() failing marks the end of the result list
        if (!rcldb->getDoc(i, doc))
            break;
        // Every format needs its %s conversion, else the value passed is
        // never printed
        LOGDEB(("Url: %s\n", doc.url.c_str()));
        LOGDEB(("Mimetype: %s\n", doc.mimetype.c_str()));
        LOGDEB(("Mtime: %s\n", doc.mtime.c_str()));
        LOGDEB(("Origcharset: %s\n", doc.origcharset.c_str()));
        LOGDEB(("Title: %s\n", doc.title.c_str()));
        LOGDEB(("Text: %s\n", doc.text.c_str()));
        LOGDEB(("Keywords: %s\n", doc.keywords.c_str()));
        LOGDEB(("Abstract: %s\n", doc.abstract.c_str()));

        result = "<p>" + doc.url + "</p>";
        resTextEdit->append(result);
    }
    resTextEdit->append("</body></qt>");

    // Display preview for 1st doc in list
    resTextEdit_clicked(0, 0);
}
// Slot for the clicked() signal of the Search button: does exactly what
// pressing Return in the query field does.
void RecollMain::Search_clicked()
{
queryText_returnPressed();
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: qtry.cpp,v 1.2 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: qtry.cpp,v 1.3 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
// Tests with the query interface
@ -62,20 +62,20 @@ int main(int argc, char **argv)
if (argc < 1)
Usage();
RclConfig *config = new RclConfig;
RclConfig *rclconfig = new RclConfig;
if (!config->ok())
if (!rclconfig->ok())
cerr << "Config could not be built" << endl;
string dbdir;
if (config->getConfParam(string("dbdir"), dbdir) == 0) {
if (rclconfig->getConfParam(string("dbdir"), dbdir) == 0) {
cerr << "No database directory in configuration" << endl;
exit(1);
}
Rcl::Db *db = new Rcl::Db;
Rcl::Db *rcldb = new Rcl::Db;
if (!db->open(dbdir, Rcl::Db::DbRO)) {
if (!rcldb->open(dbdir, Rcl::Db::DbRO)) {
fprintf(stderr, "Could not open database\n");
exit(1);
}
@ -84,12 +84,12 @@ int main(int argc, char **argv)
string query;
while (argc--)
query += string(*argv++) + " " ;
db->setQuery(query);
rcldb->setQuery(query);
int i = 0;
Rcl::Doc doc;
for (i=0;;i++) {
doc.erase();
if (!db->getDoc(i, doc))
if (!rcldb->getDoc(i, doc))
break;
cout << "Url: " << doc.url << endl;
@ -107,7 +107,7 @@ int main(int argc, char **argv)
// Look for appropriate handler
MimeHandlerFunc fun = getMimeHandler(doc.mimetype,
config->getMimeConf());
rclconfig->getMimeConf());
if (!fun) {
cout << "No mime handler !" << endl;
continue;
@ -116,7 +116,7 @@ int main(int argc, char **argv)
cout << "Filename: " << fn << endl;
Rcl::Doc fdoc;
if (!fun(config, fn, doc.mimetype, fdoc)) {
if (!fun(rclconfig, fn, doc.mimetype, fdoc)) {
cout << "Failed to convert/preview document!" << endl;
continue;
}
@ -125,7 +125,7 @@ int main(int argc, char **argv)
transcode(fdoc.text, printable, "UTF-8", outencoding);
cout << printable << endl;
}
delete db;
delete rcldb;
cerr << "Exiting" << endl;
exit(0);
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.7 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.8 2005-01-26 11:47:27 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <sys/stat.h>
@ -201,6 +201,7 @@ bool dumb_string(const string &in, string &out)
bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
{
LOGDEB(("Rcl::Db::add: fn %s\n", fn.c_str()));
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
@ -226,24 +227,29 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
string noacc;
if (!unac_cpp(doc.title, noacc)) {
LOGERR(("Rcl::Db::add: unac failed\n"));
return false;
}
splitter.text_to_words(noacc);
LOGDEB(("Rcl::Db::add: doc split\n"));
splitData.basepos += splitData.curpos + 100;
if (!dumb_string(doc.text, noacc)) {
LOGERR(("Rcl::Db::add: dum_string failed\n"));
return false;
}
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
if (!dumb_string(doc.keywords, noacc)) {
LOGERR(("Rcl::Db::add: dum_string failed\n"));
return false;
}
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
if (!dumb_string(doc.abstract, noacc)) {
LOGERR(("Rcl::Db::add: dum_string failed\n"));
return false;
}
splitter.text_to_words(noacc);
@ -263,20 +269,20 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
#if 0
if (did < updated.size()) {
updated[did] = true;
LOGDEB1(("%s updated\n", fnc));
LOGDEB(("%s updated\n", fnc));
} else {
LOGDEB1(("%s added\n", fnc));
LOGDEB(("%s added\n", fnc));
}
#endif
} catch (...) {
// FIXME: is this ever actually needed?
ndb->wdb.add_document(newdocument);
LOGDEB1(("%s added (failed re-seek for duplicate).\n", fnc));
LOGDEB(("%s added (failed re-seek for duplicate).\n", fnc));
}
} else {
try {
ndb->wdb.add_document(newdocument);
LOGDEB1(("%s added\n", fnc));
LOGDEB(("%s added\n", fnc));
} catch (...) {
LOGERR(("%s : Got exception while adding doc\n", fnc));
return false;

View File

@ -3,7 +3,7 @@ CXXFLAGS = -I.
BIGLIB = ../lib/librcl.a
PROGS = trfstreewalk trpathut execmd transcode
PROGS = trfstreewalk trpathut execmd transcode trmimeparse
all: $(PROGS)
FSTREEWALK_OBJS= trfstreewalk.o fstreewalk.o pathut.o
@ -30,5 +30,13 @@ transcode : $(TRANSCODE_OBJS)
trtranscode.o : ../utils/transcode.cpp
$(CXX) $(CXXFLAGS) -DTEST_TRANSCODE -c -o trtranscode.o \
transcode.cpp
MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB)
mimeparse : $(MIMEPARSE_OBJS)
$(CXX) $(CXXFLAGS) -o mimeparse $(MIMEPARSE_OBJS) \
-L/usr/local/lib -liconv
trmimeparse.o : ../utils/mimeparse.cpp
$(CXX) $(CXXFLAGS) -DTEST_MIMEPARSE -c -o trmimeparse.o \
mimeparse.cpp
clean:
rm -f *.o $(PROGS)