added external filters and pdf handling
This commit is contained in:
parent
cc512e2ec0
commit
d0aaf92220
84
src/filters/rclpdf
Executable file
84
src/filters/rclpdf
Executable file
@ -0,0 +1,84 @@
|
||||
#!/bin/sh
# @(#$Id: rclpdf,v 1.1 2005-02-01 17:20:05 dockes Exp $  (C) 2004 J.F.Dockes
# This is copied almost verbatim from Estraier:
#================================================================
# Estraier: a personal full-text search system
# Copyright (C) 2003-2004 Mikio Hirabayashi
#================================================================
#================================================================
# rclpdf
# Strip a file of PDF and extract its text as HTML.
#
# Usage: rclpdf infile
# The HTML document is written to standard output; diagnostics go to
# standard error so that they never get mixed into the document.
#================================================================

# set variables (force the C locale so that awk works on bytes)
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="rclpdf"

# show help message
if [ $# -ne 1 ] || [ "$1" = "--help" ]
then
  printf 'Strip a file of PDF and extract its text as HTML.\n'
  printf 'Usage: %s [infile]\n' "$progname"
  exit 1
fi

infile="$1"

# check the input file existence
if [ ! -f "$infile" ]
then
  printf '%s: %s: no such file\n' "$progname" "$infile" >&2
  exit 1
fi

# output the result:
#  - pdftotext produces a minimal HTML skeleton with the text inside <pre>
#  - iconv -c drops byte sequences which are not valid UTF-8
#  - awk escapes the text and turns the <pre> section into paragraphs
pdftotext -raw -htmlmeta -enc UTF-8 -eol unix -q "$infile" - |
  iconv -f UTF-8 -t UTF-8 -c -s |
  awk '
  # Replace the characters which are special in HTML with entities.
  # "\\&" yields a literal ampersand in a gsub replacement string.
  function htmlesc(s) {
    gsub(/&/, "\\&amp;", s)
    gsub(/</, "\\&lt;", s)
    gsub(/>/, "\\&gt;", s)
    return s
  }
  BEGIN {
    # esc is > 0 while we are inside the <pre> section (document text)
    esc = 0
  }
  {
    if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /title>$/){
      # Header: announce the charset, then re-emit a clean, escaped title
      printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
      gsub(/<[^>]*>/, "", $0)
      printf("<title>%s</title>\n", htmlesc($0))
    } else if($0 == "<pre>"){
      esc++
      printf("<p>")
    } else if($0 == "</pre>"){
      esc--
      printf("</p>\n")
    } else if($0 ~ /-$/){
      # Line ends with a hyphen: drop it and glue the next line to this one
      sub(/-$/, "", $0)
      if(esc > 0){
        $0 = htmlesc($0)
      }
      printf("%s", $0);
    } else if($0 == "\f"){
      # Form feed alone on a line: page break
      printf("</p>\n<hr>\n<p>")
    } else {
      if(esc > 0){
        $0 = htmlesc($0)
        gsub(/^ */, "", $0)
        gsub(/ *$/, "", $0)
      }
      print $0
    }
  }
'
# Code was suppressed at the last "print $0" above. It seemed to deal with
# multibyte characters being cut by a newline. It caused problems (it
# sometimes concatenated the last word of a line with the first word of
# the next one), and I didn't really understand its use, as iconv -c is
# supposed to fix the encoding anyway.

# exit normally
exit 0
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.1 2005-01-31 14:31:09 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.2 2005-02-01 17:20:05 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#include <sys/stat.h>
|
||||
|
||||
@ -105,8 +105,8 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
|
||||
}
|
||||
|
||||
// Look for appropriate handler
|
||||
MimeHandlerFunc fun = getMimeHandler(mime, me->config->getMimeConf());
|
||||
if (!fun) {
|
||||
MimeHandler *handler = getMimeHandler(mime, me->config->getMimeConf());
|
||||
if (!handler) {
|
||||
// No handler for this type, for now :(
|
||||
LOGDEB(("indexfile: %s : no handler\n", mime.c_str()));
|
||||
return FsTreeWalker::FtwOk;
|
||||
@ -115,14 +115,19 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
|
||||
LOGDEB(("indexfile: %s [%s]\n", mime.c_str(), fn.c_str()));
|
||||
|
||||
// Check db up to date ?
|
||||
if (!me->db.needUpdate(fn, stp))
|
||||
if (!me->db.needUpdate(fn, stp)) {
|
||||
delete handler;
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
|
||||
// Turn file into a document. The document has fields for title, body
|
||||
// etc., all text converted to utf8
|
||||
Rcl::Doc doc;
|
||||
if (!fun(me->config, fn, mime, doc))
|
||||
if (!handler->worker(me->config, fn, mime, doc)) {
|
||||
delete handler;
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
delete handler;
|
||||
|
||||
// Set up common fields:
|
||||
doc.mimetype = mime;
|
||||
@ -131,7 +136,7 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
|
||||
doc.mtime = ascdate;
|
||||
|
||||
// Do database-specific work to update document data
|
||||
if (!me->db.add(fn, doc))
|
||||
if (!me->db.add(fn, doc))
|
||||
return FsTreeWalker::FtwError;
|
||||
|
||||
return FsTreeWalker::FtwOk;
|
||||
|
||||
@ -32,13 +32,14 @@
|
||||
#include "mimeparse.h"
|
||||
#include "myhtmlparse.h"
|
||||
#include "indextext.h"
|
||||
#include "html.h"
|
||||
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
|
||||
bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
bool MimeHandlerHtml::worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
{
|
||||
LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
|
||||
string otext;
|
||||
@ -46,7 +47,13 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
return worker1(conf, fn, otext, mtype, docout);
|
||||
}
|
||||
|
||||
bool MimeHandlerHtml::worker1(RclConfig *conf, const string &fn,
|
||||
const string& htext,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
{
|
||||
// Character set handling:
|
||||
|
||||
// - We first try to convert from the default configured charset
|
||||
@ -57,7 +64,7 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
// instead of the configuration one.
|
||||
string charset;
|
||||
if (conf->guesscharset) {
|
||||
charset = csguess(otext, conf->defcharset);
|
||||
charset = csguess(htext, conf->defcharset);
|
||||
} else
|
||||
charset = conf->defcharset;
|
||||
|
||||
@ -69,10 +76,10 @@ bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
|
||||
MyHtmlParser p;
|
||||
// Try transcoding. If it fails, use original text.
|
||||
if (!transcode(otext, transcoded, charset, "UTF-8")) {
|
||||
if (!transcode(htext, transcoded, charset, "UTF-8")) {
|
||||
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
|
||||
charset.c_str()));
|
||||
transcoded = otext;
|
||||
transcoded = htext;
|
||||
// We don't know the charset, at all
|
||||
p.ocharset = p.charset = charset = "";
|
||||
} else {
|
||||
|
||||
14
src/internfile/mh_html.h
Normal file
14
src/internfile/mh_html.h
Normal file
@ -0,0 +1,14 @@
|
||||
#ifndef _HTML_H_INCLUDED_
|
||||
#define _HTML_H_INCLUDED_
|
||||
/* @(#$Id: mh_html.h,v 1.1 2005-02-01 17:20:05 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
#include "mimehandler.h"
|
||||
|
||||
class MimeHandlerHtml : public MimeHandler {
|
||||
public:
|
||||
virtual bool worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout);
|
||||
virtual bool worker1(RclConfig *conf, const string &fn,
|
||||
const string& htext,
|
||||
const string &mtype, Rcl::Doc &docout);
|
||||
};
|
||||
#endif /* _HTML_H_INCLUDED_ */
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.4 2005-01-29 15:41:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: mimehandler.cpp,v 1.5 2005-02-01 17:20:05 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <iostream>
|
||||
@ -12,9 +12,19 @@ using namespace std;
|
||||
#include "transcode.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
#include "html.h"
|
||||
#include "execmd.h"
|
||||
|
||||
bool textPlainToDoc(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
class MimeHandlerText : public MimeHandler {
|
||||
public:
|
||||
bool worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout);
|
||||
|
||||
};
|
||||
|
||||
// Process a plain text file
|
||||
bool MimeHandlerText::worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
{
|
||||
string otext;
|
||||
if (!file_to_string(fn, otext))
|
||||
@ -45,25 +55,51 @@ bool textPlainToDoc(RclConfig *conf, const string &fn,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Map of mime types to internal interner functions. This could just as well
|
||||
// be an if else if suite inside getMimeHandler(), but this is prettier ?
|
||||
static map<string, MimeHandlerFunc> ihandlers;
|
||||
// Static object to get the map to be initialized at program start.
|
||||
class IHandler_Init {
|
||||
class MimeHandlerExec : public MimeHandler {
|
||||
public:
|
||||
IHandler_Init() {
|
||||
ihandlers["text/plain"] = textPlainToDoc;
|
||||
ihandlers["text/html"] = textHtmlToDoc;
|
||||
// Add new associations here when needed
|
||||
}
|
||||
};
|
||||
static IHandler_Init ihandleriniter;
|
||||
list<string> params;
|
||||
virtual ~MimeHandlerExec() {}
|
||||
virtual bool worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout);
|
||||
|
||||
};
|
||||
|
||||
|
||||
// Execute an external program to translate a file from its native format
|
||||
// to html. Then call the html parser to do the actual indexing
|
||||
bool MimeHandlerExec::worker(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout)
|
||||
{
|
||||
string cmd = params.front();
|
||||
list<string>::iterator it = params.begin();
|
||||
list<string>myparams(++it, params.end());
|
||||
myparams.push_back(fn);
|
||||
|
||||
string html;
|
||||
ExecCmd exec;
|
||||
int status = exec.doexec(cmd, myparams, 0, &html);
|
||||
if (status) {
|
||||
LOGDEB(("MimeHandlerExec: command status 0x%x: %s\n",
|
||||
status, cmd.c_str()));
|
||||
return false;
|
||||
}
|
||||
MimeHandlerHtml hh;
|
||||
return hh.worker1(conf, fn, html, mtype, docout);
|
||||
}
|
||||
|
||||
// Factory for the internal handlers: allocate the handler object for the
// given mime type (compared through stringlowercmp). Returns 0 if there 
// is no internal handler for the type. The object comes from new.
static MimeHandler *mhfact(const string &mime)
{
    MimeHandler *created = 0;
    if (stringlowercmp("text/plain", mime) == 0) {
	created = new MimeHandlerText;
    } else if (stringlowercmp("text/html", mime) == 0) {
	created = new MimeHandlerHtml;
    }
    return created;
}
|
||||
|
||||
/**
|
||||
 * Return a handler object for the given mime type (to be deleted by the caller)
|
||||
*/
|
||||
MimeHandlerFunc getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
|
||||
MimeHandler *getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
|
||||
{
|
||||
// Return handler definition for mime type
|
||||
string hs;
|
||||
@ -82,25 +118,23 @@ MimeHandlerFunc getMimeHandler(const std::string &mtype, ConfTree *mhandlers)
|
||||
|
||||
// Retrieve handler function according to type
|
||||
if (!stringlowercmp("internal", toks[0])) {
|
||||
map<string, MimeHandlerFunc>::const_iterator it =
|
||||
ihandlers.find(mtype);
|
||||
if (it == ihandlers.end()) {
|
||||
LOGERR(("getMimeHandler: internal handler not found for %s\n",
|
||||
mtype.c_str()));
|
||||
return 0;
|
||||
}
|
||||
return it->second;
|
||||
return mhfact(mtype);
|
||||
} else if (!stringlowercmp("dll", toks[0])) {
|
||||
if (toks.size() != 2)
|
||||
return 0;
|
||||
return 0;
|
||||
} else if (!stringlowercmp("exec", toks[0])) {
|
||||
if (toks.size() != 2)
|
||||
if (toks.size() < 2) {
|
||||
LOGERR(("getMimeHandler: bad line for %s: %s\n", mtype.c_str(),
|
||||
hs.c_str()));
|
||||
return 0;
|
||||
return 0;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
MimeHandlerExec *h = new MimeHandlerExec;
|
||||
vector<string>::const_iterator it1 = toks.begin();
|
||||
it1++;
|
||||
for (;it1 != toks.end();it1++)
|
||||
h->params.push_back(*it1);
|
||||
return h;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@ -1,21 +1,29 @@
|
||||
#ifndef _MIMEHANDLER_H_INCLUDED_
|
||||
#define _MIMEHANDLER_H_INCLUDED_
|
||||
/* @(#$Id: mimehandler.h,v 1.3 2005-01-29 15:41:11 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: mimehandler.h,v 1.4 2005-02-01 17:20:05 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "rclconfig.h"
|
||||
#include "rcldb.h"
|
||||
|
||||
/* Definition for document interner functions */
|
||||
typedef bool (*MimeHandlerFunc)(RclConfig *, const std::string &,
|
||||
const std::string &, Rcl::Doc&);
|
||||
|
||||
/**
|
||||
* Return indexing handler function for given mime type
|
||||
* Document interner class. We sometimes have data to pass to an interner
|
||||
*/
|
||||
extern MimeHandlerFunc getMimeHandler(const std::string &mtype,
|
||||
ConfTree *mhandlers);
|
||||
class MimeHandler {
|
||||
public:
|
||||
virtual ~MimeHandler() {}
|
||||
virtual bool worker(RclConfig *, const std::string &filename,
|
||||
const std::string &mimetype, Rcl::Doc& outdoc) = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* Return indexing handler class for given mime type
|
||||
* returned pointer should be deleted by caller
|
||||
*/
|
||||
extern MimeHandler *getMimeHandler(const std::string &mtype,
|
||||
ConfTree *mhandlers);
|
||||
|
||||
/**
|
||||
* Return external viewer exec string for given mime type
|
||||
@ -23,7 +31,4 @@ extern MimeHandlerFunc getMimeHandler(const std::string &mtype,
|
||||
extern string getMimeViewer(const std::string &mtype,
|
||||
ConfTree *mhandlers);
|
||||
|
||||
extern bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
||||
const string &mtype, Rcl::Doc &docout);
|
||||
|
||||
#endif /* _MIMEHANDLER_H_INCLUDED_ */
|
||||
|
||||
@ -7,12 +7,14 @@ LIBS = librcl.a
|
||||
all: $(LIBS)
|
||||
|
||||
OBJS = conftree.o csguess.o debuglog.o \
|
||||
execmd.o \
|
||||
fstreewalk.o html.o htmlparse.o indexer.o \
|
||||
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
|
||||
rclconfig.o rcldb.o readfile.o smallut.o \
|
||||
textsplit.o transcode.o \
|
||||
unacpp.o unac.o
|
||||
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
|
||||
../utils/execmd.cpp \
|
||||
../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
|
||||
../index/indexer.cpp \
|
||||
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
|
||||
@ -35,6 +37,8 @@ csguess.o : ../index/csguess.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
debuglog.o : ../utils/debuglog.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
execmd.o : ../utils/execmd.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
fstreewalk.o : ../utils/fstreewalk.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $<
|
||||
html.o : ../common/html.cpp
|
||||
|
||||
2
src/mk/FreeBSD
Normal file
2
src/mk/FreeBSD
Normal file
@ -0,0 +1,2 @@
|
||||
CXXFLAGS = -pthread -Wall -g -I. -I../index -I../utils -I../common \
|
||||
-I../unac -I/usr/local/include
|
||||
16
src/qtgui/idxthread.h
Normal file
16
src/qtgui/idxthread.h
Normal file
@ -0,0 +1,16 @@
|
||||
#ifndef _IDXTHREAD_H_INCLUDED_
|
||||
#define _IDXTHREAD_H_INCLUDED_
|
||||
/* @(#$Id: idxthread.h,v 1.1 2005-02-01 17:20:05 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
// Only pointers to the configuration are used here: a forward
// declaration is enough.
class RclConfig;

// Start / stop the indexing thread itself. These do not start or stop
// indexing sessions.
void start_idxthread(RclConfig *cnf);
void stop_idxthread();

// Variables shared with the indexing thread.
// NOTE(review): presumably a request flag, a completion flag and the
// result of the last indexing pass -- confirm against the thread code.
extern int startindexing;
extern int indexingdone;
extern bool indexingstatus;
|
||||
|
||||
#endif /* _IDXTHREAD_H_INCLUDED_ */
|
||||
17
src/qtgui/recoll.h
Normal file
17
src/qtgui/recoll.h
Normal file
@ -0,0 +1,17 @@
|
||||
#ifndef _RECOLL_H_INCLUDED_
|
||||
#define _RECOLL_H_INCLUDED_
|
||||
/* @(#$Id: recoll.h,v 1.1 2005-02-01 17:20:05 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include "rclconfig.h"
|
||||
#include "rcldb.h"
|
||||
#include "idxthread.h"
|
||||
|
||||
extern void recollCleanup();
|
||||
|
||||
// Misc declarations in need of sharing between the UI files
|
||||
extern RclConfig *rclconfig;
|
||||
extern Rcl::Db *rcldb;
|
||||
|
||||
extern int recollNeedsExit;
|
||||
|
||||
#endif /* _RECOLL_H_INCLUDED_ */
|
||||
@ -146,9 +146,9 @@ void RecollMain::reslistTE_clicked(int par, int car)
|
||||
// for preview:
|
||||
|
||||
// Look for appropriate handler
|
||||
MimeHandlerFunc fun =
|
||||
MimeHandler *handler =
|
||||
getMimeHandler(doc.mimetype, rclconfig->getMimeConf());
|
||||
if (!fun) {
|
||||
if (!handler) {
|
||||
QMessageBox::warning(0, "Recoll",
|
||||
QString("No mime handler for mime type ") +
|
||||
doc.mimetype.c_str());
|
||||
@ -157,13 +157,15 @@ void RecollMain::reslistTE_clicked(int par, int car)
|
||||
|
||||
string fn = urltolocalpath(doc.url);
|
||||
Rcl::Doc fdoc;
|
||||
if (!fun(rclconfig, fn, doc.mimetype, fdoc)) {
|
||||
if (!handler->worker(rclconfig, fn, doc.mimetype, fdoc)) {
|
||||
QMessageBox::warning(0, "Recoll",
|
||||
QString("Failed to convert document for preview!\n") +
|
||||
fn.c_str() + " mimetype " +
|
||||
doc.mimetype.c_str());
|
||||
delete handler;
|
||||
return;
|
||||
}
|
||||
delete handler;
|
||||
|
||||
string rich = plaintorich(fdoc.text);
|
||||
|
||||
@ -188,7 +190,8 @@ void RecollMain::reslistTE_clicked(int par, int car)
|
||||
}
|
||||
|
||||
|
||||
// User asked to start query
|
||||
// User asked to start query. Run it and call listNextPB_clicked to display
|
||||
// first page of results
|
||||
void RecollMain::queryText_returnPressed()
|
||||
{
|
||||
LOGDEB(("RecollMain::queryText_returnPressed()\n"));
|
||||
@ -294,6 +297,7 @@ void RecollMain::listNextPB_clicked()
|
||||
struct tm *tm = localtime(&mtime);
|
||||
strftime(datebuf, 99, "<i>Modified:</i> %F %T", tm);
|
||||
}
|
||||
LOGDEB(("Abstract: %s\n", doc.abstract.c_str()));
|
||||
string result = "<p>" +
|
||||
string(perbuf) + " <b>" + doc.title + "</b><br>" +
|
||||
doc.mimetype + " " +
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.15 2005-02-01 08:42:55 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.16 2005-02-01 17:20:05 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <sys/stat.h>
|
||||
@ -210,8 +210,12 @@ bool dumb_string(const string &in, string &out)
|
||||
{
|
||||
string inter;
|
||||
out.erase();
|
||||
if (!unac_cpp(in, inter))
|
||||
if (in.empty())
|
||||
return true;
|
||||
if (!unac_cpp(in, inter)) {
|
||||
LOGERR(("unac_cpp failed for %s\n", in.c_str()));
|
||||
return false;
|
||||
}
|
||||
out.reserve(inter.length());
|
||||
for (unsigned int i = 0; i < inter.length(); i++) {
|
||||
if (inter[i] >= 'A' && inter[i] <= 'Z') {
|
||||
@ -226,13 +230,55 @@ bool dumb_string(const string &in, string &out)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
|
||||
/* omindex direct */
/**
 * Truncate a string to a given maximum length, avoiding cutting off
 * midword.
 *
 * @param input  text to shorten. It is not modified.
 * @param maxlen maximum number of bytes to keep from input.
 * @return input itself if it is no longer than maxlen. Else the first
 *   maxlen bytes of input, cut back to just before the last separator
 *   character found in them (nothing is kept if there is none), followed
 *   by " ...". In both cases newlines are replaced with spaces.
 */
std::string
truncate_to_word(const std::string &input, std::string::size_type maxlen)
{
    std::string output;
    if (input.length() <= maxlen) {
	output = input;
    } else {
	output = input.substr(0, maxlen);
	const char *SEPAR = " \t\n\r-:.;,/[]{}";
	std::string::size_type space = output.find_last_of(SEPAR);
	// Original version only truncated at space if space was found after
	// maxlen/2. But we HAVE to truncate at space, else we'd need to do
	// utf8 stuff to avoid truncating in the middle of a multibyte char.
	// In any case, not finding a space means that the text probably has
	// no value. Except probably for Asian languages, so we may want to
	// fix this one day.
	if (space == std::string::npos) {
	    output.erase();
	} else {
	    output.erase(space);
	}
	output += " ...";
    }

    // Replace newlines with spaces
    for (std::string::size_type i = output.find('\n'); 
	 i != std::string::npos; i = output.find('\n', i)) {
	output[i] = ' ';
    }
    return output;
}
|
||||
|
||||
bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
{
|
||||
LOGDEB(("Rcl::Db::add: fn %s %s\n", fn.c_str(), idoc.text.c_str()));
|
||||
if (pdata == 0)
|
||||
return false;
|
||||
Native *ndb = (Native *)pdata;
|
||||
|
||||
Rcl::Doc doc = idoc;
|
||||
if (doc.abstract.empty())
|
||||
doc.abstract = truncate_to_word(doc.text, 100);
|
||||
else
|
||||
doc.abstract = truncate_to_word(doc.abstract, 100);
|
||||
doc.title = truncate_to_word(doc.title, 100);
|
||||
doc.keywords = truncate_to_word(doc.keywords, 300);
|
||||
|
||||
Xapian::Document newdocument;
|
||||
|
||||
wsData splitData(newdocument);
|
||||
@ -248,21 +294,21 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
|
||||
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
if (!dumb_string(doc.text, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: dum_string failed\n"));
|
||||
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
if (!dumb_string(doc.keywords, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: dum_string failed\n"));
|
||||
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
if (!dumb_string(doc.abstract, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: dum_string failed\n"));
|
||||
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
@ -271,7 +317,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
|
||||
string pathterm = "P" + fn;
|
||||
newdocument.add_term(pathterm);
|
||||
const char *fnc = fn.c_str();
|
||||
|
||||
|
||||
// Document data record. omindex has the following nl separated fields:
|
||||
// - url
|
||||
// - sample
|
||||
@ -288,6 +334,20 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
|
||||
LOGDEB(("Newdocument data: %s\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
|
||||
|
||||
time_t mtime = atol(doc.mtime.c_str());
|
||||
struct tm *tm = localtime(&mtime);
|
||||
char buf[9];
|
||||
sprintf(buf, "%04d%02d%02d",tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
||||
newdocument.add_term("D" + string(buf)); // Date (YYYYMMDD)
|
||||
buf[7] = '\0';
|
||||
if (buf[6] == '3') buf[6] = '2';
|
||||
newdocument.add_term("W" + string(buf)); // "Weak" - 10ish day interval
|
||||
buf[6] = '\0';
|
||||
newdocument.add_term("M" + string(buf)); // Month (YYYYMM)
|
||||
buf[4] = '\0';
|
||||
newdocument.add_term("Y" + string(buf)); // Year (YYYY)
|
||||
|
||||
// If this document has already been indexed, update the existing
|
||||
// entry.
|
||||
try {
|
||||
|
||||
@ -2,7 +2,7 @@ include ../mk/FreeBSD
|
||||
|
||||
BIGLIB = ../lib/librcl.a
|
||||
|
||||
PROGS = smallut trfstreewalk trpathut execmd transcode trmimeparse
|
||||
PROGS = smallut trfstreewalk trpathut transcode trmimeparse trexecmd
|
||||
all: $(PROGS)
|
||||
|
||||
FSTREEWALK_OBJS= trfstreewalk.o fstreewalk.o pathut.o
|
||||
@ -16,11 +16,13 @@ PATHUT_OBJS= trpathut.o pathut.o
|
||||
trpathut : $(PATHUT_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
|
||||
trpathut.o : pathut.cpp pathut.h
|
||||
$(CXX) -o trpathut.o -c $(CXXFLAGS) \
|
||||
-DTEST_PATHUT pathut.cpp
|
||||
$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp
|
||||
|
||||
execmd: pathut.o
|
||||
$(CXX) -o execmd $(CXXFLAGS) execmd.cpp pathut.o
|
||||
EXECMD_OBJS= trexecmd.o $(BIGLIB)
|
||||
trexecmd : $(EXECMD_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o trexecmd $(EXECMD_OBJS)
|
||||
trexecmd.o : execmd.cpp execmd.h
|
||||
$(CXX) -o trexecmd.o -c $(CXXFLAGS) -DTEST_EXECMD execmd.cpp
|
||||
|
||||
TRANSCODE_OBJS= trtranscode.o $(BIGLIB)
|
||||
transcode : $(TRANSCODE_OBJS)
|
||||
@ -31,12 +33,13 @@ trtranscode.o : ../utils/transcode.cpp
|
||||
transcode.cpp
|
||||
|
||||
MIMEPARSE_OBJS= trmimeparse.o $(BIGLIB)
|
||||
mimeparse : $(MIMEPARSE_OBJS)
|
||||
trmimeparse : $(MIMEPARSE_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o mimeparse $(MIMEPARSE_OBJS) \
|
||||
-L/usr/local/lib -liconv
|
||||
trmimeparse.o : ../utils/mimeparse.cpp
|
||||
trmimeparse.o : mimeparse.cpp
|
||||
$(CXX) $(CXXFLAGS) -DTEST_MIMEPARSE -c -o trmimeparse.o \
|
||||
mimeparse.cpp
|
||||
|
||||
SMALLUT_OBJS= trsmallut.o $(BIGLIB)
|
||||
smallut : $(SMALLUT_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o smallut $(SMALLUT_OBJS) \
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.2 2004-12-14 17:54:16 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: execmd.cpp,v 1.3 2005-02-01 17:20:06 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#ifndef TEST_EXECMD
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/select.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <list>
|
||||
#include <string>
|
||||
@ -15,6 +16,7 @@ static char rcsid[] = "@(#$Id: execmd.cpp,v 1.2 2004-12-14 17:54:16 dockes Exp $
|
||||
|
||||
#include "execmd.h"
|
||||
#include "pathut.h"
|
||||
#include "debuglog.h"
|
||||
|
||||
using namespace std;
|
||||
#define MAX(A,B) (A>B?A:B)
|
||||
@ -23,15 +25,25 @@ int
|
||||
ExecCmd::doexec(const string &cmd, const list<string> args,
|
||||
const string *input, string *output)
|
||||
{
|
||||
{
|
||||
string command = cmd + " ";
|
||||
for (list<string>::const_iterator it = args.begin();it != args.end();
|
||||
it++) {
|
||||
command += "{" + *it + "} ";
|
||||
}
|
||||
LOGDEB(("ExecCmd::doexec: %s\n", command.c_str()));
|
||||
}
|
||||
|
||||
int pipein[2]; // subproc input
|
||||
int pipeout[2]; // subproc output
|
||||
pipein[0] = pipein[1] = pipeout[0] = pipeout[1] = -1;
|
||||
|
||||
if (input && pipe(pipein) < 0) {
|
||||
LOGERR(("ExecCmd::doexec: pipe(2) failed. errno %d\n", errno));
|
||||
return -1;
|
||||
}
|
||||
if (output && pipe(pipeout) < 0) {
|
||||
LOGERR(("ExecCmd::doexec: pipe(2) failed. errno %d\n", errno));
|
||||
close(pipein[0]);
|
||||
close(pipein[1]);
|
||||
return -1;
|
||||
@ -39,6 +51,7 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
|
||||
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) {
|
||||
LOGERR(("ExecCmd::doexec: fork(2) failed. errno %d\n", errno));
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -71,17 +84,20 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
|
||||
//cerr << "pipein[1] "<< pipein[1] << " pipeout[0] " <<
|
||||
//pipeout[0] << " nfds " << nfds << endl;
|
||||
if (select(nfds, &readfds, &writefds, 0, 0) <= 0) {
|
||||
perror("select");
|
||||
LOGERR(("ExecCmd::doexec: select(2) failed. errno %d\n",
|
||||
errno));
|
||||
break;
|
||||
}
|
||||
if (pipein[1] >= 0 && FD_ISSET(pipein[1], &writefds)) {
|
||||
int n = write(pipein[1], input->c_str()+nwritten,
|
||||
input->length() - nwritten);
|
||||
if (n < 0) {
|
||||
LOGERR(("ExecCmd::doexec: write(2) failed. errno %d\n",
|
||||
errno));
|
||||
goto out;
|
||||
}
|
||||
nwritten += n;
|
||||
if (nwritten == input->length()) {
|
||||
if (nwritten == (int)input->length()) {
|
||||
// cerr << "Closing output" << endl;
|
||||
close(pipein[1]);
|
||||
pipein[1] = -1;
|
||||
@ -93,7 +109,8 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
|
||||
if (n == 0) {
|
||||
goto out;
|
||||
} else if (n < 0) {
|
||||
perror("read");
|
||||
LOGERR(("ExecCmd::doexec: read(2) failed. errno %d\n",
|
||||
errno));
|
||||
goto out;
|
||||
} else if (n > 0) {
|
||||
// cerr << "READ: " << n << endl;
|
||||
@ -114,6 +131,7 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
|
||||
close(pipeout[0]);
|
||||
if (pipeout[1] >= 0)
|
||||
close(pipeout[1]);
|
||||
LOGDEB(("ExecCmd::doexec: father got status 0x%x\n", status));
|
||||
return status;
|
||||
} else {
|
||||
if (input) {
|
||||
@ -130,10 +148,12 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
|
||||
pipeout[0] = -1;
|
||||
if (pipeout[1] != 1) {
|
||||
if (dup2(pipeout[1], 1) < 0) {
|
||||
perror("dup2");
|
||||
LOGERR(("ExecCmd::doexec: dup2(2) failed. errno %d\n",
|
||||
errno));
|
||||
}
|
||||
if (close(pipeout[1]) < 0) {
|
||||
perror("close");
|
||||
LOGERR(("ExecCmd::doexec: close(2) failed. errno %d\n",
|
||||
errno));
|
||||
}
|
||||
pipeout[1] = -1;
|
||||
}
|
||||
@ -148,7 +168,8 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
|
||||
Ccharp *argv;
|
||||
argv = (Ccharp *)malloc((i+2) * sizeof(char *));
|
||||
if (argv == 0) {
|
||||
cerr << "Malloc error" << endl;
|
||||
LOGERR(("ExecCmd::doexec: malloc() failed. errno %d\n",
|
||||
errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -165,17 +186,31 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
|
||||
while (argv[i]) cerr << argv[i++] << endl;}
|
||||
#endif
|
||||
|
||||
LOGDEB(("ExecCmd::doexec: execvp(%s)\n", cmd.c_str()));
|
||||
execvp(cmd.c_str(), (char *const*)argv);
|
||||
// Hu ho
|
||||
//cerr << "Exec failed" << endl;
|
||||
exit(1);
|
||||
LOGERR(("ExecCmd::doexec: execvp(%s) failed. errno %d\n", cmd.c_str(),
|
||||
errno));
|
||||
exit(128);
|
||||
}
|
||||
}
|
||||
|
||||
#else // TEST
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include "debuglog.h"
|
||||
using namespace std;
|
||||
|
||||
#include "execmd.h"
|
||||
|
||||
const char *data = "Une ligne de donnees\n";
|
||||
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
||||
DebugLog::setfilename("stderr");
|
||||
if (argc < 2) {
|
||||
cerr << "Usage: execmd cmd arg1 arg2 ..." << endl;
|
||||
exit(1);
|
||||
@ -191,7 +226,8 @@ int main(int argc, const char **argv)
|
||||
string *ip = 0;
|
||||
//ip = &input;
|
||||
int status = mexec.doexec(cmd, l, ip, &output);
|
||||
cout << "Status: " << status << endl;
|
||||
fprintf(stderr, "Status: 0x%x\n", status);
|
||||
cout << "Output:" << output << endl;
|
||||
exit (status >> 8);
|
||||
}
|
||||
#endif // TEST
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user