diff --git a/src/VERSION b/src/VERSION index 2eb3c4fe..5a2a5806 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.5 +0.6 diff --git a/src/filters/rcldoc b/src/filters/rcldoc new file mode 100755 index 00000000..aaae036b --- /dev/null +++ b/src/filters/rcldoc @@ -0,0 +1,74 @@ +#!/bin/sh +# @(#$Id: rcldoc,v 1.1 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes +# Parts taken from Estraier: +#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# rcldoc +# Extract text from an msword file by executing either antiword +# (or wvware maybe if we need it one day) +# +# The default is to use antiword, the code would need modifications to +# work with wvWare +# +#================================================================ + + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rcldoc" +decoder="antiword -i -1 -m UTF-8" +# Not ready to use this for now (it outputs html, so the code below has to +# be simplified.) +#decoder="wvWare -1 -c UTF-8" + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + printf 'Convert a word file to unformatted HTML text.\n' + printf 'Usage: %s [infile]\n' "$progname" + exit 1 +fi + +infile="$1" + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +# output the result +$decoder "$infile" | +awk ' +BEGIN { + printf("
"); + esc = 1 +} +{ + if ($0 ~ /-$/) { + sub(/-$/, "", $0) + printf("%s", $0); + } else if($0 == "\f") { + printf("
\n") + } else { + if(esc > 0) { + gsub(/&/, "\\&", $0) + gsub(/, "\\<", $0) + gsub(/>/, "\\>", $0) + } + print $0 + } +} +END { + printf("
\n"); +}' | iconv -f UTF-8 -t UTF-8 -c -s + +# exit normally +exit 0 diff --git a/src/filters/rclsoff b/src/filters/rclsoff new file mode 100755 index 00000000..6d8c3499 --- /dev/null +++ b/src/filters/rclsoff @@ -0,0 +1,125 @@ +#!/bin/sh +# @(#$Id: rclsoff,v 1.1 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes +# Parts taken from Estraier: +#================================================================ +# Estraier: a personal full-text search system +# Copyright (C) 2003-2004 Mikio Hirabayashi +#================================================================ +#================================================================ +# rclsoff +# Extract text from an openoffice/soffice file +# +#================================================================ + + +# set variables +LANG=C ; export LANG +LC_ALL=C ; export LC_ALL +progname="rclsoff" + + +# show help message +if test $# -ne 1 -o "$1" = "--help" +then + printf 'Convert an openoffice file to unformatted HTML text.\n' + printf 'Usage: %s [infile]\n' "$progname" + exit 1 +fi + +infile="$1" + +# check the input file existence +if test ! -f "$infile" +then + printf '%s: %s: no such file\n' "$progname" "$infile" + exit 1 +fi + +# We need a temporary directory +if test z"$RECOLL_TMPDIR" != z; then + ttdir=$RECOLL_TMPDIR +elif test z"$TMPDIR" != z ; then + ttdir=$TMPDIR +else + ttdir=/tmp +fi +tmpdir=$ttdir/rclsoff_tmp$$ +mkdir $tmpdir || exit 1 +mkdir $tmpdir/rclsofftmp || exit 1 + +cleanup() +{ + # Note that we're using a constant part (rclsofftmp), that hopefully + # guarantees that we can't do big mistakes here. + rm -rf $tmpdir/rclsofftmp + rmdir $tmpdir +} + +trap cleanup EXIT SIGHUP SIGQUIT SIGINT SIGTERM + +# Unzip the input file and change to the unzipped directory +unzip -q -d $tmpdir/rclsofftmp $infile +cd $tmpdir/rclsofftmp + +# Note: there can be newlines inside the description field, we don't want +# them... 
+descsedprog='/' + +echo "$content" | sed -e "s/'/'/g" -e 's/"/"/g' |\ +awk ' +BEGIN { + esc = 1 +} +{ + if ($0 ~ /-$/) { + sub(/-$/, "", $0) + printf("%s", $0); + } else if($0 == "\f") { + printf("
\n")
+ } else {
+ if(esc > 0) {
+ gsub(/&/, "\\&", $0)
+ gsub(/, "\\<", $0)
+ gsub(/>/, "\\>", $0)
+ }
+ printf("%s
", $0)
+ }
+}
+END {
+ printf("
tags)
void
MyHtmlParser::process_text(const string &text)
{
if (!in_script_tag && !in_style_tag) {
+#if 0
string::size_type b = 0;
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
if (pending_space || b != 0)
@@ -45,6 +50,11 @@ MyHtmlParser::process_text(const string &text)
dump += text.substr(b, e - b);
b = e + 1;
}
+#else
+ if (pending_space)
+ dump += ' ';
+ dump += text;
+#endif
}
}
diff --git a/src/lib/Makefile b/src/lib/Makefile
index 08aff159..3c1673f4 100644
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@@ -7,14 +7,14 @@ LIBS = librcl.a
all: $(LIBS)
OBJS = conftree.o csguess.o debuglog.o \
- execmd.o \
+ execmd.o wipedir.o \
fstreewalk.o html.o htmlparse.o indexer.o internfile.o \
mimehandler.o mimeparse.o mimetype.o myhtmlparse.o pathut.o \
rclconfig.o rcldb.o readfile.o smallut.o \
textsplit.o transcode.o \
unacpp.o unac.o
SRCS = ../utils/conftree.cpp ../index/csguess.cpp ../utils/debuglog.cpp \
- ../utils/execmd.cpp \
+ ../utils/execmd.cpp ../utils/wipedir.cpp \
../utils/fstreewalk.cpp ../common/html.cpp ../common/htmlparse.cpp \
../index/indexer.cpp ../common/internfile.cpp \
../common/mimehandler.cpp ../utils/mimeparse.cpp ../index/mimetype.cpp \
@@ -39,6 +39,8 @@ debuglog.o : ../utils/debuglog.cpp
$(CXX) $(CXXFLAGS) -c $<
execmd.o : ../utils/execmd.cpp
$(CXX) $(CXXFLAGS) -c $<
+wipedir.o : ../utils/wipedir.cpp
+ $(CXX) $(CXXFLAGS) -c $<
fstreewalk.o : ../utils/fstreewalk.cpp
$(CXX) $(CXXFLAGS) -c $<
html.o : ../common/html.cpp
diff --git a/src/qtgui/main.cpp b/src/qtgui/main.cpp
index 7c03cb1f..db8ca82d 100644
--- a/src/qtgui/main.cpp
+++ b/src/qtgui/main.cpp
@@ -10,10 +10,13 @@
#include "rclconfig.h"
#include "pathut.h"
#include "recoll.h"
+#include "smallut.h"
+#include "wipedir.h"
RclConfig *rclconfig;
Rcl::Db *rcldb;
int recollNeedsExit;
+string tmpdir;
void recollCleanup()
@@ -23,10 +26,15 @@ void recollCleanup()
rcldb = 0;
delete rclconfig;
rclconfig = 0;
+ if (tmpdir.length()) {
+ wipedir(tmpdir);
+ rmdir(tmpdir.c_str());
+ tmpdir.erase();
+ }
}
-static void sigcleanup(int sig)
+static void sigcleanup(int)
{
fprintf(stderr, "sigcleanup\n");
// Cant call exit from here, because the atexit cleanup does some
@@ -71,6 +79,13 @@ int main( int argc, char ** argv )
QString("No db directory in configuration"));
exit(1);
}
+
+ if (!maketmpdir(tmpdir)) {
+ QMessageBox::critical(0, "Recoll",
+ QString("Cannot create temporary directory"));
+ exit(1);
+ }
+
dbdir = path_tildexpand(dbdir);
rcldb = new Rcl::Db;
diff --git a/src/qtgui/recoll.h b/src/qtgui/recoll.h
index d7c11b42..f607c79b 100644
--- a/src/qtgui/recoll.h
+++ b/src/qtgui/recoll.h
@@ -1,6 +1,6 @@
#ifndef _RECOLL_H_INCLUDED_
#define _RECOLL_H_INCLUDED_
-/* @(#$Id: recoll.h,v 1.1 2005-02-01 17:20:05 dockes Exp $ (C) 2004 J.F.Dockes */
+/* @(#$Id: recoll.h,v 1.2 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes */
#include "rclconfig.h"
#include "rcldb.h"
@@ -11,6 +11,7 @@ extern void recollCleanup();
// Misc declarations in need of sharing between the UI files
extern RclConfig *rclconfig;
extern Rcl::Db *rcldb;
+extern string tmpdir;
extern int recollNeedsExit;
diff --git a/src/qtgui/recollmain.ui.h b/src/qtgui/recollmain.ui.h
index 13d937f4..02b08413 100644
--- a/src/qtgui/recollmain.ui.h
+++ b/src/qtgui/recollmain.ui.h
@@ -95,33 +95,25 @@ static string plaintorich(const string &in, const list& terms,
myTextSplitCB cb(terms);
TextSplit splitter(&cb, true);
splitter.text_to_words(in);
- string out1;
- if (cb.tboffs.empty()) {
- out1 = in;
- } else {
- list >::iterator it = cb.tboffs.begin();
- for (unsigned int i = 0; i < in.length() ; i++) {
- if (it != cb.tboffs.end()) {
- if (i == (unsigned int)it->first) {
- out1 += "";
- } else if (i == (unsigned int)it->second) {
- if (it != cb.tboffs.end())
- it++;
- out1 += " ";
- }
- }
- out1 += in[i];
- }
- }
string out = "";
- for (string::const_iterator it = out1.begin();it != out1.end(); it++) {
- if (*it == '\n') {
- out += "
";
- // out += '\n';
+ list >::iterator it = cb.tboffs.begin();
+ for (unsigned int i = 0; i < in.length(); i++) {
+ if (it != cb.tboffs.end()) {
+ if (i == (unsigned int)it->first) {
+ out += "";
+ } else if (i == (unsigned int)it->second) {
+ if (it != cb.tboffs.end())
+ it++;
+ out += " ";
+ }
+ }
+ if (in[i] == '\n') {
+ out += "
\n";
} else {
- out += *it;
+ out += in[i];
}
}
+
termoffsets = cb.tboffs;
return out;
}
@@ -208,7 +200,7 @@ void RecollMain::reslistTE_clicked(int par, int car)
// for preview:
string fn = urltolocalpath(doc.url);
Rcl::Doc fdoc;
- if (!internfile(fn, rclconfig, fdoc)) {
+ if (!internfile(fn, rclconfig, fdoc, tmpdir)) {
QMessageBox::warning(0, "Recoll",
QString("Can't turn doc into internal rep ") +
doc.mimetype.c_str());
diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf
index f39887d1..0280da51 100644
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@@ -1,4 +1,4 @@
-# @(#$Id: mimeconf,v 1.2 2005-02-04 09:30:44 dockes Exp $ (C) 2004 J.F.Dockes
+# @(#$Id: mimeconf,v 1.3 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes
# Recoll : associations of mime types to processing filters.
# There are different sections for decompression, 'interning' for indexing
@@ -29,6 +29,18 @@ text/plain = internal
text/html = internal
application/pdf = exec rclpdf
application/postscript = exec rclps
+application/msword = exec rcldoc
+
+application/vnd.sun.xml.calc = exec rclsoff
+application/vnd.sun.xml.calc.template = exec rclsoff
+application/vnd.sun.xml.draw = exec rclsoff
+application/vnd.sun.xml.draw.template = exec rclsoff
+application/vnd.sun.xml.impress = exec rclsoff
+application/vnd.sun.xml.impress.template = exec rclsoff
+application/vnd.sun.xml.math = exec rclsoff
+application/vnd.sun.xml.writer = exec rclsoff
+application/vnd.sun.xml.writer.global = exec rclsoff
+application/vnd.sun.xml.writer.template = exec rclsoff
##
# External viewers, launched when you double-click a result entry
@@ -37,3 +49,15 @@ text/plain = xemacs %f
text/html = firefox -a firefox -remote "openFile(%u)"
application/pdf = xpdf %f
application/postscript = gv %f
+application/msword = openoffice-1.1.3-swriter %f
+
+application/vnd.sun.xml.calc = openoffice-1.1.3 %f
+application/vnd.sun.xml.calc.template = openoffice-1.1.3 %f
+application/vnd.sun.xml.draw = openoffice-1.1.3 %f
+application/vnd.sun.xml.draw.template = openoffice-1.1.3 %f
+application/vnd.sun.xml.impress = openoffice-1.1.3 %f
+application/vnd.sun.xml.impress.template = openoffice-1.1.3 %f
+application/vnd.sun.xml.math = openoffice-1.1.3 %f
+application/vnd.sun.xml.writer = openoffice-1.1.3 %f
+application/vnd.sun.xml.writer.global = openoffice-1.1.3 %f
+application/vnd.sun.xml.writer.template = openoffice-1.1.3 %f
diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap
index 56d87963..23c6a1ab 100644
--- a/src/sampleconf/mimemap
+++ b/src/sampleconf/mimemap
@@ -1,4 +1,4 @@
-# @(#$Id: mimemap,v 1.2 2005-02-04 09:30:44 dockes Exp $ (C) 2004 J.F.Dockes
+# @(#$Id: mimemap,v 1.3 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes
# Recoll: associations of file name extensions to mime types
.txt = text/plain
@@ -20,5 +20,25 @@
.Z = application/x-gzip
.bz2 = application/x-bzip2
+.doc = application/msword
+
+.sxc = application/vnd.sun.xml.calc
+.stc = application/vnd.sun.xml.calc.template
+.sxd = application/vnd.sun.xml.draw
+.std = application/vnd.sun.xml.draw.template
+.sxi = application/vnd.sun.xml.impress
+.sti = application/vnd.sun.xml.impress.template
+.sxm = application/vnd.sun.xml.math
+.sxw = application/vnd.sun.xml.writer
+.sxg = application/vnd.sun.xml.writer.global
+.stw = application/vnd.sun.xml.writer.template
+
+.wpd = application/vnd.wordperfect
+.rtf = text/rtf
+
+
+# A list of stuff that we don't want to touch at all
+recoll_noindex = .tar.gz .tgz .tar.bz2 .tbz
+
[FILE]
# This section for future non suffix-based extension (ie detect mail folders)
diff --git a/src/utils/Makefile b/src/utils/Makefile
index d62f6c9a..f35fa2ca 100644
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@@ -2,7 +2,7 @@ include ../mk/sysconf
BIGLIB = ../lib/librcl.a
-PROGS = smallut trfstreewalk trpathut transcode trmimeparse trexecmd
+PROGS = wipedir smallut trfstreewalk trpathut transcode trmimeparse trexecmd
all: $(PROGS)
FSTREEWALK_OBJS= trfstreewalk.o fstreewalk.o pathut.o
@@ -44,5 +44,12 @@ smallut : $(SMALLUT_OBJS)
trsmallut.o : ../utils/smallut.cpp
$(CXX) $(CXXFLAGS) -DTEST_SMALLUT -c -o trsmallut.o \
smallut.cpp
+
+WIPEDIR_OBJS= trwipedir.o $(BIGLIB)
+wipedir : $(WIPEDIR_OBJS)
+ $(CXX) $(CXXFLAGS) -o wipedir $(WIPEDIR_OBJS) $(LIBICONV)
+trwipedir.o : ../utils/wipedir.cpp
+ $(CXX) $(CXXFLAGS) -DTEST_WIPEDIR -c -o trwipedir.o \
+ wipedir.cpp
clean:
rm -f *.o $(PROGS)
diff --git a/src/utils/smallut.cpp b/src/utils/smallut.cpp
index cfad866b..ca404ff9 100644
--- a/src/utils/smallut.cpp
+++ b/src/utils/smallut.cpp
@@ -1,14 +1,53 @@
#ifndef lint
-static char rcsid[] = "@(#$Id: smallut.cpp,v 1.2 2005-02-04 09:39:44 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: smallut.cpp,v 1.3 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_SMALLUT
#include
#include
+#include
+#include
+#include
#include "smallut.h"
+#include "debuglog.h"
+#include "pathut.h"
#define MIN(A,B) ((A)<(B)?(A):(B))
+// Create a private (0700) temporary directory under $RECOLL_TMPDIR, else
+// $TMPDIR, else /tmp. On success tdir holds its path and true is returned;
+// on failure the error is logged, tdir is erased and false is returned.
+bool maketmpdir(string& tdir)
+{
+    const char *tmpdir = getenv("RECOLL_TMPDIR");
+    if (!tmpdir)
+        tmpdir = getenv("TMPDIR");
+    if (!tmpdir)
+        tmpdir = "/tmp";
+    tdir = tmpdir;
+    path_cat(tdir, "rcltmpXXXXXX");
+
+    // mkdtemp() rewrites the template in place, so it needs a writable copy.
+    char *cp = strdup(tdir.c_str());
+    if (!cp) {
+        LOGERR(("maketmpdir: out of memory (for file name !)\n"));
+        tdir.erase();
+        return false;
+    }
+    // Choose the name and create the directory in one atomic step; the
+    // mktemp() + mkdir() pair is deprecated and can lose a race for the name.
+    const bool created = mkdtemp(cp) != 0;
+    if (created)
+        tdir = cp;
+    free(cp);
+    if (!created) {
+        LOGERR(("maketmpdir: mkdtemp %s failed\n", tdir.c_str()));
+        tdir.erase();
+        return false;
+    }
+    return true;
+}
+
int stringicmp(const string & s1, const string& s2)
{
string::const_iterator it1 = s1.begin();
diff --git a/src/utils/smallut.h b/src/utils/smallut.h
index 9129f37c..db500713 100644
--- a/src/utils/smallut.h
+++ b/src/utils/smallut.h
@@ -1,6 +1,6 @@
#ifndef _SMALLUT_H_INCLUDED_
#define _SMALLUT_H_INCLUDED_
-/* @(#$Id: smallut.h,v 1.2 2005-02-04 09:39:44 dockes Exp $ (C) 2004 J.F.Dockes */
+/* @(#$Id: smallut.h,v 1.3 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes */
#include
using std::string;
@@ -9,4 +9,6 @@ extern int stringicmp(const string& s1, const string& s2);
extern int stringlowercmp(const string& alreadylower, const string& s2);
extern int stringuppercmp(const string& alreadyupper, const string& s2);
+extern bool maketmpdir(string& tdir);
+
#endif /* _SMALLUT_H_INCLUDED_ */
diff --git a/src/utils/wipedir.cpp b/src/utils/wipedir.cpp
new file mode 100644
index 00000000..e9f46bd9
--- /dev/null
+++ b/src/utils/wipedir.cpp
@@ -0,0 +1,97 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: wipedir.cpp,v 1.1 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes";
+#endif
+
+#ifndef TEST_WIPEDIR
+#include
+#include
+#include
+
+#include
+using namespace std;
+
+#include "debuglog.h"
+#include "pathut.h"
+#include "wipedir.h"
+
+/**
+ * Remove every non-directory entry found directly inside dir (no recursion).
+ * Symbolic links are removed as entries, never followed.
+ * @param dir path of the directory to empty.
+ * @return the number of subdirectories left in place, or -1 on error
+ *         (the error is logged; earlier entries may already be removed).
+ */
+int wipedir(const string& dir)
+{
+    struct stat st;
+    if (stat(dir.c_str(), &st) == -1) {
+        LOGERR(("wipedir: cant stat %s, errno %d\n", dir.c_str(), errno));
+        return -1;
+    }
+    if (!S_ISDIR(st.st_mode)) {
+        LOGERR(("wipedir: %s not a directory\n", dir.c_str()));
+        return -1;
+    }
+    if (access(dir.c_str(), R_OK|W_OK|X_OK) < 0) {
+        LOGERR(("wipedir: no write access to %s\n", dir.c_str()));
+        return -1;
+    }
+
+    DIR *d = opendir(dir.c_str());
+    if (d == 0) {
+        LOGERR(("wipedir: cant opendir %s, errno %d\n", dir.c_str(), errno));
+        return -1;
+    }
+
+    int remaining = 0;
+    bool failed = false;
+    struct dirent *ent;
+    while (!failed && (ent = readdir(d)) != 0) {
+        if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
+            continue;
+
+        string fn = dir;
+        path_cat(fn, ent->d_name);
+
+        // lstat(), not stat(): a symlink must be removed as a plain entry.
+        // Following it would keep links to directories (so a later rmdir of
+        // dir fails) and would abort the whole wipe on a dangling link.
+        struct stat est;
+        if (lstat(fn.c_str(), &est) == -1) {
+            LOGERR(("wipedir: cant stat %s, errno %d\n", fn.c_str(), errno));
+            failed = true;
+        } else if (S_ISDIR(est.st_mode)) {
+            remaining++;
+        } else if (unlink(fn.c_str()) < 0) {
+            LOGERR(("wipedir: cant unlink %s, errno %d\n",
+                    fn.c_str(), errno));
+            failed = true;
+        }
+    }
+
+    closedir(d);
+    return failed ? -1 : remaining;
+}
+
+
+#else // TEST_WIPEDIR
+
+#include
+
+#include "wipedir.h"
+
+using namespace std;
+
+// Test driver: wipe the directory given as sole argument, print the result.
+int main(int argc, const char **argv)
+{
+    if (argc != 2) {
+        fprintf(stderr, "Usage: wipedir DIR\n");
+        exit(1);
+    }
+    const int cnt = wipedir(argv[1]);
+    printf("wipedir returned %d\n", cnt);
+    exit(cnt < 0 ? 1 : 0); // -1 means wipedir failed: report it to the shell
+}
+
+#endif
diff --git a/src/utils/wipedir.h b/src/utils/wipedir.h
new file mode 100644
index 00000000..1434baf5
--- /dev/null
+++ b/src/utils/wipedir.h
@@ -0,0 +1,13 @@
+#ifndef _WIPEDIR_H_INCLUDED_
+#define _WIPEDIR_H_INCLUDED_
+/* @(#$Id: wipedir.h,v 1.1 2005-02-09 12:07:30 dockes Exp $ (C) 2004 J.F.Dockes */
+
+#include <string>
+
+/**
+ * Remove all files inside a directory (not recursive: subdirectories are kept).
+ * @return 0 if ok, count of remaining entries (ie: subdirs), or -1 for error
+ */
+int wipedir(const std::string& dirname);
+
+#endif /* _WIPEDIR_H_INCLUDED_ */