Get uncompression to work and fix a few other issues

This commit is contained in:
Jean-Francois Dockes 2015-10-13 16:48:16 +02:00
parent a02a611694
commit 8324f09d19
8 changed files with 65 additions and 28 deletions

View File

@ -1369,7 +1369,21 @@ bool RclConfig::getUncompressor(const string &mtype, vector<string>& cmd) const
if (stringlowercmp("uncompress", *it++))
return false;
cmd.clear();
cmd.push_back(findFilter(*it++));
cmd.push_back(findFilter(*it));
// Special-case python and perl on Windows: we also need to locate the
// first argument, which is the script name (as in "python somescript.py").
// On Unix, thanks to #!, we usually just run "somescript.py", but we would
// need the same change if we ever wanted to use the same cmdline as on Windows.
if (!stringlowercmp("python", *it) || !stringlowercmp("perl", *it)) {
it++;
if (tokens.size() < 3) {
LOGERR(("getUncpressor: python/perl cmd: no script?. [%s]\n", mtype.c_str()));
} else {
*it = findFilter(*it);
}
}
cmd.insert(cmd.end(), it, tokens.end());
return true;
}

View File

@ -70,6 +70,7 @@ class PDFExtractor:
self.pdftk = ""
self.em = em
self.attextractdone = False
self.attachlist = []
# Extract all attachments if any into temporary directory
def extractAttach(self):
@ -79,7 +80,8 @@ class PDFExtractor:
global tmpdir
if not tmpdir or not self.pdftk:
return False
# no big deal
return True
try:
vacuumdir(tmpdir)
@ -89,7 +91,8 @@ class PDFExtractor:
return True
except Exception, e:
self.em.rclog("extractAttach: failed: %s" % e)
return False
# Return true anyway, pdf attachments are no big deal
return True
def extractone(self, ipath):
#self.em.rclog("extractone: [%s]" % ipath)
@ -186,6 +189,8 @@ class PDFExtractor:
if not self.pdftotext:
self.pdftotext = rclexecm.which("pdftotext")
if not self.pdftotext:
self.pdftotext = rclexecm.which("poppler/pdftotext")
if not self.pdftotext:
print("RECFILTERROR HELPERNOTFOUND pdftotext")
sys.exit(1);
@ -209,7 +214,8 @@ class PDFExtractor:
# eofnext error instead of waiting for actual eof,
# which avoids a bug in recollindex up to 1.20
self.extractAttach()
else:
self.attextractdone = True
return True
def getipath(self, params):
@ -218,8 +224,8 @@ class PDFExtractor:
return (ok, data, ipath, eof)
def getnext(self, params):
# self.em.rclog("getnext: current %d" % self.currentindex)
if self.currentindex == -1:
#self.em.rclog("getnext: current -1")
self.currentindex = 0
return self._selfdoc()
else:

View File

@ -9,30 +9,44 @@ import platform
import subprocess
import glob
sysplat = platform.system()
ftrace = sys.stderr
#ftrace = open("C:/Users/Bill/log-uncomp.txt", "w")
sysplat = platform.system()
if sysplat != "Windows":
print("rcluncomp.py: only for Windows", file = sys.stderr)
print("rcluncomp.py: only for Windows", file = ftrace)
sys.exit(1)
try:
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
except Exception as err:
print("setmode binary failed: %s" % str(err), file = ftrace)
sevenz = rclexecm.which("7z")
if not sevenz:
print("rcluncomp.py: can't find 7z exe. Maybe set recollhelperpath " \
"in recoll.conf ?", file=sys.stderr)
sys.exit(1)
#print("rcluncomp.py: 7z is %s" % sevenz, file = sys.stderr)
"in recoll.conf ?", file=ftrace)
sys.exit(2)
# Params: uncompression program, input file name, temp directory.
# We ignore the uncomp program, and always use 7z on Windows
infile = sys.argv[2]
outdir = sys.argv[3]
# print("rcluncomp.py infile [%s], outdir [%s]" % (infile, outdir), file = ftrace)
# There is apparently no way to suppress 7z output. Hopefully the
# possible deadlock described by the subprocess module doc can't occur
# here because there is little data printed. AFAIK nothing goes to stderr anyway
subprocess.check_output([sevenz, "e", "-bd", "-y", "-o" + outdir, infile],
stderr = subprocess.PIPE)
try:
cmd = [sevenz, "e", "-bd", "-y", "-o" + outdir, infile]
subprocess.check_output(cmd, stderr = subprocess.PIPE)
outputname = glob.glob(os.path.join(outdir, "*"))
# There should be only one file in there..
print(outputname[0])
except Exception as err:
print("%s" % (str(err),), file = ftrace)
sys.exit(4)
outputname = glob.glob(os.path.join(outdir, "*"))
# There should be only one file in there..
print(outputname[0])
sys.exit(0)

View File

@ -145,6 +145,7 @@ string path_wingettempfilename(TCHAR *pref)
// Windows will have created a temp file, we delete it.
string filename = path_tchartoutf8(buf);
unlink(filename.c_str());
path_slashize(filename);
return filename;
}
#endif

View File

@ -149,8 +149,8 @@ text/x-python = exec python rclpython
text/x-shellscript = internal text/plain
text/x-srt = internal text/plain
application/xml = execm python rclxml
text/xml = execm python rclxml
application/xml = execm python rclxml.py
text/xml = execm python rclxml.py
# Using these instead of the two above would index all parameter and tag
# names, attribute values etc, instead of just the text content.
#application/xml = internal text/plain

View File

@ -62,9 +62,9 @@ RCLS=$RCLW/build-rclstartw-${QTA}-${qtsdir}/${qtsdir}/rclstartw.exe
# Needed for a VS build (which we did not ever complete because of
# missing Qt VS2015 support). Needed for unrtf
CONFIGURATION=Release
PLATFORM=Win32
# missing Qt VS2015 support).
#CONFIGURATION=Release
#PLATFORM=Win32
################
# Script:
@ -82,12 +82,14 @@ copyqt()
{
cd $DESTDIR
$QTBIN/windeployqt recoll.exe
chkcp $QTBIN/libwinpthread-1.dll $DESTDIR
}
copyxapian()
{
chkcp $LIBXAPIAN $DESTDIR
}
copyzlib()
{
chkcp $ZLIB/zlib1.dll $DESTDIR
@ -133,7 +135,8 @@ copyantiword()
copyunrtf()
{
bindir=$UNRTF/Windows/$PLATFORM/$CONFIGURATION
# bindir=$UNRTF/Windows/$PLATFORM/$CONFIGURATION
bindir=$UNRTF/Windows/
test -d $FILTERS/Share || mkdir -p $FILTERS/Share || exit 1
chkcp $bindir/unrtf.exe $FILTERS
@ -158,8 +161,7 @@ copyepub()
copypyexiv2()
{
cp -rp $PYEXIV2/pyexiv2 $FILTERS
# Check
chkcp $PYEXIV2/pyexiv2/exif.py $FILTERS/pyexiv2
chkcp $PYEXIV2/libexiv2python.pyd $FILTERS/
}
copyxslt()
@ -170,10 +172,12 @@ copyxslt()
copypoppler()
{
test -d $FILTERS/poppler || mkdir $FILTERS/poppler || \
fatal cant create poppler dir
for f in pdftotext.exe libpoppler.dll freetype6.dll jpeg62.dll \
libpng16-16.dll zlib1.dll libtiff3.dll \
libgcc_s_dw2-1.dll libstdc++-6.dll; do
chkcp $POPPLER/bin/$f $FILTERS/
chkcp $POPPLER/bin/$f $FILTERS/poppler
done
}

View File

@ -32,6 +32,5 @@ windows {
C:/recoll/src/windows/build-librecoll-Desktop_Qt_5_5_0_MinGW_32bit-Debug/debug/librecoll.dll \
-lshlwapi -lpsapi -lkernel32
INCLUDEPATH += ../../windows \
C:/temp/xapian-core-1.2.8/include
INCLUDEPATH += ../../windows
}

View File

@ -30,6 +30,5 @@ windows {
C:/recoll/src/windows/build-librecoll-Desktop_Qt_5_5_0_MinGW_32bit-Debug/debug/librecoll.dll \
-lshlwapi -lpsapi -lkernel32
INCLUDEPATH += ../../windows \
C:/recolldeps/xapian/xapian-core-1.2.8/include
INCLUDEPATH += ../../windows
}