Python filters beginning to work, still issues.

--HG--
branch : WINDOWSPORT
This commit is contained in:
Jean-Francois Dockes 2015-09-11 16:16:16 +02:00
parent 06f43c573e
commit 330c7fc30d
6 changed files with 175 additions and 13 deletions

View File

@ -117,23 +117,33 @@ class WordFilter:
identification. Do 2 tries at most'''
if self.ntry == 0:
self.ntry = 1
return (["antiword", "-t", "-i", "1", "-m", "UTF-8"],
WordProcessData(self.em))
cmd = rclexecm.which("antiword")
if cmd:
return ([cmd, "-t", "-i", "1", "-m", "UTF-8"],
WordProcessData(self.em))
else:
return ([],None)
elif self.ntry == 1:
self.ntry = 2
# antiword failed. Check for an rtf file, or text and
# process accordingly. If the doc is actually msword, try
# wvWare.
mt = self.mimetype(fn)
self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
if mt == "text/plain":
return ([os.path.join(self.execdir,"rcltext")],
return ([python, os.path.join(self.execdir, "rcltext")],
WordPassData(self.em))
elif mt == "text/rtf":
return ([os.path.join(self.execdir, "rclrtf")],
WordPassData(self.em))
cmd = [python, os.path.join(self.execdir, "rclrtf.py")]
self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
return (cmd, WordPassData(self.em))
elif mt == "application/msword":
return (["wvWare", "--nographics", "--charset=utf-8"],
WordPassData(self.em))
cmd = rclexecm.which("wvWare")
if cmd:
return ([cmd, "--nographics", "--charset=utf-8"],
WordPassData(self.em))
else:
return ([],None)
else:
return ([],None)
else:

View File

@ -49,6 +49,9 @@ class RclExecM:
else:
self.maxmembersize = 50 * 1024
self.maxmembersize = self.maxmembersize * 1024
if sys.platform == "win32":
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
def rclog(self, s, doexit = 0, exitvalue = 1):
print >> sys.stderr, "RCLMFILT:", self.myname, ":", s
@ -216,14 +219,15 @@ class Executor:
We expect cmd as a list of command name + arguments'''
try:
proc = subprocess.Popen(cmd + [filename],
fullcmd = cmd + [filename]
proc = subprocess.Popen(fullcmd,
stdout = subprocess.PIPE)
stdout = proc.stdout
except subprocess.CalledProcessError as err:
self.em.rclog("extractone: Popen() error: %s" % err)
self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
return (False, "")
except OSError as err:
self.em.rclog("extractone: Popen OS error: %s" % err)
self.em.rclog("extractone: Popen(%s) OS error: %s" % (fullcmd, err))
return (False, "")
for line in stdout:
@ -231,6 +235,7 @@ class Executor:
proc.wait()
if proc.returncode:
self.em.rclog("extractone: [%s] returncode %d" % (returncode))
return False, postproc.wrapData()
else:
return True, postproc.wrapData()
@ -283,12 +288,17 @@ def which(program):
for ext in os.environ.get("PATHEXT", "").split(os.pathsep):
yield fpath + ext
def path_candidates():
yield os.path.dirname(sys.argv[0])
for path in os.environ["PATH"].split(os.pathsep):
yield path
fpath, fname = os.path.split(program)
if fpath:
if is_exe(program):
return program
else:
for path in os.environ["PATH"].split(os.pathsep):
for path in path_candidates():
exe_file = os.path.join(path, program)
for candidate in ext_candidates(exe_file):
if is_exe(candidate):

52
src/filters/rclrtf.py Normal file
View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
import rclexecm
import re
import sys
import os
# Processing the output from unrtf
class RTFProcessData:
    """Post-process the HTML output of unrtf, one line at a time.

    Some versions of unrtf put out a garbled charset line. To work around
    this, every line starting with ``<meta http-equiv=`` seen before
    ``</head>`` is dropped, and a clean UTF-8 Content-Type declaration is
    inserted just ahead of the ``</head>`` line. Apart from this, the
    data is passed through untouched.
    """

    def __init__(self, em):
        # em is only stored here; this class never calls into it.
        self.em = em
        # Text accumulated so far.
        self.out = ""
        # Becomes 1 once the line holding </head> has been seen.
        self.gothead = 0
        self.patendhead = re.compile('''</head>''')
        self.patcharset = re.compile('''^<meta http-equiv=''')

    def takeLine(self, line):
        """Accumulate one output line; a newline is appended to it here."""
        if self.gothead:
            # Past the header: plain pass-through.
            self.out += line + "\n"
            return

        if self.patendhead.search(line):
            # The space before 'content=' matters: without it the two
            # attributes run together and the tag is malformed.
            self.out += '<meta http-equiv="Content-Type" ' + \
                        'content="text/html;charset=UTF-8">' + "\n"
            self.out += line + "\n"
            self.gothead = 1
        elif not self.patcharset.search(line):
            # Header line which is not a (possibly garbled) charset line.
            self.out += line + "\n"

    def wrapData(self):
        """Return all the text accumulated by takeLine()."""
        return self.out
class RTFFilter:
    """Filter object supplying the unrtf command line and its output
    post-processor."""

    def __init__(self, em):
        self.em = em

    def reset(self):
        """Nothing to reset: this filter keeps no per-document state."""
        return None

    def getCmd(self, fn):
        """Return a (command, postprocessor) pair for converting a file.

        The command is the unrtf executable located by rclexecm.which()
        followed by its options; the post-processor is a fresh
        RTFProcessData. When unrtf cannot be found, ([], None) is
        returned. The fn argument is not used here.
        """
        unrtf = rclexecm.which("unrtf")
        if not unrtf:
            return ([], None)
        return ([unrtf, "--nopict", "--html"], RTFProcessData(self.em))
if __name__ == '__main__':
    # Build the rclexecm protocol object, attach the RTF filter to an
    # executor, and hand control over to the rclexecm main loop.
    em = rclexecm.RclExecM()
    rtf_filter = RTFFilter(em)
    rclexecm.main(em, rclexecm.Executor(em, rtf_filter))

View File

@ -90,6 +90,9 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
LOGERR(("MHExecMultiple: getline error\n"));
return false;
}
LOGDEB1(("MHEM:rde: line [%s]\n", ibuf.c_str()));
// Empty line (end of message) ?
if (!ibuf.compare("\n")) {
LOGDEB(("MHExecMultiple: Got empty line\n"));
@ -163,7 +166,7 @@ bool MimeHandlerExecMultiple::next_document()
return false;
}
if (m_cmd.getChildPid() < 0 && !startCmd()) {
if (m_cmd.getChildPid() <= 0 && !startCmd()) {
return false;
}

View File

@ -658,6 +658,7 @@ int ExecCmd::startExec(const string &cmd, const vector<string>& args,
// Create the child process.
// Need a writable buffer for the command line, for some reason.
LOGDEB1(("ExecCmd:startExec: cmdline [%s]\n", cmdline.c_str()));
LPSTR buf = (LPSTR)malloc(cmdline.size() + 1);
memcpy(buf, cmdline.c_str(), cmdline.size());
buf[cmdline.size()] = 0;
@ -818,7 +819,7 @@ int ExecCmd::receive(string& data, int cnt)
break;
}
}
if (cnt == 0)
if ((cnt == 0 && totread > 0) || (cnt > 0 && totread == cnt))
break;
}
LOGDEB1(("ExecCmd::receive: returning %d bytes\n", totread));

86
src/windows/mkinstdir.sh Normal file
View File

@ -0,0 +1,86 @@
#!/bin/sh
# Script to make a prototype recoll install directory from locally compiled
# software. *** Needs cygwin ***
##############
# Local values (to be adjusted)
# Target directory where we copy things.
DESTDIR=/cygdrive/c/recollinst
# Recoll src/build tree
RECOLL=/cygdrive/c/recoll/src
# unrtf tree: unrtf.exe is taken from Windows/$PLATFORM/$CONFIGURATION,
# the *.conf files and SYMBOL.charmap from outputs/.
UNRTF=/cygdrive/c/unrtf
# antiword tree: antiword.exe is taken from
# Win32-only/$PLATFORM/$CONFIGURATION, the data files from Resources/.
ANTIWORD=/cygdrive/c/recolldeps/antiword
# Build configuration and platform: both are path components of the
# directories the binaries are copied from.
CONFIGURATION=Debug
PLATFORM=x64
################
# Script:
# Directory receiving the filters and the helper programs.
FILTERS=$DESTDIR/Share/filters
# Print the arguments as a single error message on stderr, then abort the
# script with status 1. The arguments are quoted so that the message is
# printed verbatim (no word splitting, no glob expansion).
fatal()
{
    echo "$*" >&2
    exit 1
}
# checkcopy: copy file $1 to destination $2. Aborts the script if $1 is
# not an existing regular file or if the copy fails. Arguments are quoted
# so that paths containing spaces work.
cc()
{
    test -f "$1" || fatal "$1" does not exist
    cp "$1" "$2" || exit 1
}
# Copy the recoll programs, the pthread DLL, the sample configuration
# files and all the filters into the install tree. Any missing file or
# failed copy aborts the script.
copyrecoll()
{
    bindir=$RECOLL/windows/$PLATFORM/$CONFIGURATION/

    # Programs and runtime DLL go to the top of the install tree.
    for f in recollindex.exe recollq.exe pthreadVC2.dll; do
        cc "$bindir/$f" "$DESTDIR"
    done

    # Sample configuration files.
    for f in fields fragbuts.xml mimeconf mimemap mimeview \
             recoll.conf recoll.qss; do
        cc "$RECOLL/sampleconf/$f" "$DESTDIR/Share/examples"
    done

    # Only the directory parts are quoted: the glob must still expand.
    cp "$RECOLL"/filters/* "$FILTERS" || exit 1
}
# Copy antiword.exe and the antiword resource files into the filters
# directory. Any missing file or failed copy aborts the script.
copyantiword()
{
    bindir=$ANTIWORD/Win32-only/$PLATFORM/$CONFIGURATION

    # The variable is FILTERS: the previous '$Filters' spelling was
    # undefined, so the test was looking at /Resources.
    test -d "$FILTERS/Resources" || mkdir -p "$FILTERS/Resources" || exit 1
    cc "$bindir/antiword.exe" "$FILTERS"
    # Only the directory part is quoted: the glob must still expand.
    cp "$ANTIWORD"/Resources/* "$FILTERS/Resources" || exit 1
}
# Copy unrtf.exe into the filters directory, and the unrtf output
# configuration files plus SYMBOL.charmap into its Share subdirectory.
# Any missing file or failed copy aborts the script.
copyunrtf()
{
    bindir=$UNRTF/Windows/$PLATFORM/$CONFIGURATION

    cc "$bindir/unrtf.exe" "$FILTERS"
    test -d "$FILTERS/Share" || mkdir -p "$FILTERS/Share" || exit 1
    # Only the directory part is quoted: the glob must still expand.
    cp "$UNRTF"/outputs/*.conf "$FILTERS/Share" || exit 1
    cc "$UNRTF/outputs/SYMBOL.charmap" "$FILTERS/Share"
}
# Create the install tree if needed, then populate it. Paths are quoted so
# that directories containing spaces work.
test -d "$DESTDIR" || mkdir -p "$DESTDIR" || exit 1
test -d "$DESTDIR/Share/examples" || mkdir -p "$DESTDIR/Share/examples" || exit 1
test -d "$FILTERS" || mkdir -p "$FILTERS" || exit 1
copyrecoll
copyunrtf
copyantiword