rclpst: indexing and searching mostly work, with possible issues in data
charset conversions (to be checked). Preview does not work yet: the ipath
needs conversion inside pffexport.
This commit is contained in:
Jean-Francois Dockes 2019-05-28 18:39:37 +02:00
parent 0101e6e160
commit f0944ae0b2
3 changed files with 60 additions and 20 deletions

View File

@ -25,21 +25,31 @@
import sys
import os
import posixpath
import pathlib
import tempfile
import shutil
import getopt
import traceback
import email.parser
import email.policy
import mailbox
import subprocess
import rclexecm
import rclconfig
import conftree
# Select the path helpers (met_*) used to take apart the file names
# found in the pffexport stream: ntpath when running on Windows or
# MSYS, os.path everywhere else. _backslashize() converts a file name
# to backslash separators on Windows and is a no-op elsewhere.
_mswindows = sys.platform in ("win32", "msys")
if _mswindows:
    import ntpath
    met_basename = ntpath.basename
    met_dirname = ntpath.dirname
    met_splitext = ntpath.splitext
    met_join = ntpath.join

    def _backslashize(s):
        """Return s with every forward slash replaced by a backslash."""
        return s.replace("/", "\\")
else:
    met_basename = os.path.basename
    met_dirname = os.path.dirname
    met_splitext = os.path.splitext
    met_join = os.path.join

    def _backslashize(s):
        """Return s unchanged (no separator conversion on this platform)."""
        return s
# The pffexport stream yields the email in several pieces, with some
# data missing (e.g. attachment MIME types). We rebuild a complete
@ -103,7 +113,7 @@ class EmailBuilder(object):
for att in self.attachments:
fn = att[1]
ext = os.path.splitext(fn)[1]
ext = met_splitext(fn)[1]
mime = self.mimemap.get(ext)
if not mime:
mime = 'application/octet-stream'
@ -174,8 +184,9 @@ class PFFReader(object):
if name == 'filename':
#self.log("filename: %s" % paramstr)
fullpath = paramstr
basename = os.path.basename(fullpath)
parentdir = os.path.basename(os.path.dirname(fullpath))
basename = met_basename(fullpath)
parentdir = met_basename(met_dirname(fullpath))
#self.log("basename [%s] parentdir [%s]" % (basename, parentdir))
elif name == 'data':
if parentdir == 'Attachments':
#self.log("Attachment: %s" % basename)
@ -186,18 +197,21 @@ class PFFReader(object):
if doc:
yield((doc, ipath))
elif basename == 'InternetHeaders.txt':
#self.log("name: [%s] data: %s" % (name, paramstr))
#self.log("name: [%s] data: %s" % (name, paramstr[:20]))
# This part is the indispensable one. Record
# the ipath at this point:
p = pathlib.Path(fullpath)
# the ipath at this point:
if _mswindows:
p = pathlib.PureWindowsPath(fullpath)
else:
p = pathlib.Path(fullpath)
# Strip the top dir (/nonexistent.export/)
p = p.relative_to(*p.parts[:2])
# We use the parent directory as ipath: all
# the message parts are in there
ipath = str(p.parents[0])
self.msg.setheaders(data)
elif os.path.splitext(basename)[0] == 'Message':
ext = os.path.splitext(basename)[1]
elif met_splitext(basename)[0] == 'Message':
ext = met_splitext(basename)[1]
if ext == '.txt':
self.msg.setbody(data, 'text', 'plain')
elif ext == '.html':
@ -224,13 +238,25 @@ class PstExtractor(object):
def __init__(self, em):
    """Initialize the extractor and locate the pffexport helper.

    em is the execm object; it is stored as self.em and used by the
    other methods for logging.

    On return, self.pffexport holds what rclexecm.which() found for
    the helper (a false value if it was not found, which openfile()
    checks), and self.cmd holds the base pffexport command line, or
    None when the helper is missing.
    """
    self.generator = None
    self.em = em
    # Defined up front so that the attribute exists even when we
    # return early because the helper could not be found.
    self.cmd = None
    # Export target name handed to pffexport with -t. The paths seen
    # in the output start with this name plus ".export".
    if _mswindows:
        self.target = "\\\\?\\c:\\nonexistent"
    else:
        self.target = "/nonexistent"
    # Look for the command under its plain name first, then under the
    # relative pffinstall location.
    self.pffexport = (rclexecm.which("pffexport") or
                      rclexecm.which("pffinstall/mingw32/bin/pffexport"))
    if not self.pffexport:
        # No need for anything else. openfile() will return an
        # error at once
        return
    self.cmd = [self.pffexport, "-q", "-t", self.target, "-s"]
def startCmd(self, filename, ipath=None):
fullcmd = self.cmd + [rclexecm.subprocfile(filename)]
fullcmd = self.cmd
if ipath:
fullcmd += ["-p", ipath]
fn = _backslashize(rclexecm.subprocfile(filename))
fullcmd += [fn,]
try:
self.proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE)
except subprocess.CalledProcessError as err:
@ -244,13 +270,16 @@ class PstExtractor(object):
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
    """Record the file to process (file type handler API).

    params is the request dictionary; the file name is read from
    its "filename:" entry and stored in self.filename.

    Returns True. If the pffexport helper was not found at
    construction time, prints a RECFILTERROR HELPERNOTFOUND line and
    exits the process with status 1 instead.
    """
    if not self.pffexport:
        print("RECFILTERROR HELPERNOTFOUND pffexport")
        sys.exit(1)
    self.filename = params["filename:"]
    # Single log line: the platform is included to help diagnose path
    # convention problems.
    self.em.rclog("openfile: sys.platform [%s] [%s]" %
                  (sys.platform, self.filename))
    return True
def getipath(self, params):
ipath = posixpath.join(self.target + ".export",
params["ipath:"].decode("UTF-8"))
ipath = met_join(self.target + ".export",
params["ipath:"].decode("UTF-8"))
self.em.rclog("getipath: [%s]" % ipath)
if not self.startCmd(self.filename, ipath=ipath):
return (False, "", "", rclexecm.RclExecM.eofnow)

View File

@ -94,6 +94,8 @@ application/javascript = internal text/plain
# chose one.
application/vnd.ms-office = execm python rcldoc.py
application/vnd.ms-outlook = execm python rclpst.py
application/ogg = execm python rclaudio
application/x-awk = internal text/plain

View File

@ -45,6 +45,7 @@ LIBWPD=${RCLDEPS}libwpd/libwpd-0.10.0/
LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/
CHM=${RCLDEPS}pychm
MISC=${RCLDEPS}misc
LIBPFF=${RCLDEPS}pffinstall
# Where to copy the Qt Dlls from:
QTBIN=C:/Qt/Qt5.8.0/5.8/mingw53_32/bin
@ -261,6 +262,13 @@ copychm()
cp -rp $CHM/chm $DEST || fatal "can't copy pychm"
}
# Copy the pffinstall tree ($LIBPFF) into the filters directory, then
# run chkcp on pffexport.exe with $DEST/pffinstall/mingw32 as destination.
# Expansions are quoted so that paths containing spaces or glob
# characters are passed as single arguments.
# NOTE(review): chkcp and fatal are defined elsewhere in this script;
# chkcp presumably copies and aborts on failure -- confirm.
copypff()
{
    DEST=$FILTERS
    cp -rp "$LIBPFF" "$DEST" || fatal "can't copy pffinstall"
    chkcp "$LIBPFF/mingw32/bin/pffexport.exe" "$DEST/pffinstall/mingw32"
}
for d in doc examples filters images translations; do
test -d $DESTDIR/Share/$d || mkdir -p $DESTDIR/Share/$d || \
fatal mkdir $d failed
@ -293,4 +301,5 @@ copyepub
#copypyexiv2
copywpd
#copychm
copypff
copypython