rclpst: indexing / searching mostly working, with possible issues in data
charset conversions (to be checked).

Preview does not work: the ipath needs conversion inside pffexport.
This commit is contained in:
Jean-Francois Dockes 2019-05-28 18:39:37 +02:00
parent 0101e6e160
commit f0944ae0b2
3 changed files with 60 additions and 20 deletions

View File

@ -25,21 +25,31 @@
import sys import sys
import os import os
import posixpath
import pathlib import pathlib
import tempfile
import shutil
import getopt
import traceback
import email.parser import email.parser
import email.policy import email.policy
import mailbox import mailbox
import subprocess import subprocess
import rclexecm import rclexecm
import rclconfig import rclconfig
import conftree import conftree
# True when sys.platform reports "win32" or "msys".
_mswindows = sys.platform in ("win32", "msys")

# Bind the path helpers (met_basename, met_dirname, met_splitext, met_join)
# and the separator converter once, at import time, according to the
# platform. On Windows the ntpath functions are bound explicitly rather than
# going through os.path.
# NOTE(review): presumably because os.path is not the Windows flavour under
# an msys Python -- confirm.
if _mswindows:
    import ntpath
    met_basename = ntpath.basename
    met_dirname = ntpath.dirname
    met_splitext = ntpath.splitext
    met_join = ntpath.join

    def _backslashize(s):
        """Return s with every forward slash replaced by a backslash."""
        return s.replace("/", "\\")
else:
    met_basename = os.path.basename
    met_dirname = os.path.dirname
    met_splitext = os.path.splitext
    met_join = os.path.join

    def _backslashize(s):
        """Return s unchanged; no separator conversion is done here."""
        return s
# The pffexport stream yields the email in several pieces, with some # The pffexport stream yields the email in several pieces, with some
# data missing (e.g. attachment MIME types). We rebuild a complete # data missing (e.g. attachment MIME types). We rebuild a complete
@ -103,7 +113,7 @@ class EmailBuilder(object):
for att in self.attachments: for att in self.attachments:
fn = att[1] fn = att[1]
ext = os.path.splitext(fn)[1] ext = met_splitext(fn)[1]
mime = self.mimemap.get(ext) mime = self.mimemap.get(ext)
if not mime: if not mime:
mime = 'application/octet-stream' mime = 'application/octet-stream'
@ -174,8 +184,9 @@ class PFFReader(object):
if name == 'filename': if name == 'filename':
#self.log("filename: %s" % paramstr) #self.log("filename: %s" % paramstr)
fullpath = paramstr fullpath = paramstr
basename = os.path.basename(fullpath) basename = met_basename(fullpath)
parentdir = os.path.basename(os.path.dirname(fullpath)) parentdir = met_basename(met_dirname(fullpath))
#self.log("basename [%s] parentdir [%s]" % (basename, parentdir))
elif name == 'data': elif name == 'data':
if parentdir == 'Attachments': if parentdir == 'Attachments':
#self.log("Attachment: %s" % basename) #self.log("Attachment: %s" % basename)
@ -186,18 +197,21 @@ class PFFReader(object):
if doc: if doc:
yield((doc, ipath)) yield((doc, ipath))
elif basename == 'InternetHeaders.txt': elif basename == 'InternetHeaders.txt':
#self.log("name: [%s] data: %s" % (name, paramstr)) #self.log("name: [%s] data: %s" % (name, paramstr[:20]))
# This part is the indispensable one. Record # This part is the indispensable one. Record
# the ipath at this point: # the ipath at this point:
p = pathlib.Path(fullpath) if _mswindows:
p = pathlib.PureWindowsPath(fullpath)
else:
p = pathlib.Path(fullpath)
# Strip the top dir (/nonexistent.export/) # Strip the top dir (/nonexistent.export/)
p = p.relative_to(*p.parts[:2]) p = p.relative_to(*p.parts[:2])
# We use the parent directory as ipath: all # We use the parent directory as ipath: all
# the message parts are in there # the message parts are in there
ipath = str(p.parents[0]) ipath = str(p.parents[0])
self.msg.setheaders(data) self.msg.setheaders(data)
elif os.path.splitext(basename)[0] == 'Message': elif met_splitext(basename)[0] == 'Message':
ext = os.path.splitext(basename)[1] ext = met_splitext(basename)[1]
if ext == '.txt': if ext == '.txt':
self.msg.setbody(data, 'text', 'plain') self.msg.setbody(data, 'text', 'plain')
elif ext == '.html': elif ext == '.html':
@ -224,13 +238,25 @@ class PstExtractor(object):
def __init__(self, em): def __init__(self, em):
self.generator = None self.generator = None
self.em = em self.em = em
self.target = "/nonexistent" if _mswindows:
self.cmd = ["pffexport", "-q", "-t", self.target, "-s"] self.target = "\\\\?\\c:\\nonexistent"
else:
self.target = "/nonexistent"
self.pffexport = rclexecm.which("pffexport")
if not self.pffexport:
self.pffexport = rclexecm.which("pffinstall/mingw32/bin/pffexport")
if not self.pffexport:
# No need for anything else. openfile() will return an
# error at once
return
self.cmd = [self.pffexport, "-q", "-t", self.target, "-s"]
def startCmd(self, filename, ipath=None): def startCmd(self, filename, ipath=None):
fullcmd = self.cmd + [rclexecm.subprocfile(filename)] fullcmd = self.cmd
if ipath: if ipath:
fullcmd += ["-p", ipath] fullcmd += ["-p", ipath]
fn = _backslashize(rclexecm.subprocfile(filename))
fullcmd += [fn,]
try: try:
self.proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE) self.proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE)
except subprocess.CalledProcessError as err: except subprocess.CalledProcessError as err:
@ -244,13 +270,16 @@ class PstExtractor(object):
###### File type handler api, used by rclexecm ----------> ###### File type handler api, used by rclexecm ---------->
def openfile(self, params): def openfile(self, params):
if not self.pffexport:
print("RECFILTERROR HELPERNOTFOUND pffexport")
sys.exit(1);
self.filename = params["filename:"] self.filename = params["filename:"]
self.em.rclog("openfile: %s" % self.filename) self.em.rclog("openfile: sys.platform [%s] [%s]" % (sys.platform,self.filename))
return True return True
def getipath(self, params): def getipath(self, params):
ipath = posixpath.join(self.target + ".export", ipath = met_join(self.target + ".export",
params["ipath:"].decode("UTF-8")) params["ipath:"].decode("UTF-8"))
self.em.rclog("getipath: [%s]" % ipath) self.em.rclog("getipath: [%s]" % ipath)
if not self.startCmd(self.filename, ipath=ipath): if not self.startCmd(self.filename, ipath=ipath):
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)

View File

@ -94,6 +94,8 @@ application/javascript = internal text/plain
# chose one. # chose one.
application/vnd.ms-office = execm python rcldoc.py application/vnd.ms-office = execm python rcldoc.py
application/vnd.ms-outlook = execm python rclpst.py
application/ogg = execm python rclaudio application/ogg = execm python rclaudio
application/x-awk = internal text/plain application/x-awk = internal text/plain

View File

@ -45,6 +45,7 @@ LIBWPD=${RCLDEPS}libwpd/libwpd-0.10.0/
LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/ LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/
CHM=${RCLDEPS}pychm CHM=${RCLDEPS}pychm
MISC=${RCLDEPS}misc MISC=${RCLDEPS}misc
LIBPFF=${RCLDEPS}pffinstall
# Where to copy the Qt Dlls from: # Where to copy the Qt Dlls from:
QTBIN=C:/Qt/Qt5.8.0/5.8/mingw53_32/bin QTBIN=C:/Qt/Qt5.8.0/5.8/mingw53_32/bin
@ -261,6 +262,13 @@ copychm()
cp -rp $CHM/chm $DEST || fatal "can't copy pychm" cp -rp $CHM/chm $DEST || fatal "can't copy pychm"
} }
# Copy the libpff install tree ($LIBPFF) into the filters directory, then
# run chkcp on the pffexport executable. Aborts through fatal if the
# recursive copy fails. Sets the global DEST as a side effect.
copypff()
{
    DEST=$FILTERS
    # Expansions are quoted so that paths containing spaces or glob
    # characters reach cp/chkcp as single arguments.
    cp -rp "$LIBPFF" "$DEST" || fatal "can't copy pffinstall"
    # NOTE(review): rclpst.py searches for pffinstall/mingw32/bin/pffexport,
    # while this destination is .../pffinstall/mingw32 (no bin) -- confirm
    # that this is the intended target.
    chkcp "$LIBPFF/mingw32/bin/pffexport.exe" "$DEST/pffinstall/mingw32"
}
for d in doc examples filters images translations; do for d in doc examples filters images translations; do
test -d $DESTDIR/Share/$d || mkdir -p $DESTDIR/Share/$d || \ test -d $DESTDIR/Share/$d || mkdir -p $DESTDIR/Share/$d || \
fatal mkdir $d failed fatal mkdir $d failed
@ -293,4 +301,5 @@ copyepub
#copypyexiv2 #copypyexiv2
copywpd copywpd
#copychm #copychm
copypff
copypython copypython