ckpt: pst: basic indexing of email. no getipath/preview

This commit is contained in:
Jean-Francois Dockes 2019-05-26 12:30:59 +02:00
parent c7c413d9e7
commit cc4f4e0c74
4 changed files with 235 additions and 179 deletions

View File

@ -1,26 +1,27 @@
#!/usr/bin/python3 #!/usr/bin/python3
################################# #################################
# Copyright (C) 2019 J.F.Dockes # Copyright (C) 2019 J.F.Dockes
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or # the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version. # (at your option) any later version.
# #
# This program is distributed in the hope that it will be useful, # This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details. # GNU General Public License for more details.
# #
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program; if not, write to the # along with this program; if not, write to the
# Free Software Foundation, Inc., # Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
######################################################## ########################################################
# #
# Process the stream produced by a modified pffexport: # Process the stream produced by a modified pffexport:
# https://github.com/libyal/libpff # https://github.com/libyal/libpff
# The tool has been modified to produce a data stream instead of a file tree # The modification allows producing a data stream instead of a file tree
#
import sys import sys
import os import os
@ -31,189 +32,238 @@ import traceback
import email.parser import email.parser
import email.policy import email.policy
import mailbox import mailbox
import subprocess
import rclexecm
import rclconfig import rclconfig
import conftree import conftree
def _deb(s):
print("%s"%s, file=sys.stderr)
# The pffexport stream yields the email in several pieces, with some # The pffexport stream yields the email in several pieces, with some
# data missing (e.g. attachment MIME types). We rebuild a complete # data missing (e.g. attachment MIME types). We rebuild a complete
# message for parsing by the Recoll email handler # message for parsing by the Recoll email handler
class EmailBuilder(object): class EmailBuilder(object):
def __init__(self): def __init__(self, logger, mimemap):
self.reset() self.log = logger
self.parser = email.parser.Parser(policy = email.policy.default) self.reset()
def reset(self): self.mimemap = mimemap
self.headers = '' self.parser = email.parser.Parser(policy = email.policy.default)
self.body = '' def reset(self):
self.bodymime = '' self.headers = ''
self.attachments = [] self.body = ''
def setheaders(self, h): self.bodymime = ''
self.headers = h self.attachments = []
def setbody(self, body, main, sub): def setheaders(self, h):
self.body = body self.headers = h
self.bodymimemain = main def setbody(self, body, main, sub):
self.bodymimesub = sub self.body = body
def addattachment(self, att, filename): self.bodymimemain = main
_deb("Adding attachment") self.bodymimesub = sub
self.attachments.append((att, filename)) def addattachment(self, att, filename):
def flush(self): self.log("Adding attachment")
if not self.headers: self.attachments.append((att, filename))
_deb("Not flushing because no headers") def flush(self):
if self.headers and (self.body or self.attachments): if not (self.headers and (self.body or self.attachments)):
newmsg = email.message.EmailMessage(policy = self.log("Not flushing because no headers or no body/attach")
email.policy.default) return None
newmsg = email.message.EmailMessage(policy=email.policy.default)
headerstr = self.headers.decode('utf-8') headerstr = self.headers.decode('utf-8')
# print("%s" % headerstr) # print("%s" % headerstr)
headers = self.parser.parsestr(headerstr, headersonly=True) headers = self.parser.parsestr(headerstr, headersonly=True)
_deb("EmailBuilder: content-type %s" % headers['content-type']) #self.log("EmailBuilder: content-type %s" % headers['content-type'])
for nm in ('from', 'subject'): for nm in ('from', 'subject'):
if nm in headers: if nm in headers:
newmsg.add_header(nm, headers[nm]) newmsg.add_header(nm, headers[nm])
tolist = headers.get_all('to') tolist = headers.get_all('to')
alldests = "" alldests = ""
for toheader in tolist: for toheader in tolist:
for dest in toheader.addresses: for dest in toheader.addresses:
sd = str(dest).replace('\n', '').replace('\r','') sd = str(dest).replace('\n', '').replace('\r','')
_deb("EmailBuilder: dest %s" % sd) #self.log("EmailBuilder: dest %s" % sd)
alldests += sd + ", " alldests += sd + ", "
alldests = alldests.rstrip(", ") alldests = alldests.rstrip(", ")
newmsg.add_header('to', alldests) newmsg.add_header('to', alldests)
# Also: CC # Also: CC
if self.body: if self.body:
newmsg.set_content(self.body, maintype = self.bodymimemain, newmsg.set_content(self.body, maintype = self.bodymimemain,
subtype = self.bodymimesub) subtype = self.bodymimesub)
for att in self.attachments: for att in self.attachments:
#if self.body: fn = att[1]
# newmsg.make_mixed() ext = os.path.splitext(fn)[1]
ext = os.path.splitext(att[1])[1] mime = self.mimemap.get(ext)
_deb("Querying mimemap with %s" % ext) if not mime:
mime = mimemap.get(ext) mime = 'application/octet-stream'
if not mime: #self.log("Attachment: filename %s MIME %s" % (fn, mime))
mime = 'application/octet-stream' mt,st = mime.split('/')
_deb("Attachment: filename %s MIME %s" % (att[1], mime)) newmsg.add_attachment(att[0], maintype=mt, subtype=st,
mt,st = mime.split('/') filename=fn)
newmsg.add_attachment(att[0], maintype=mt, subtype=st,
filename=att[1])
newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000") #newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000")
print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80)) #print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80))
ret = newmsg.as_string(maxheaderlen=100)
self.reset() self.reset()
return ret
class PFFReader(object): class PFFReader(object):
def __init__(self, infile=sys.stdin): def __init__(self, logger, infile=sys.stdin):
try: self.log = logger
self.myname = os.path.basename(sys.argv[0]) config = rclconfig.RclConfig()
except: dir1 = os.path.join(config.getConfDir(), "examples")
self.myname = "???" dir2 = os.path.join(config.datadir, "examples")
self.mimemap = conftree.ConfStack('mimemap', [dir1, dir2])
self.infile = infile
self.fields = {}
self.msg = EmailBuilder(self.log, self.mimemap)
self.infile = infile # Read single parameter from process input: line with param name and size
self.fields = {} # followed by data. The param name is returned as str/unicode, the data
self.msg = EmailBuilder() # as bytes
def readparam(self):
    """Read one parameter from the input stream.

    A parameter is a header line made of two whitespace-separated
    fields (the name and a byte count), followed by that many bytes
    of data.

    Returns a (name, data) pair: the name as str with any trailing ':'
    removed, the data as bytes. ('', b'') is returned at end of input,
    on an empty line, and after an error was reported through self.log.
    """
    inf = self.infile
    line = inf.readline()
    if line == b'':
        # End of input
        return ('', b'')
    line = line.rstrip(b'\n')
    if line == b'':
        return ('', b'')
    fields = line.split()
    if len(fields) != 2:
        self.log(b'bad line: [' + line + b']', 1, 1)
        return ('', b'')
    try:
        paramname = fields[0].decode('ASCII').rstrip(':')
        paramsize = int(fields[1])
    except ValueError:
        # Covers a non-numeric size and a non-ASCII name
        # (UnicodeDecodeError derives from ValueError): report them like
        # any other malformed header instead of letting them propagate.
        self.log(b'bad line: [' + line + b']', 1, 1)
        return ('', b'')
    if paramsize <= 0:
        # No data follows the header
        return (paramname, b'')
    paramdata = inf.read(paramsize)
    if len(paramdata) != paramsize:
        # Short read: the stream ended before the announced size
        self.log("Bad read: wanted %d, got %d" %
                 (paramsize, len(paramdata)), 1, 1)
        return ('', b'')
    return (paramname, paramdata)
if sys.platform == "win32": def mainloop(self):
import msvcrt basename = ''
msvcrt.setmode(self.outfile.fileno(), os.O_BINARY) path = ''
msvcrt.setmode(self.infile.fileno(), os.O_BINARY) while 1:
self.debugfile = None name, data = self.readparam()
if self.debugfile: if name == "":
self.errfout = open(self.debugfile, "a") break
else: try:
self.errfout = sys.stderr paramstr = data.decode('utf-8')
except:
paramstr = ''
def log(self, s): if name == 'filename':
print("PFFReader: %s: %s" % (self.myname, s), file=self.errfout) self.log("filename: %s" % paramstr)
path = paramstr
# Read single parameter from process input: line with param name and size basename = os.path.basename(path)
# followed by data. The param name is returned as str/unicode, the data parentdir = os.path.basename(os.path.dirname(paramstr))
# as bytes elif name == 'data':
def readparam(self): if parentdir == 'Attachments':
inf = self.infile.buffer #self.log("Attachment: %s" % basename)
s = inf.readline() self.msg.addattachment(data, basename)
if s == b'': else:
return ('', b'') if basename == 'OutlookHeaders.txt':
s = s.rstrip(b'\n') doc = self.msg.flush()
if s == b'': if doc:
return ('', b'') yield((doc, path))
l = s.split() elif basename == 'ConversationIndex.txt':
if len(l) != 2: pass
self.log(b'bad line: [' + s + b']', 1, 1) elif basename == 'Recipients.txt':
return ('', b'') pass
paramname = l[0].decode('ASCII').rstrip(':') elif basename == 'InternetHeaders.txt':
paramsize = int(l[1]) #self.log("name: [%s] data: %s" % (name, paramstr))
if paramsize > 0: self.msg.setheaders(data)
paramdata = inf.read(paramsize) elif os.path.splitext(basename)[0] == 'Message':
if len(paramdata) != paramsize: ext = os.path.splitext(basename)[1]
self.log("Bad read: wanted %d, got %d" % if ext == '.txt':
(paramsize, len(paramdata)), 1, 1) self.msg.setbody(data, 'text', 'plain')
return('', b'') elif ext == '.html':
else: self.msg.setbody(data, 'text', 'html')
paramdata = b'' elif ext == '.rtf':
return (paramname, paramdata) self.msg.setbody(data, 'text', 'rtf')
else:
def mainloop(self): raise Exception("PST: Unknown body type %s"%ext)
basename = '' self.log("Message")
while 1: pass
name, data = self.readparam() basename = ''
if name == "": parentdir = ''
break self.log("Out of loop")
try: doc = self.msg.flush()
paramstr = data.decode('utf-8') if doc:
except: yield((doc, path))
paramstr = '' return
if name == 'filename':
basename = os.path.basename(paramstr)
self.log("name: [%s] data: %s" %
(name, paramstr))
parentdir = os.path.basename(os.path.dirname(paramstr))
elif name == 'data':
if parentdir == 'Attachments':
#self.log("Attachment: %s" % basename)
self.msg.addattachment(data, basename)
else:
if basename == 'OutlookHeaders.txt':
self.msg.flush()
pass
if basename == 'ConversationIndex.txt':
pass
elif basename == 'Recipients.txt':
pass
elif basename == 'InternetHeaders.txt':
#self.log("name: [%s] data: %s" % (name, paramstr))
self.msg.setheaders(data)
elif os.path.splitext(basename)[0] == 'Message':
ext = os.path.splitext(basename)[1]
if ext == '.txt':
self.msg.setbody(data, 'text', 'plain')
elif ext == '.html':
self.msg.setbody(data, 'text', 'html')
elif ext == '.rtf':
self.msg.setbody(data, 'text', 'rtf')
else:
raise Exception("PST: Unknown body type %s"%ext)
self.log("Message")
pass
basename = ''
parentdir = ''
self.log("Out of loop")
self.msg.flush()
config = rclconfig.RclConfig() class PstExtractor(object):
dir1 = os.path.join(config.getConfDir(), "examples") def __init__(self, em):
dir2 = os.path.join(config.datadir, "examples") self.currentindex = 0
mimemap = conftree.ConfStack('mimemap', [dir1, dir2]) self.em = em
self.cmd = ["pffexport", "-q", "-t", "/nonexistent", "-s"]
proto = PFFReader() def startCmd(self, filename):
proto.mainloop() fullcmd = self.cmd + [rclexecm.subprocfile(filename)]
try:
self.proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE)
except subprocess.CalledProcessError as err:
self.em.rclog("Pst: Popen(%s) error: %s" % (fullcmd, err))
return False
except OSError as err:
self.em.rclog("Pst: Popen(%s) OS error: %s" % (fullcmd, err))
return (False, "")
self.filein = self.proc.stdout
return True
def extractone(self, ipath):
    """Return the result tuple for a single ipath.

    This version performs no extraction: it always answers with a
    False status, empty document data, the ipath converted by
    rclexecm.makebytes() and the eof flag set.
    """
    return (False, "", rclexecm.makebytes(ipath), True)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
    """Start the export command on a file and set up the document generator.

    params: dict, the file path is read from its "filename:" entry.
    Returns True if the command could be started, else False.
    """
    filename = params["filename:"]
    started = self.startCmd(filename)
    # startCmd reports an OSError as a (False, "") tuple, which is truthy.
    # Look at the status element so that this failure is not taken for a
    # success (self.filein would not be set in this case).
    if isinstance(started, tuple):
        started = started[0]
    if not started:
        return False
    reader = PFFReader(self.em.rclog, infile=self.filein)
    # mainloop() is a generator: documents are produced on demand
    self.generator = reader.mainloop()
    return True
def getipath(self, params):
    """Fetch the document designated by params["ipath:"].

    The ipath is first tried as received. If this fails, the ipath
    value returned by the first attempt is decoded as UTF-8 and the
    extraction is tried again. Any exception during the second attempt
    yields the (failed) values of the first one.
    """
    ok, data, ipath, eof = self.extractone(params["ipath:"])
    if not ok:
        # Not found. Maybe the path needs decoding?
        try:
            ipath = ipath.decode("utf-8")
            return self.extractone(ipath)
        except Exception:
            pass
    return (ok, data, ipath, eof)
def getnext(self, params):
    """Return the next document produced by the generator.

    Returns (True, doc, ipath, False) for each (doc, ipath) pair
    obtained from self.generator, after setting the MIME type to
    message/rfc822 on the protocol object. Once the generator is
    exhausted, returns (False, "", "", rclexecm.RclExecM.eofnow).
    """
    try:
        entry = next(self.generator)
        doc, ipath = entry
        self.em.setmimetype("message/rfc822")
    except StopIteration:
        return (False, "", "", rclexecm.RclExecM.eofnow)
    else:
        return (True, doc, ipath, False)
# Main program: create protocol handler and extractor and run them
# Protocol handler object, provided by the rclexecm module
proto = rclexecm.RclExecM()
# The extractor keeps a reference to the protocol object (as self.em) and
# uses it for logging (rclog) and for setting the MIME type of the documents
extract = PstExtractor(proto)
# Hand both objects over to rclexecm.main()
# NOTE(review): presumably this runs the request loop which calls
# openfile/getipath/getnext on the extractor -- confirm against rclexecm
rclexecm.main(proto, extract)

View File

@ -78,6 +78,8 @@ class RclConfig:
def getConfDir(self): def getConfDir(self):
return self.confdir return self.confdir
def getDataDir(self):
return self.datadir
def setKeyDir(self, dir): def setKeyDir(self, dir):
self.keydir = dir self.keydir = dir

View File

@ -78,6 +78,7 @@ application/pdf = execm rclpdf.py
application/postscript = exec rclps application/postscript = exec rclps
application/sql = internal text/plain application/sql = internal text/plain
application/vnd.ms-excel = execm rclxls.py application/vnd.ms-excel = execm rclxls.py
application/vnd.ms-outlook = execm rclpst.py
application/vnd.ms-powerpoint = execm rclppt.py application/vnd.ms-powerpoint = execm rclppt.py
application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl

View File

@ -61,6 +61,9 @@
# extracted message. Also used by Windows Live Mail # extracted message. Also used by Windows Live Mail
.eml = message/rfc822 .eml = message/rfc822
.pst = application/vnd.ms-outlook
.ost = application/vnd.ms-outlook
.pdf = application/pdf .pdf = application/pdf
.ps = application/postscript .ps = application/postscript