ckpt: pst: basic indexing of email. no getipath/preview
This commit is contained in:
parent
c7c413d9e7
commit
cc4f4e0c74
@ -1,26 +1,27 @@
|
||||
#!/usr/bin/python3
|
||||
#################################
|
||||
# Copyright (C) 2019 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
########################################################
|
||||
|
||||
#
|
||||
# Process the stream produced by a modified pffexport:
|
||||
# https://github.com/libyal/libpff
|
||||
# The tool has been modified to produce a data stream instead of a file tree
|
||||
# The modification allows producing a data stream instead of a file tree
|
||||
#
|
||||
|
||||
import sys
|
||||
import os
|
||||
@ -31,189 +32,238 @@ import traceback
|
||||
import email.parser
|
||||
import email.policy
|
||||
import mailbox
|
||||
import subprocess
|
||||
|
||||
import rclexecm
|
||||
import rclconfig
|
||||
import conftree
|
||||
|
||||
def _deb(s):
|
||||
print("%s"%s, file=sys.stderr)
|
||||
|
||||
|
||||
# The pffexport stream yields the email in several pieces, with some
|
||||
# data missing (e.g. attachment MIME types). We rebuild a complete
|
||||
# message for parsing by the Recoll email handler
|
||||
class EmailBuilder(object):
|
||||
def __init__(self):
|
||||
self.reset()
|
||||
self.parser = email.parser.Parser(policy = email.policy.default)
|
||||
def reset(self):
|
||||
self.headers = ''
|
||||
self.body = ''
|
||||
self.bodymime = ''
|
||||
self.attachments = []
|
||||
def setheaders(self, h):
|
||||
self.headers = h
|
||||
def setbody(self, body, main, sub):
|
||||
self.body = body
|
||||
self.bodymimemain = main
|
||||
self.bodymimesub = sub
|
||||
def addattachment(self, att, filename):
|
||||
_deb("Adding attachment")
|
||||
self.attachments.append((att, filename))
|
||||
def flush(self):
|
||||
if not self.headers:
|
||||
_deb("Not flushing because no headers")
|
||||
if self.headers and (self.body or self.attachments):
|
||||
newmsg = email.message.EmailMessage(policy =
|
||||
email.policy.default)
|
||||
|
||||
headerstr = self.headers.decode('utf-8')
|
||||
# print("%s" % headerstr)
|
||||
headers = self.parser.parsestr(headerstr, headersonly=True)
|
||||
_deb("EmailBuilder: content-type %s" % headers['content-type'])
|
||||
for nm in ('from', 'subject'):
|
||||
if nm in headers:
|
||||
newmsg.add_header(nm, headers[nm])
|
||||
def __init__(self, logger, mimemap):
|
||||
self.log = logger
|
||||
self.reset()
|
||||
self.mimemap = mimemap
|
||||
self.parser = email.parser.Parser(policy = email.policy.default)
|
||||
def reset(self):
|
||||
self.headers = ''
|
||||
self.body = ''
|
||||
self.bodymime = ''
|
||||
self.attachments = []
|
||||
def setheaders(self, h):
|
||||
self.headers = h
|
||||
def setbody(self, body, main, sub):
|
||||
self.body = body
|
||||
self.bodymimemain = main
|
||||
self.bodymimesub = sub
|
||||
def addattachment(self, att, filename):
|
||||
self.log("Adding attachment")
|
||||
self.attachments.append((att, filename))
|
||||
def flush(self):
|
||||
if not (self.headers and (self.body or self.attachments)):
|
||||
self.log("Not flushing because no headers or no body/attach")
|
||||
return None
|
||||
newmsg = email.message.EmailMessage(policy=email.policy.default)
|
||||
|
||||
headerstr = self.headers.decode('utf-8')
|
||||
# print("%s" % headerstr)
|
||||
headers = self.parser.parsestr(headerstr, headersonly=True)
|
||||
#self.log("EmailBuilder: content-type %s" % headers['content-type'])
|
||||
for nm in ('from', 'subject'):
|
||||
if nm in headers:
|
||||
newmsg.add_header(nm, headers[nm])
|
||||
|
||||
tolist = headers.get_all('to')
|
||||
alldests = ""
|
||||
for toheader in tolist:
|
||||
for dest in toheader.addresses:
|
||||
sd = str(dest).replace('\n', '').replace('\r','')
|
||||
_deb("EmailBuilder: dest %s" % sd)
|
||||
alldests += sd + ", "
|
||||
alldests = alldests.rstrip(", ")
|
||||
newmsg.add_header('to', alldests)
|
||||
tolist = headers.get_all('to')
|
||||
alldests = ""
|
||||
for toheader in tolist:
|
||||
for dest in toheader.addresses:
|
||||
sd = str(dest).replace('\n', '').replace('\r','')
|
||||
#self.log("EmailBuilder: dest %s" % sd)
|
||||
alldests += sd + ", "
|
||||
alldests = alldests.rstrip(", ")
|
||||
newmsg.add_header('to', alldests)
|
||||
|
||||
# Also: CC
|
||||
|
||||
if self.body:
|
||||
newmsg.set_content(self.body, maintype = self.bodymimemain,
|
||||
subtype = self.bodymimesub)
|
||||
|
||||
for att in self.attachments:
|
||||
#if self.body:
|
||||
# newmsg.make_mixed()
|
||||
ext = os.path.splitext(att[1])[1]
|
||||
_deb("Querying mimemap with %s" % ext)
|
||||
mime = mimemap.get(ext)
|
||||
if not mime:
|
||||
mime = 'application/octet-stream'
|
||||
_deb("Attachment: filename %s MIME %s" % (att[1], mime))
|
||||
mt,st = mime.split('/')
|
||||
newmsg.add_attachment(att[0], maintype=mt, subtype=st,
|
||||
filename=att[1])
|
||||
# Also: CC
|
||||
|
||||
if self.body:
|
||||
newmsg.set_content(self.body, maintype = self.bodymimemain,
|
||||
subtype = self.bodymimesub)
|
||||
|
||||
for att in self.attachments:
|
||||
fn = att[1]
|
||||
ext = os.path.splitext(fn)[1]
|
||||
mime = self.mimemap.get(ext)
|
||||
if not mime:
|
||||
mime = 'application/octet-stream'
|
||||
#self.log("Attachment: filename %s MIME %s" % (fn, mime))
|
||||
mt,st = mime.split('/')
|
||||
newmsg.add_attachment(att[0], maintype=mt, subtype=st,
|
||||
filename=fn)
|
||||
|
||||
newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000")
|
||||
print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80))
|
||||
#newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000")
|
||||
#print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80))
|
||||
ret = newmsg.as_string(maxheaderlen=100)
|
||||
|
||||
self.reset()
|
||||
|
||||
self.reset()
|
||||
return ret
|
||||
|
||||
|
||||
class PFFReader(object):
|
||||
def __init__(self, infile=sys.stdin):
|
||||
try:
|
||||
self.myname = os.path.basename(sys.argv[0])
|
||||
except:
|
||||
self.myname = "???"
|
||||
def __init__(self, logger, infile=sys.stdin):
|
||||
self.log = logger
|
||||
config = rclconfig.RclConfig()
|
||||
dir1 = os.path.join(config.getConfDir(), "examples")
|
||||
dir2 = os.path.join(config.datadir, "examples")
|
||||
self.mimemap = conftree.ConfStack('mimemap', [dir1, dir2])
|
||||
self.infile = infile
|
||||
self.fields = {}
|
||||
self.msg = EmailBuilder(self.log, self.mimemap)
|
||||
|
||||
# Read single parameter from process input: line with param name and size
|
||||
# followed by data. The param name is returned as str/unicode, the data
|
||||
# as bytes
|
||||
def readparam(self):
|
||||
inf = self.infile
|
||||
s = inf.readline()
|
||||
if s == b'':
|
||||
return ('', b'')
|
||||
s = s.rstrip(b'\n')
|
||||
if s == b'':
|
||||
return ('', b'')
|
||||
l = s.split()
|
||||
if len(l) != 2:
|
||||
self.log(b'bad line: [' + s + b']', 1, 1)
|
||||
return ('', b'')
|
||||
paramname = l[0].decode('ASCII').rstrip(':')
|
||||
paramsize = int(l[1])
|
||||
if paramsize > 0:
|
||||
paramdata = inf.read(paramsize)
|
||||
if len(paramdata) != paramsize:
|
||||
self.log("Bad read: wanted %d, got %d" %
|
||||
(paramsize, len(paramdata)), 1, 1)
|
||||
return('', b'')
|
||||
else:
|
||||
paramdata = b''
|
||||
return (paramname, paramdata)
|
||||
|
||||
self.infile = infile
|
||||
self.fields = {}
|
||||
self.msg = EmailBuilder()
|
||||
|
||||
if sys.platform == "win32":
|
||||
import msvcrt
|
||||
msvcrt.setmode(self.outfile.fileno(), os.O_BINARY)
|
||||
msvcrt.setmode(self.infile.fileno(), os.O_BINARY)
|
||||
self.debugfile = None
|
||||
if self.debugfile:
|
||||
self.errfout = open(self.debugfile, "a")
|
||||
else:
|
||||
self.errfout = sys.stderr
|
||||
|
||||
def log(self, s):
|
||||
print("PFFReader: %s: %s" % (self.myname, s), file=self.errfout)
|
||||
def mainloop(self):
|
||||
basename = ''
|
||||
path = ''
|
||||
while 1:
|
||||
name, data = self.readparam()
|
||||
if name == "":
|
||||
break
|
||||
try:
|
||||
paramstr = data.decode('utf-8')
|
||||
except:
|
||||
paramstr = ''
|
||||
|
||||
# Read single parameter from process input: line with param name and size
|
||||
# followed by data. The param name is returned as str/unicode, the data
|
||||
# as bytes
|
||||
def readparam(self):
|
||||
inf = self.infile.buffer
|
||||
s = inf.readline()
|
||||
if s == b'':
|
||||
return ('', b'')
|
||||
s = s.rstrip(b'\n')
|
||||
if s == b'':
|
||||
return ('', b'')
|
||||
l = s.split()
|
||||
if len(l) != 2:
|
||||
self.log(b'bad line: [' + s + b']', 1, 1)
|
||||
return ('', b'')
|
||||
paramname = l[0].decode('ASCII').rstrip(':')
|
||||
paramsize = int(l[1])
|
||||
if paramsize > 0:
|
||||
paramdata = inf.read(paramsize)
|
||||
if len(paramdata) != paramsize:
|
||||
self.log("Bad read: wanted %d, got %d" %
|
||||
(paramsize, len(paramdata)), 1, 1)
|
||||
return('', b'')
|
||||
else:
|
||||
paramdata = b''
|
||||
return (paramname, paramdata)
|
||||
|
||||
def mainloop(self):
|
||||
basename = ''
|
||||
while 1:
|
||||
name, data = self.readparam()
|
||||
if name == "":
|
||||
break
|
||||
try:
|
||||
paramstr = data.decode('utf-8')
|
||||
except:
|
||||
paramstr = ''
|
||||
|
||||
if name == 'filename':
|
||||
basename = os.path.basename(paramstr)
|
||||
self.log("name: [%s] data: %s" %
|
||||
(name, paramstr))
|
||||
parentdir = os.path.basename(os.path.dirname(paramstr))
|
||||
elif name == 'data':
|
||||
if parentdir == 'Attachments':
|
||||
#self.log("Attachment: %s" % basename)
|
||||
self.msg.addattachment(data, basename)
|
||||
else:
|
||||
if basename == 'OutlookHeaders.txt':
|
||||
self.msg.flush()
|
||||
pass
|
||||
if basename == 'ConversationIndex.txt':
|
||||
pass
|
||||
elif basename == 'Recipients.txt':
|
||||
pass
|
||||
elif basename == 'InternetHeaders.txt':
|
||||
#self.log("name: [%s] data: %s" % (name, paramstr))
|
||||
self.msg.setheaders(data)
|
||||
elif os.path.splitext(basename)[0] == 'Message':
|
||||
ext = os.path.splitext(basename)[1]
|
||||
if ext == '.txt':
|
||||
self.msg.setbody(data, 'text', 'plain')
|
||||
elif ext == '.html':
|
||||
self.msg.setbody(data, 'text', 'html')
|
||||
elif ext == '.rtf':
|
||||
self.msg.setbody(data, 'text', 'rtf')
|
||||
else:
|
||||
raise Exception("PST: Unknown body type %s"%ext)
|
||||
self.log("Message")
|
||||
pass
|
||||
basename = ''
|
||||
parentdir = ''
|
||||
self.log("Out of loop")
|
||||
self.msg.flush()
|
||||
if name == 'filename':
|
||||
self.log("filename: %s" % paramstr)
|
||||
path = paramstr
|
||||
basename = os.path.basename(path)
|
||||
parentdir = os.path.basename(os.path.dirname(paramstr))
|
||||
elif name == 'data':
|
||||
if parentdir == 'Attachments':
|
||||
#self.log("Attachment: %s" % basename)
|
||||
self.msg.addattachment(data, basename)
|
||||
else:
|
||||
if basename == 'OutlookHeaders.txt':
|
||||
doc = self.msg.flush()
|
||||
if doc:
|
||||
yield((doc, path))
|
||||
elif basename == 'ConversationIndex.txt':
|
||||
pass
|
||||
elif basename == 'Recipients.txt':
|
||||
pass
|
||||
elif basename == 'InternetHeaders.txt':
|
||||
#self.log("name: [%s] data: %s" % (name, paramstr))
|
||||
self.msg.setheaders(data)
|
||||
elif os.path.splitext(basename)[0] == 'Message':
|
||||
ext = os.path.splitext(basename)[1]
|
||||
if ext == '.txt':
|
||||
self.msg.setbody(data, 'text', 'plain')
|
||||
elif ext == '.html':
|
||||
self.msg.setbody(data, 'text', 'html')
|
||||
elif ext == '.rtf':
|
||||
self.msg.setbody(data, 'text', 'rtf')
|
||||
else:
|
||||
raise Exception("PST: Unknown body type %s"%ext)
|
||||
self.log("Message")
|
||||
pass
|
||||
basename = ''
|
||||
parentdir = ''
|
||||
self.log("Out of loop")
|
||||
doc = self.msg.flush()
|
||||
if doc:
|
||||
yield((doc, path))
|
||||
return
|
||||
|
||||
|
||||
config = rclconfig.RclConfig()
|
||||
dir1 = os.path.join(config.getConfDir(), "examples")
|
||||
dir2 = os.path.join(config.datadir, "examples")
|
||||
mimemap = conftree.ConfStack('mimemap', [dir1, dir2])
|
||||
class PstExtractor(object):
|
||||
def __init__(self, em):
|
||||
self.currentindex = 0
|
||||
self.em = em
|
||||
self.cmd = ["pffexport", "-q", "-t", "/nonexistent", "-s"]
|
||||
|
||||
proto = PFFReader()
|
||||
proto.mainloop()
|
||||
def startCmd(self, filename):
|
||||
fullcmd = self.cmd + [rclexecm.subprocfile(filename)]
|
||||
try:
|
||||
self.proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE)
|
||||
except subprocess.CalledProcessError as err:
|
||||
self.em.rclog("Pst: Popen(%s) error: %s" % (fullcmd, err))
|
||||
return False
|
||||
except OSError as err:
|
||||
self.em.rclog("Pst: Popen(%s) OS error: %s" % (fullcmd, err))
|
||||
return (False, "")
|
||||
self.filein = self.proc.stdout
|
||||
return True
|
||||
|
||||
def extractone(self, ipath):
|
||||
#self.em.rclog("extractone: [%s]" % ipath)
|
||||
docdata = ""
|
||||
ok = False
|
||||
iseof = True
|
||||
return (ok, docdata, rclexecm.makebytes(ipath), iseof)
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
filename = params["filename:"]
|
||||
|
||||
if not self.startCmd(filename):
|
||||
return False
|
||||
|
||||
reader = PFFReader(self.em.rclog, infile=self.filein)
|
||||
self.generator = reader.mainloop()
|
||||
return True
|
||||
|
||||
|
||||
def getipath(self, params):
|
||||
ipath = params["ipath:"]
|
||||
ok, data, ipath, eof = self.extractone(ipath)
|
||||
if ok:
|
||||
return (ok, data, ipath, eof)
|
||||
# Not found. Maybe we need to decode the path?
|
||||
try:
|
||||
ipath = ipath.decode("utf-8")
|
||||
return self.extractone(ipath)
|
||||
except Exception as err:
|
||||
return (ok, data, ipath, eof)
|
||||
|
||||
def getnext(self, params):
|
||||
try:
|
||||
doc, ipath = next(self.generator)
|
||||
self.em.setmimetype("message/rfc822")
|
||||
#self.em.rclog("doc %s ipath %s" % (doc[:40], ipath))
|
||||
except StopIteration:
|
||||
return(False, "", "", rclexecm.RclExecM.eofnow)
|
||||
return (True, doc, ipath, False)
|
||||
|
||||
|
||||
# Main program: create protocol handler and extractor and run them
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = PstExtractor(proto)
|
||||
rclexecm.main(proto, extract)
|
||||
|
||||
@ -78,6 +78,8 @@ class RclConfig:
|
||||
|
||||
def getConfDir(self):
|
||||
return self.confdir
|
||||
def getDataDir(self):
|
||||
return self.datadir
|
||||
|
||||
def setKeyDir(self, dir):
|
||||
self.keydir = dir
|
||||
|
||||
@ -78,6 +78,7 @@ application/pdf = execm rclpdf.py
|
||||
application/postscript = exec rclps
|
||||
application/sql = internal text/plain
|
||||
application/vnd.ms-excel = execm rclxls.py
|
||||
application/vnd.ms-outlook = execm rclpst.py
|
||||
application/vnd.ms-powerpoint = execm rclppt.py
|
||||
application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
||||
application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
||||
|
||||
@ -61,6 +61,9 @@
|
||||
# extracted message. Also used by Windows Live Mail
|
||||
.eml = message/rfc822
|
||||
|
||||
.pst = application/vnd.ms-outlook
|
||||
.ost = application/vnd.ms-outlook
|
||||
|
||||
.pdf = application/pdf
|
||||
|
||||
.ps = application/postscript
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user