PST on Unix: email message indexing now appears to work correctly
This commit is contained in:
parent
cc4f4e0c74
commit
c1553029b9
@ -244,6 +244,7 @@ class RclExecM:
|
|||||||
else:
|
else:
|
||||||
ok, data, ipath, eof = processor.getnext(params)
|
ok, data, ipath, eof = processor.getnext(params)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
|
self.rclog("getipath/next: exception: %s" %err)
|
||||||
self.answer("", "", eof, RclExecM.fileerror)
|
self.answer("", "", eof, RclExecM.fileerror)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
@ -25,6 +25,8 @@
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import posixpath
|
||||||
|
import pathlib
|
||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
import getopt
|
import getopt
|
||||||
@ -48,26 +50,33 @@ class EmailBuilder(object):
|
|||||||
self.reset()
|
self.reset()
|
||||||
self.mimemap = mimemap
|
self.mimemap = mimemap
|
||||||
self.parser = email.parser.Parser(policy = email.policy.default)
|
self.parser = email.parser.Parser(policy = email.policy.default)
|
||||||
|
|
||||||
def reset(self):
    """Clear all per-message state so a new message can be assembled."""
    # Raw header block and body payload of the message being built.
    self.headers = self.body = ''
    # Main type and subtype of the body's MIME type.
    self.bodymimemain = self.bodymimesub = ''
    # Pending (att, filename) pairs; a fresh list for every message.
    self.attachments = []
|
||||||
|
|
||||||
def setheaders(self, h):
    """Record the raw header block of the message being built.

    The value is stored unchanged. flush() later calls
    ``.decode('utf-8')`` on it, so a bytes object is expected.
    """
    self.headers = h
|
||||||
|
|
||||||
def setbody(self, body, main, sub):
    """Store the body payload with its MIME main type and subtype.

    For example the caller passes ('text', 'rtf') as (main, sub) for
    an RTF message body.
    """
    self.body, self.bodymimemain, self.bodymimesub = body, main, sub
|
||||||
|
|
||||||
def addattachment(self, att, filename):
    """Queue one attachment for the message being built.

    The attachment is kept as an ``(att, filename)`` pair until the
    pending state is cleared by reset().
    """
    entry = (att, filename)
    self.attachments.append(entry)
|
||||||
|
|
||||||
def flush(self):
|
def flush(self):
|
||||||
if not (self.headers and (self.body or self.attachments)):
|
if not (self.headers and (self.body or self.attachments)):
|
||||||
self.log("Not flushing because no headers or no body/attach")
|
self.log("Not flushing because no headers or no body/attach")
|
||||||
|
self.reset()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
newmsg = email.message.EmailMessage(policy=email.policy.default)
|
newmsg = email.message.EmailMessage(policy=email.policy.default)
|
||||||
|
|
||||||
headerstr = self.headers.decode('utf-8')
|
headerstr = self.headers.decode('utf-8')
|
||||||
# print("%s" % headerstr)
|
# print("%s" % headerstr)
|
||||||
headers = self.parser.parsestr(headerstr, headersonly=True)
|
headers = self.parser.parsestr(headerstr, headersonly=True)
|
||||||
@ -105,8 +114,8 @@ class EmailBuilder(object):
|
|||||||
|
|
||||||
#newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000")
|
#newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000")
|
||||||
#print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80))
|
#print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80))
|
||||||
ret = newmsg.as_string(maxheaderlen=100)
|
|
||||||
|
|
||||||
|
ret = newmsg.as_string(maxheaderlen=100)
|
||||||
self.reset()
|
self.reset()
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
@ -151,7 +160,8 @@ class PFFReader(object):
|
|||||||
|
|
||||||
def mainloop(self):
|
def mainloop(self):
|
||||||
basename = ''
|
basename = ''
|
||||||
path = ''
|
fullpath = ''
|
||||||
|
ipath = ''
|
||||||
while 1:
|
while 1:
|
||||||
name, data = self.readparam()
|
name, data = self.readparam()
|
||||||
if name == "":
|
if name == "":
|
||||||
@ -162,10 +172,10 @@ class PFFReader(object):
|
|||||||
paramstr = ''
|
paramstr = ''
|
||||||
|
|
||||||
if name == 'filename':
|
if name == 'filename':
|
||||||
self.log("filename: %s" % paramstr)
|
#self.log("filename: %s" % paramstr)
|
||||||
path = paramstr
|
fullpath = paramstr
|
||||||
basename = os.path.basename(path)
|
basename = os.path.basename(fullpath)
|
||||||
parentdir = os.path.basename(os.path.dirname(paramstr))
|
parentdir = os.path.basename(os.path.dirname(fullpath))
|
||||||
elif name == 'data':
|
elif name == 'data':
|
||||||
if parentdir == 'Attachments':
|
if parentdir == 'Attachments':
|
||||||
#self.log("Attachment: %s" % basename)
|
#self.log("Attachment: %s" % basename)
|
||||||
@ -174,13 +184,17 @@ class PFFReader(object):
|
|||||||
if basename == 'OutlookHeaders.txt':
|
if basename == 'OutlookHeaders.txt':
|
||||||
doc = self.msg.flush()
|
doc = self.msg.flush()
|
||||||
if doc:
|
if doc:
|
||||||
yield((doc, path))
|
yield((doc, ipath))
|
||||||
elif basename == 'ConversationIndex.txt':
|
|
||||||
pass
|
|
||||||
elif basename == 'Recipients.txt':
|
|
||||||
pass
|
|
||||||
elif basename == 'InternetHeaders.txt':
|
elif basename == 'InternetHeaders.txt':
|
||||||
#self.log("name: [%s] data: %s" % (name, paramstr))
|
#self.log("name: [%s] data: %s" % (name, paramstr))
|
||||||
|
# This part is the indispensable one. Record
|
||||||
|
# the ipath at this point:
|
||||||
|
p = pathlib.Path(fullpath)
|
||||||
|
# Strip the top dir (/nonexistent.export/)
|
||||||
|
p = p.relative_to(*p.parts[:2])
|
||||||
|
# We use the parent directory as ipath: all
|
||||||
|
# the message parts are in there
|
||||||
|
ipath = str(p.parents[0])
|
||||||
self.msg.setheaders(data)
|
self.msg.setheaders(data)
|
||||||
elif os.path.splitext(basename)[0] == 'Message':
|
elif os.path.splitext(basename)[0] == 'Message':
|
||||||
ext = os.path.splitext(basename)[1]
|
ext = os.path.splitext(basename)[1]
|
||||||
@ -192,25 +206,31 @@ class PFFReader(object):
|
|||||||
self.msg.setbody(data, 'text', 'rtf')
|
self.msg.setbody(data, 'text', 'rtf')
|
||||||
else:
|
else:
|
||||||
raise Exception("PST: Unknown body type %s"%ext)
|
raise Exception("PST: Unknown body type %s"%ext)
|
||||||
self.log("Message")
|
elif basename == 'ConversationIndex.txt':
|
||||||
pass
|
pass
|
||||||
basename = ''
|
elif basename == 'Recipients.txt':
|
||||||
parentdir = ''
|
pass
|
||||||
|
else:
|
||||||
|
raise Exception("Unknown param name: %s" % name)
|
||||||
|
|
||||||
self.log("Out of loop")
|
self.log("Out of loop")
|
||||||
doc = self.msg.flush()
|
doc = self.msg.flush()
|
||||||
if doc:
|
if doc:
|
||||||
yield((doc, path))
|
yield((doc, ipath))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class PstExtractor(object):
|
class PstExtractor(object):
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.currentindex = 0
|
self.generator = None
|
||||||
self.em = em
|
self.em = em
|
||||||
self.cmd = ["pffexport", "-q", "-t", "/nonexistent", "-s"]
|
self.target = "/nonexistent"
|
||||||
|
self.cmd = ["pffexport", "-q", "-t", self.target, "-s"]
|
||||||
|
|
||||||
def startCmd(self, filename):
|
def startCmd(self, filename, ipath=None):
|
||||||
fullcmd = self.cmd + [rclexecm.subprocfile(filename)]
|
fullcmd = self.cmd + [rclexecm.subprocfile(filename)]
|
||||||
|
if ipath:
|
||||||
|
fullcmd += ["-p", ipath]
|
||||||
try:
|
try:
|
||||||
self.proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE)
|
self.proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE)
|
||||||
except subprocess.CalledProcessError as err:
|
except subprocess.CalledProcessError as err:
|
||||||
@ -222,48 +242,57 @@ class PstExtractor(object):
|
|||||||
self.filein = self.proc.stdout
|
self.filein = self.proc.stdout
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def extractone(self, ipath):
|
|
||||||
#self.em.rclog("extractone: [%s]" % ipath)
|
|
||||||
docdata = ""
|
|
||||||
ok = False
|
|
||||||
iseof = True
|
|
||||||
return (ok, docdata, rclexecm.makebytes(ipath), iseof)
|
|
||||||
|
|
||||||
###### File type handler api, used by rclexecm ---------->
|
###### File type handler api, used by rclexecm ---------->
|
||||||
def openfile(self, params):
    """Remember the file to process; the export command starts lazily.

    Args:
        params: request parameters; ``params["filename:"]`` is the
            path of the PST file to process.

    Returns:
        Always True. The pffexport subprocess is only started by
        getnext() or getipath().
    """
    self.filename = params["filename:"]
    self.em.rclog("openfile: %s" % self.filename)
    # Drop any generator left over from a previously opened file.
    # getnext() only builds a new reader when the generator is unset,
    # so without this it would keep reading the old file's stream.
    self.generator = None
    return True
|
||||||
|
|
||||||
|
|
||||||
def getipath(self, params):
    """Extract the single message designated by ``params["ipath:"]``.

    The ipath is decoded as UTF-8 and re-rooted under
    ``self.target + ".export"`` (the top directory that the reader
    strips when it records ipaths). It is then passed to pffexport
    through startCmd(ipath=...), and the first document produced by
    the reader is returned.

    Returns:
        A tuple ``(ok, data, ipath, eof)``. On failure (command could
        not be started, or no document produced) this is
        ``(False, "", "", rclexecm.RclExecM.eofnow)``.
    """
    ipath = posixpath.join(self.target + ".export",
                           params["ipath:"].decode("UTF-8"))
    self.em.rclog("getipath: [%s]" % ipath)
    if not self.startCmd(self.filename, ipath=ipath):
        return (False, "", "", rclexecm.RclExecM.eofnow)
    reader = PFFReader(self.em.rclog, infile=self.filein)
    self.generator = reader.mainloop()
    try:
        doc, ipath = next(self.generator)
    except StopIteration:
        self.em.rclog("getipath: StopIteration")
        return (False, "", "", rclexecm.RclExecM.eofnow)
    self.em.setmimetype("message/rfc822")
    self.em.rclog("getipath doc len %d [%s] ipath %s" %
                  (len(doc), doc[:20], ipath))
    # The debugging dump of every document to the fixed path
    # /tmp/document was removed: the file was never closed, the path is
    # predictable, and a write failure would abort the extraction.
    return (True, doc, ipath, False)
|
||||||
|
|
||||||
|
def getnext(self, params):
    """Return the next message of the current file.

    The pffexport subprocess and the reader generator are created on
    the first call, when no generator is set yet.

    Returns:
        A tuple ``(ok, data, ipath, eof)``. When the command cannot be
        started or no document is left, this is
        ``(False, "", "", rclexecm.RclExecM.eofnow)``.
    """
    self.em.rclog("getnext:")
    if not self.generator:
        if not self.startCmd(self.filename):
            # The caller unpacks four values, so a bare False would
            # raise on unpacking. Return the same failure tuple as
            # getipath() does.
            return (False, "", "", rclexecm.RclExecM.eofnow)
        reader = PFFReader(self.em.rclog, infile=self.filein)
        self.generator = reader.mainloop()
    try:
        doc, ipath = next(self.generator)
    except StopIteration:
        return (False, "", "", rclexecm.RclExecM.eofnow)
    self.em.setmimetype("message/rfc822")
    self.em.rclog("getnext: ipath %s\ndoc\n%s" % (ipath, doc))
    return (True, doc, ipath, False)
|
||||||
|
|
||||||
|
|
||||||
# Main program: create protocol handler and extractor and run them.
# The else branch is a standalone debugging driver that reads
# pffexport-style output from stdin; change the condition to use it.
if True:
    proto = rclexecm.RclExecM()
    extract = PstExtractor(proto)
    rclexecm.main(proto, extract)
else:
    def _deb(s):
        """Write one debug message to stderr."""
        print("%s" % s, file=sys.stderr)

    reader = PFFReader(_deb, infile=sys.stdin.buffer)
    generator = reader.mainloop()
    for doc, ipath in generator:
        _deb("Got %s data len %d" % (ipath, len(doc)))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user