ckpt: pst: basic indexing of email. no getipath/preview

2019-05-26 12:30:59 +02:00 · 2019-05-26 12:30:59 +02:00 · cc4f4e0c74
commit cc4f4e0c74
parent c7c413d9e7
4 changed files with 235 additions and 179 deletions
--- a/src/filters/rclpst.py
+++ b/src/filters/rclpst.py
@ -1,26 +1,27 @@
 #!/usr/bin/python3
 #################################
 # Copyright (C) 2019 J.F.Dockes
-#	This program is free software; you can redistribute it and/or modify
-#	it under the terms of the GNU General Public License as published by
-#	the Free Software Foundation; either version 2 of the License, or
-#	(at your option) any later version.
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
 #
-#	This program is distributed in the hope that it will be useful,
-#	but WITHOUT ANY WARRANTY; without even the implied warranty of
-#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#	GNU General Public License for more details.
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
 #
-#	You should have received a copy of the GNU General Public License
-#	along with this program; if not, write to the
-#	Free Software Foundation, Inc.,
-#	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+#   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ########################################################

 #
 # Process the stream produced by a modified pffexport:
 # https://github.com/libyal/libpff
-# The tool has been modified to produce a data stream instead of a file tree
+# The modification allows producing a data stream instead of a file tree
+#

 import sys
 import os
@ -31,189 +32,238 @@ import traceback
 import email.parser
 import email.policy
 import mailbox
+import subprocess

+import rclexecm
 import rclconfig
 import conftree

-def _deb(s):
-	print("%s"%s, file=sys.stderr)
-

 # The pffexport stream yields the email in several pieces, with some
 # data missing (e.g. attachment MIME types). We rebuild a complete
 # message for parsing by the Recoll email handler
 class EmailBuilder(object):
-	def __init__(self):
-		self.reset()
-		self.parser = email.parser.Parser(policy = email.policy.default)
-	def reset(self):
-		self.headers = ''
-		self.body = ''
-		self.bodymime = ''
-		self.attachments = []
-	def setheaders(self, h):
-		self.headers = h
-	def setbody(self, body, main, sub):
-		self.body = body
-		self.bodymimemain = main
-		self.bodymimesub = sub
-	def addattachment(self, att, filename):
-		_deb("Adding attachment")
-		self.attachments.append((att, filename))
-	def flush(self):
-		if not self.headers:
-			_deb("Not flushing because no headers")
-		if self.headers and (self.body or self.attachments):
-			newmsg = email.message.EmailMessage(policy =
-													email.policy.default)
-			
-			headerstr = self.headers.decode('utf-8')
-			# print("%s" % headerstr)
-			headers = self.parser.parsestr(headerstr, headersonly=True)
-			_deb("EmailBuilder: content-type %s" % headers['content-type'])
-			for nm in ('from', 'subject'):
-				if nm in headers:
-					newmsg.add_header(nm, headers[nm])
+    def __init__(self, logger, mimemap):
+        self.log = logger
+        self.reset()
+        self.mimemap = mimemap
+        self.parser = email.parser.Parser(policy = email.policy.default)
+    def reset(self):
+        self.headers = ''
+        self.body = ''
+        self.bodymime = ''
+        self.attachments = []
+    def setheaders(self, h):
+        self.headers = h
+    def setbody(self, body, main, sub):
+        self.body = body
+        self.bodymimemain = main
+        self.bodymimesub = sub
+    def addattachment(self, att, filename):
+        self.log("Adding attachment")
+        self.attachments.append((att, filename))
+    def flush(self):
+        if not (self.headers and (self.body or self.attachments)):
+            self.log("Not flushing because no headers or no body/attach")
+            return None
+        newmsg = email.message.EmailMessage(policy=email.policy.default)
+            
+        headerstr = self.headers.decode('utf-8')
+        # print("%s" % headerstr)
+        headers = self.parser.parsestr(headerstr, headersonly=True)
+        #self.log("EmailBuilder: content-type %s" % headers['content-type'])
+        for nm in ('from', 'subject'):
+            if nm in headers:
+                newmsg.add_header(nm, headers[nm])

-			tolist = headers.get_all('to')
-			alldests = ""
-			for toheader in tolist:
-				for dest in toheader.addresses:
-					sd = str(dest).replace('\n', '').replace('\r','')
-					_deb("EmailBuilder: dest %s" % sd)
-					alldests += sd + ", "
-				alldests = alldests.rstrip(", ")
-				newmsg.add_header('to', alldests)
+        tolist = headers.get_all('to')
+        alldests = ""
+        for toheader in tolist:
+            for dest in toheader.addresses:
+                sd = str(dest).replace('\n', '').replace('\r','')
+                #self.log("EmailBuilder: dest %s" % sd)
+                alldests += sd + ", "
+            alldests = alldests.rstrip(", ")
+            newmsg.add_header('to', alldests)

-			# Also: CC
-			
-			if self.body:
-				newmsg.set_content(self.body, maintype = self.bodymimemain,
-									   subtype = self.bodymimesub)
-				
-			for att in self.attachments:
-				#if self.body:
-				#	newmsg.make_mixed()
-				ext = os.path.splitext(att[1])[1]
-				_deb("Querying mimemap with %s" % ext)
-				mime = mimemap.get(ext)
-				if not mime:
-					mime = 'application/octet-stream'
-				_deb("Attachment: filename %s MIME %s" % (att[1], mime))
-				mt,st = mime.split('/')
-				newmsg.add_attachment(att[0], maintype=mt, subtype=st,
-									  filename=att[1])
+        # Also: CC
+            
+        if self.body:
+            newmsg.set_content(self.body, maintype = self.bodymimemain,
+                               subtype = self.bodymimesub)
+                
+        for att in self.attachments:
+            fn = att[1]
+            ext = os.path.splitext(fn)[1]
+            mime = self.mimemap.get(ext)
+            if not mime:
+                mime = 'application/octet-stream'
+            #self.log("Attachment: filename %s MIME %s" % (fn, mime))
+            mt,st = mime.split('/')
+            newmsg.add_attachment(att[0], maintype=mt, subtype=st,
+                                  filename=fn)

-			newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000")
-			print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80))
+        #newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000")
+        #print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80))
+        ret = newmsg.as_string(maxheaderlen=100)

-		self.reset()
-	
+        self.reset()
+        return ret
+    

 class PFFReader(object):
-	def __init__(self, infile=sys.stdin):
-		try:
-			self.myname = os.path.basename(sys.argv[0])
-		except:
-			self.myname = "???"
+    def __init__(self, logger, infile=sys.stdin):
+        self.log = logger
+        config = rclconfig.RclConfig()
+        dir1 = os.path.join(config.getConfDir(), "examples")
+        dir2 = os.path.join(config.datadir, "examples")
+        self.mimemap = conftree.ConfStack('mimemap', [dir1, dir2])
+        self.infile = infile
+        self.fields = {}
+        self.msg = EmailBuilder(self.log, self.mimemap)
+        
+    # Read single parameter from process input: line with param name and size
+    # followed by data. The param name is returned as str/unicode, the data
+    # as bytes
+    def readparam(self):
+        inf = self.infile
+        s = inf.readline()
+        if s == b'':
+            return ('', b'')
+        s = s.rstrip(b'\n')
+        if s == b'':
+            return ('', b'')
+        l = s.split()
+        if len(l) != 2:
+            self.log(b'bad line: [' + s + b']', 1, 1)
+            return ('', b'')
+        paramname = l[0].decode('ASCII').rstrip(':')
+        paramsize = int(l[1])
+        if paramsize > 0:
+            paramdata = inf.read(paramsize)
+            if len(paramdata) != paramsize:
+                self.log("Bad read: wanted %d, got %d" %
+                      (paramsize, len(paramdata)), 1, 1)
+                return('', b'')
+        else:
+            paramdata = b''
+        return (paramname, paramdata)

-		self.infile = infile
-		self.fields = {}
-		self.msg = EmailBuilder()
-		
-		if sys.platform == "win32":
-			import msvcrt
-			msvcrt.setmode(self.outfile.fileno(), os.O_BINARY)
-			msvcrt.setmode(self.infile.fileno(), os.O_BINARY)
-		self.debugfile = None
-		if self.debugfile:
-			self.errfout = open(self.debugfile, "a")
-		else:
-			self.errfout = sys.stderr
-		
-	def log(self, s):
-		print("PFFReader: %s: %s" % (self.myname, s), file=self.errfout)
+    def mainloop(self):
+        basename = ''
+        path = ''
+        while 1:
+            name, data = self.readparam()
+            if name == "":
+                break
+            try:
+                paramstr = data.decode('utf-8')
+            except:
+                paramstr = ''

-	# Read single parameter from process input: line with param name and size
-	# followed by data. The param name is returned as str/unicode, the data
-	# as bytes
-	def readparam(self):
-		inf = self.infile.buffer
-		s = inf.readline()
-		if s == b'':
-			return ('', b'')
-		s = s.rstrip(b'\n')
-		if s == b'':
-			return ('', b'')
-		l = s.split()
-		if len(l) != 2:
-			self.log(b'bad line: [' + s + b']', 1, 1)
-			return ('', b'')
-		paramname = l[0].decode('ASCII').rstrip(':')
-		paramsize = int(l[1])
-		if paramsize > 0:
-			paramdata = inf.read(paramsize)
-			if len(paramdata) != paramsize:
-				self.log("Bad read: wanted %d, got %d" %
-					  (paramsize, len(paramdata)), 1, 1)
-				return('', b'')
-		else:
-			paramdata = b''
-		return (paramname, paramdata)
-
-	def mainloop(self):
-		basename = ''
-		while 1:
-			name, data = self.readparam()
-			if name == "":
-				break
-			try:
-				paramstr = data.decode('utf-8')
-			except:
-				paramstr = ''
-
-			if name == 'filename':
-				basename = os.path.basename(paramstr)
-				self.log("name: [%s] data: %s" %
-						 (name, paramstr))
-				parentdir = os.path.basename(os.path.dirname(paramstr))
-			elif name == 'data':
-				if parentdir == 'Attachments':
-					#self.log("Attachment: %s" % basename)
-					self.msg.addattachment(data, basename)
-				else:
-					if basename == 'OutlookHeaders.txt':
-						self.msg.flush()
-						pass
-					if basename == 'ConversationIndex.txt':
-						pass
-					elif basename == 'Recipients.txt':
-						pass
-					elif basename == 'InternetHeaders.txt':
-						#self.log("name: [%s] data: %s" % (name, paramstr))
-						self.msg.setheaders(data)
-					elif os.path.splitext(basename)[0] == 'Message':
-						ext = os.path.splitext(basename)[1]
-						if ext == '.txt':
-							self.msg.setbody(data, 'text', 'plain')
-						elif ext == '.html':
-							self.msg.setbody(data, 'text', 'html')
-						elif ext == '.rtf':
-							self.msg.setbody(data, 'text', 'rtf')
-						else:
-							raise Exception("PST: Unknown body type %s"%ext)
-						self.log("Message")
-						pass
-				basename = ''
-				parentdir = ''
-		self.log("Out of loop")
-		self.msg.flush()
+            if name == 'filename':
+                self.log("filename: %s" %  paramstr)
+                path = paramstr
+                basename = os.path.basename(path)
+                parentdir = os.path.basename(os.path.dirname(paramstr))
+            elif name == 'data':
+                if parentdir == 'Attachments':
+                    #self.log("Attachment: %s" % basename)
+                    self.msg.addattachment(data, basename)
+                else:
+                    if basename == 'OutlookHeaders.txt':
+                        doc = self.msg.flush()
+                        if doc:
+                            yield((doc, path))
+                    elif basename == 'ConversationIndex.txt':
+                        pass
+                    elif basename == 'Recipients.txt':
+                        pass
+                    elif basename == 'InternetHeaders.txt':
+                        #self.log("name: [%s] data: %s" % (name, paramstr))
+                        self.msg.setheaders(data)
+                    elif os.path.splitext(basename)[0] == 'Message':
+                        ext = os.path.splitext(basename)[1]
+                        if ext == '.txt':
+                            self.msg.setbody(data, 'text', 'plain')
+                        elif ext == '.html':
+                            self.msg.setbody(data, 'text', 'html')
+                        elif ext == '.rtf':
+                            self.msg.setbody(data, 'text', 'rtf')
+                        else:
+                            raise Exception("PST: Unknown body type %s"%ext)
+                        self.log("Message")
+                        pass
+                basename = ''
+                parentdir = ''
+        self.log("Out of loop")
+        doc = self.msg.flush()
+        if doc:
+            yield((doc, path))
+        return


-config = rclconfig.RclConfig()
-dir1 = os.path.join(config.getConfDir(), "examples")
-dir2 = os.path.join(config.datadir, "examples")
-mimemap = conftree.ConfStack('mimemap', [dir1, dir2])
+class PstExtractor(object):
+    def __init__(self, em):
+        self.currentindex = 0
+        self.em = em
+        self.cmd = ["pffexport", "-q", "-t", "/nonexistent", "-s"]

-proto = PFFReader()
-proto.mainloop()
+    def startCmd(self, filename):
+        fullcmd = self.cmd + [rclexecm.subprocfile(filename)]
+        try:
+            self.proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE)
+        except subprocess.CalledProcessError as err:
+            self.em.rclog("Pst: Popen(%s) error: %s" % (fullcmd, err))
+            return False
+        except OSError as err:
+            self.em.rclog("Pst: Popen(%s) OS error: %s" % (fullcmd, err))
+            return (False, "")
+        self.filein = self.proc.stdout
+        return True
+
+    def extractone(self, ipath):
+        #self.em.rclog("extractone: [%s]" % ipath)
+        docdata = ""
+        ok = False
+        iseof = True
+        return (ok, docdata, rclexecm.makebytes(ipath), iseof)
+
+    ###### File type handler api, used by rclexecm ---------->
+    def openfile(self, params):
+        filename = params["filename:"]
+        
+        if not self.startCmd(filename):
+            return False
+        
+        reader = PFFReader(self.em.rclog, infile=self.filein)
+        self.generator = reader.mainloop()
+        return True
+
+
+    def getipath(self, params):
+        ipath = params["ipath:"]
+        ok, data, ipath, eof = self.extractone(ipath)
+        if ok:
+            return (ok, data, ipath, eof)
+        # Not found. Maybe we need to decode the path?
+        try:
+            ipath = ipath.decode("utf-8")
+            return self.extractone(ipath)
+        except Exception as err:
+            return (ok, data, ipath, eof)
+        
+    def getnext(self, params):
+        try:
+            doc, ipath = next(self.generator)
+            self.em.setmimetype("message/rfc822")
+            #self.em.rclog("doc %s ipath %s" % (doc[:40], ipath))
+        except StopIteration:
+            return(False, "", "", rclexecm.RclExecM.eofnow)
+        return (True, doc, ipath, False)
+    
+
+# Main program: create protocol handler and extractor and run them
+proto = rclexecm.RclExecM()
+extract = PstExtractor(proto)
+rclexecm.main(proto, extract)
--- a/src/python/recoll/recoll/rclconfig.py
+++ b/src/python/recoll/recoll/rclconfig.py
@ -78,6 +78,8 @@ class RclConfig:

    def getConfDir(self):
        return self.confdir
+    def getDataDir(self):
+        return self.datadir
    
    def setKeyDir(self, dir):
        self.keydir = dir
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -78,6 +78,7 @@ application/pdf = execm rclpdf.py
 application/postscript = exec rclps
 application/sql = internal text/plain
 application/vnd.ms-excel = execm rclxls.py
+application/vnd.ms-outlook = execm rclpst.py
 application/vnd.ms-powerpoint = execm rclppt.py
 application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
 application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
--- a/src/sampleconf/mimemap
+++ b/src/sampleconf/mimemap
@ -61,6 +61,9 @@
 # extracted message. Also used by Windows Live Mail
 .eml = message/rfc822

+.pst = application/vnd.ms-outlook
+.ost = application/vnd.ms-outlook
+
 .pdf = application/pdf

 .ps = application/postscript