pst handler: improved charset processing

2019-10-11 14:18:20 +02:00 · 2019-10-11 14:18:20 +02:00 · 2491388e9e
commit 2491388e9e
parent f66b5d1ef9
1 changed files with 65 additions and 21 deletions
--- a/src/filters/rclpst.py
+++ b/src/filters/rclpst.py
@ -93,7 +93,7 @@ class EmailBuilder(object):
            return None

        newmsg = email.message.EmailMessage(policy=email.policy.default)
-        headerstr = self.headers.decode("UTF-8")
+        headerstr = self.headers.decode("UTF-8", errors='replace')
        # print("%s" % headerstr)
        headers = self.parser.parsestr(headerstr, headersonly=True)
        #self.log("EmailBuilder: content-type %s" % headers['content-type'])
@ -101,21 +101,58 @@ class EmailBuilder(object):
            if nm in headers:
                newmsg.add_header(nm, headers[nm])

-        tolist = headers.get_all('to')
-        alldests = ""
-        for toheader in tolist:
-            for dest in toheader.addresses:
-                sd = str(dest).replace('\n', '').replace('\r','')
-                #self.log("EmailBuilder: dest %s" % sd)
-                alldests += sd + ", "
-            alldests = alldests.rstrip(", ")
-            newmsg.add_header('to', alldests)
-
-        # Also: CC
+        for h in ('to', 'cc'):
+            tolist = headers.get_all(h)
+            if not tolist:
+                continue
+            alldests = ""
+            for toheader in tolist:
+                for dest in toheader.addresses:
+                    sd = str(dest).replace('\n', '').replace('\r','')
+                    #self.log("EmailBuilder: dest %s" % sd)
+                    alldests += sd + ", "
+            if alldests:
+                alldests = alldests.rstrip(", ")
+                newmsg.add_header(h, alldests)

+# Decoding the body: the .pst contains the text value already decoded
+# from quoted-printable or base64 (at least that's what libpff sends).
+# Unfortunately, it appears that the charset value for subparts (e.g. the
+# HTML part of a multipart/related) is not saved (or not transmitted).
+#
+# This information is both necessary and unavailable, so we apply a heuristic
+# which works in 'most' cases: if we have a charset in the message
+# header, hopefully this is a simple body and the charset
+# applies. Otherwise, try to decode as UTF-8, and use charset=utf-8 if it
+# succeeds. Failing that, send the raw bytes and hope for the best (the HTML
+# handler still has a chance to get the charset from the HTML header).
+#
+# There are cases of HTML text encoded as UTF-8 but declaring charset=iso
+# in its head. Don't know if the original HTML was borked or if Outlook or
+# libpff converted it to UTF-8 without changing the head charset.
        if self.body:
-            newmsg.set_content(self.body, maintype = self.bodymimemain,
-                               subtype = self.bodymimesub)
+            if self.bodymimemain == 'text':
+                charset = headers.get_content_charset()
+                body = ''
+                if charset:
+                    body = self.body.decode(charset, errors='replace')
+                    #self.log("DECODE FROM HEADER CHARSET %s SUCCEEDED"% charset)
+                else:
+                    try:
+                        body = self.body.decode('utf-8')
+                        #self.log("DECODE FROM GUESSED UTF-8 SUCCEEDED")
+                    except:
+                        pass
+                if body:
+                    #self.log("Unicode body: %s" % body)
+                    newmsg.set_content(body, subtype = self.bodymimesub)
+                else:
+                    newmsg.set_content(self.body, maintype = self.bodymimemain,
+                                       subtype = self.bodymimesub)
+            else:
+                newmsg.set_content(self.body, maintype = self.bodymimemain,
+                                   subtype = self.bodymimesub)
+

        for att in self.attachments:
            fn = att[1]
@ -128,10 +165,10 @@ class EmailBuilder(object):
            newmsg.add_attachment(att[0], maintype=mt, subtype=st,
                                  filename=fn)

+        ret = newmsg.as_string(maxheaderlen=100)
        #newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000")
        #print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80))
-
-        ret = newmsg.as_string(maxheaderlen=100)
+        #self.log("MESSAGE: %s" % ret)
        self.reset()
        return ret
    
@ -225,7 +262,14 @@ class PFFReader(object):
                        elif ext == '.rtf':
                            self.msg.setbody(data, 'text', 'rtf')
                        else:
-                            raise Exception("PST: Unknown body type %s"%ext)
+                            # Note: I don't know what happens with a
+                            # message body of another type, e.g. image/jpg.
+                            # This is probably not a big issue,
+                            # because there is nothing to index.
+                            # We raised during dev to see if we would find one;
+                            # now we just pass.
+                            # raise Exception("PST: Unknown body type %s"%ext)
+                            pass
                    elif basename == 'ConversationIndex.txt':
                        pass
                    elif basename == 'Recipients.txt':
@ -233,7 +277,7 @@ class PFFReader(object):
            else:
                raise Exception("Unknown param name: %s" % name)

-        self.log("Out of loop")
+        #self.log("Out of loop")
        doc = self.msg.flush()
        if doc:
            yield((doc, ipath))
@ -248,9 +292,9 @@ class PstExtractor(object):
            self.target = "\\\\?\\c:\\nonexistent"
        else:
            self.target = "/nonexistent"
-        self.pffexport = rclexecm.which("pffexport")
+        self.pffexport = rclexecm.which("pffinstall/mingw32/bin/pffexport")
        if not self.pffexport:
-            self.pffexport = rclexecm.which("pffinstall/mingw32/bin/pffexport")
+            self.pffexport = rclexecm.which("pffexport")
            if not self.pffexport:
                # No need for anything else. openfile() will return an
                # error at once