pst handler: improved charset processing
This commit is contained in:
parent
f66b5d1ef9
commit
2491388e9e
@ -93,7 +93,7 @@ class EmailBuilder(object):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
newmsg = email.message.EmailMessage(policy=email.policy.default)
|
newmsg = email.message.EmailMessage(policy=email.policy.default)
|
||||||
headerstr = self.headers.decode("UTF-8")
|
headerstr = self.headers.decode("UTF-8", errors='replace')
|
||||||
# print("%s" % headerstr)
|
# print("%s" % headerstr)
|
||||||
headers = self.parser.parsestr(headerstr, headersonly=True)
|
headers = self.parser.parsestr(headerstr, headersonly=True)
|
||||||
#self.log("EmailBuilder: content-type %s" % headers['content-type'])
|
#self.log("EmailBuilder: content-type %s" % headers['content-type'])
|
||||||
@ -101,22 +101,59 @@ class EmailBuilder(object):
|
|||||||
if nm in headers:
|
if nm in headers:
|
||||||
newmsg.add_header(nm, headers[nm])
|
newmsg.add_header(nm, headers[nm])
|
||||||
|
|
||||||
tolist = headers.get_all('to')
|
for h in ('to', 'cc'):
|
||||||
alldests = ""
|
tolist = headers.get_all(h)
|
||||||
for toheader in tolist:
|
if not tolist:
|
||||||
for dest in toheader.addresses:
|
continue
|
||||||
sd = str(dest).replace('\n', '').replace('\r','')
|
alldests = ""
|
||||||
#self.log("EmailBuilder: dest %s" % sd)
|
for toheader in tolist:
|
||||||
alldests += sd + ", "
|
for dest in toheader.addresses:
|
||||||
alldests = alldests.rstrip(", ")
|
sd = str(dest).replace('\n', '').replace('\r','')
|
||||||
newmsg.add_header('to', alldests)
|
#self.log("EmailBuilder: dest %s" % sd)
|
||||||
|
alldests += sd + ", "
|
||||||
|
if alldests:
|
||||||
|
alldests = alldests.rstrip(", ")
|
||||||
|
newmsg.add_header(h, alldests)
|
||||||
|
|
||||||
# Also: CC
|
# Decoding the body: the .pst contains the text value decoded from qp
|
||||||
|
# or base64 (at least that's what libpff sends). Unfortunately, it
|
||||||
|
# appears that the charset value for subparts (e.g. the html part of a
|
||||||
|
# multipart/related) is not saved (or not transmitted).
|
||||||
|
#
|
||||||
|
# This information is both necessary and unavailable, so we apply a heuristic
|
||||||
|
# which works in 'most' cases: if we have a charset in the message
|
||||||
|
# header, hopefully, this is a simple body and the charset
|
||||||
|
# applies. Else try to decode from utf-8, and use charset=utf-8 if it
|
||||||
|
# succeeds. Else, send binary and hope for the best (the HTML handler
|
||||||
|
# still has a chance to get the charset from the HTML header).
|
||||||
|
#
|
||||||
|
# There are cases of an HTML UTF-8 text having charset=iso in the
|
||||||
|
# head. It is unclear whether the original HTML was broken or whether Outlook or
|
||||||
|
# libpff decoded to utf-8 without changing the head charset.
|
||||||
if self.body:
|
if self.body:
|
||||||
newmsg.set_content(self.body, maintype = self.bodymimemain,
|
if self.bodymimemain == 'text':
|
||||||
subtype = self.bodymimesub)
|
charset = headers.get_content_charset()
|
||||||
|
body = ''
|
||||||
|
if charset:
|
||||||
|
body = self.body.decode(charset, errors='replace')
|
||||||
|
#self.log("DECODE FROM HEADER CHARSET %s SUCCEEDED"% charset)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
body = self.body.decode('utf-8')
|
||||||
|
#self.log("DECODE FROM GUESSED UTF-8 SUCCEEDED")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
if body:
|
||||||
|
#self.log("Unicode body: %s" % body)
|
||||||
|
newmsg.set_content(body, subtype = self.bodymimesub)
|
||||||
|
else:
|
||||||
|
newmsg.set_content(self.body, maintype = self.bodymimemain,
|
||||||
|
subtype = self.bodymimesub)
|
||||||
|
else:
|
||||||
|
newmsg.set_content(self.body, maintype = self.bodymimemain,
|
||||||
|
subtype = self.bodymimesub)
|
||||||
|
|
||||||
|
|
||||||
for att in self.attachments:
|
for att in self.attachments:
|
||||||
fn = att[1]
|
fn = att[1]
|
||||||
ext = met_splitext(fn)[1]
|
ext = met_splitext(fn)[1]
|
||||||
@ -128,10 +165,10 @@ class EmailBuilder(object):
|
|||||||
newmsg.add_attachment(att[0], maintype=mt, subtype=st,
|
newmsg.add_attachment(att[0], maintype=mt, subtype=st,
|
||||||
filename=fn)
|
filename=fn)
|
||||||
|
|
||||||
|
ret = newmsg.as_string(maxheaderlen=100)
|
||||||
#newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000")
|
#newmsg.set_unixfrom("From some@place.org Sun Jan 01 00:00:00 2000")
|
||||||
#print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80))
|
#print("%s\n" % newmsg.as_string(unixfrom=True, maxheaderlen=80))
|
||||||
|
#self.log("MESSAGE: %s" % ret)
|
||||||
ret = newmsg.as_string(maxheaderlen=100)
|
|
||||||
self.reset()
|
self.reset()
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
@ -225,7 +262,14 @@ class PFFReader(object):
|
|||||||
elif ext == '.rtf':
|
elif ext == '.rtf':
|
||||||
self.msg.setbody(data, 'text', 'rtf')
|
self.msg.setbody(data, 'text', 'rtf')
|
||||||
else:
|
else:
|
||||||
raise Exception("PST: Unknown body type %s"%ext)
|
# Note: I don't know what happens with a
|
||||||
|
# message body of type, e.g. image/jpg.
|
||||||
|
# This is probably not a big issue,
|
||||||
|
# because there is nothing to index
|
||||||
|
# We raised during dev to see if we would find one,
|
||||||
|
# now just pass
|
||||||
|
# raise Exception("PST: Unknown body type %s"%ext)
|
||||||
|
pass
|
||||||
elif basename == 'ConversationIndex.txt':
|
elif basename == 'ConversationIndex.txt':
|
||||||
pass
|
pass
|
||||||
elif basename == 'Recipients.txt':
|
elif basename == 'Recipients.txt':
|
||||||
@ -233,7 +277,7 @@ class PFFReader(object):
|
|||||||
else:
|
else:
|
||||||
raise Exception("Unknown param name: %s" % name)
|
raise Exception("Unknown param name: %s" % name)
|
||||||
|
|
||||||
self.log("Out of loop")
|
#self.log("Out of loop")
|
||||||
doc = self.msg.flush()
|
doc = self.msg.flush()
|
||||||
if doc:
|
if doc:
|
||||||
yield((doc, ipath))
|
yield((doc, ipath))
|
||||||
@ -248,9 +292,9 @@ class PstExtractor(object):
|
|||||||
self.target = "\\\\?\\c:\\nonexistent"
|
self.target = "\\\\?\\c:\\nonexistent"
|
||||||
else:
|
else:
|
||||||
self.target = "/nonexistent"
|
self.target = "/nonexistent"
|
||||||
self.pffexport = rclexecm.which("pffexport")
|
self.pffexport = rclexecm.which("pffinstall/mingw32/bin/pffexport")
|
||||||
if not self.pffexport:
|
if not self.pffexport:
|
||||||
self.pffexport = rclexecm.which("pffinstall/mingw32/bin/pffexport")
|
self.pffexport = rclexecm.which("pffexport")
|
||||||
if not self.pffexport:
|
if not self.pffexport:
|
||||||
# No need for anything else. openfile() will return an
|
# No need for anything else. openfile() will return an
|
||||||
# error at once
|
# error at once
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user