229 lines
7.5 KiB
Python
229 lines
7.5 KiB
Python
#!/usr/bin/env python
|
|
"""This sample uses the Recoll Python API to index a directory
|
|
containing mbox files. This is not particularly useful as Recoll
|
|
itself can do this better (e.g. this script does not process
|
|
attachments), but it shows the use of most of the Recoll interface
|
|
features, except 'parent_udi' (we do not create a 'self' document to
|
|
act as the parent)."""
|
|
from __future__ import print_function
|
|
|
|
import sys
|
|
import glob
|
|
import os
|
|
import stat
|
|
import mailbox
|
|
import email.header
|
|
import email.utils
|
|
|
|
try:
    from recoll import recoll
except ImportError:
    # Fall back to a top-level 'recoll' module when the package form is
    # not available. Only import failures are handled here, so that
    # unrelated errors (e.g. KeyboardInterrupt) are not swallowed.
    import recoll
|
|
|
|
# EDIT
# Directory holding the mbox files to index. Point this at some
# directory with mbox files, such as a Thunderbird/Icedove mail storage
# directory, for example:
#   ~/.icedove/n8n19644.default/Mail/Local Folders/
mbdir = os.path.expanduser("~/mail")

# EDIT
# Directory where the recoll data for this script lives. Create it,
# with a (possibly empty) recoll.conf inside, before running the script
# for the first time.
rclconf = os.path.expanduser("~/.recoll-extern")
|
|
|
|
# Utility: extract text for named header
|
|
def header_value(msg, nm, to_utf = False):
    """Return the decoded text of header 'nm' from message 'msg'.

    msg    -- object with a dict-like get() method (e.g. an email message)
    nm     -- name of the header to extract
    to_utf -- if true, return the value encoded as UTF-8 bytes instead
              of text

    Returns "" when the header is absent. Otherwise every decoded
    fragment of the header is followed by a single space, so a non-empty
    result always ends with a space.

    Fragments without a declared charset are decoded as cp1252. A
    fragment which cannot be decoded with its declared charset (unknown
    charset name or invalid bytes) is reported on stderr and then decoded
    as cp1252 with replacement characters, so that the rest of its text
    is kept instead of being dropped.
    """
    value = msg.get(nm)
    if value is None:
        return ""

    pieces = []
    for raw, charset in email.header.decode_header(value):
        if not isinstance(raw, bytes):
            # decode_header() hands back plain text when the header holds
            # no encoded words: nothing to decode.
            pieces.append(raw)
            continue
        try:
            pieces.append(raw.decode(charset or "cp1252"))
        except (LookupError, UnicodeError) as err:
            print("Failed decoding header: %s" % err, file=sys.stderr)
            # Best effort: keep whatever is readable.
            pieces.append(raw.decode("cp1252", "replace"))

    univalue = u"".join(piece + u" " for piece in pieces)
    if to_utf:
        return univalue.encode('utf-8')
    return univalue
|
|
|
|
# Utility: extract text parts from body
|
|
def extract_text(msg):
    """Extract and decode all text/plain parts from the message.

    msg -- email message object (must provide walk()).

    Returns the concatenated text of every non-multipart text/plain
    part, in walk() order. When the RECOLL_FILTER_FORPREVIEW environment
    variable is set to "yes", the From, To and Subject headers followed
    by an empty line are output first.

    Each part is decoded with its declared charset (cp1252 when none is
    declared). If that fails (unknown charset name or invalid bytes) the
    error is reported on stderr and the part is decoded as cp1252 with
    replacement characters rather than being dropped.
    """
    chunks = []
    # We only output the headers for previewing, else they're already
    # output/indexed as fields.
    if os.environ.get("RECOLL_FILTER_FORPREVIEW") == "yes":
        chunks.append(u"From: " + header_value(msg, "From") + u"\n")
        chunks.append(u"To: " + header_value(msg, "To") + u"\n")
        chunks.append(u"Subject: " + header_value(msg, "Subject") + u"\n")
        chunks.append(u"\n")

    for part in msg.walk():
        if part.is_multipart():
            # Containers hold no text themselves, walk() yields their
            # sub-parts separately.
            continue
        if part.get_content_type().lower() != "text/plain":
            continue
        payload = part.get_payload(None, True)
        if payload is None:
            # Nothing to decode for this part.
            continue
        charset = part.get_content_charset("cp1252")
        try:
            chunks.append(payload.decode(charset))
        except (LookupError, UnicodeError) as err:
            print("Failed decoding payload: %s" % err, file=sys.stderr)
            # Best effort: keep whatever is readable.
            chunks.append(payload.decode("cp1252", "replace"))
    return u"".join(chunks)
|
|
|
|
|
|
|
|
class mbox_indexer:
    """The indexer class. An object is created for indexing one mbox folder.

    The same class is used at query time (with db set to None) to fetch
    the text of one message or to compute the file signature.
    """

    def __init__(self, db, mbfile):
        """Initialize for writable db recoll.Db object and mbfile mbox
        file. We retrieve the file size and mtime.

        db     -- writable recoll.Db object, or None when only getdata()
                  or sig() are going to be used
        mbfile -- path of the mbox file; errors from os.stat() propagate
        """
        self.db = db
        self.mbfile = mbfile
        stdata = os.stat(mbfile)
        self.fmtime = stdata[stat.ST_MTIME]
        self.fbytes = stdata[stat.ST_SIZE]
        # 1-based number of the message currently being indexed
        self.msgnum = 1

    def sig(self):
        """Create update verification value for mbox file:
        modification time concatenated with size should cover most
        cases"""
        return str(self.fmtime) + ":" + str(self.fbytes)

    def udi(self, msgnum):
        """Create unique document identifier for message. This should
        be shorter than 150 bytes, which we optimistically don't check
        here, as we just concatenate the mbox file name and message
        number"""
        return self.mbfile + ":" + str(msgnum)

    def index(self):
        """Index every message of the mbox file, unless the index is
        already up to date.

        Only the first message is checked with needUpdate(): all the
        messages of a file are stored with the same signature. Always
        returns None.
        """
        # Restart the numbering so that running index() again on the same
        # object produces the same udis and ipaths.
        self.msgnum = 1
        if not self.db.needUpdate(self.udi(1), self.sig()):
            print("Index is up to date for %s" % self.mbfile, file=sys.stderr)
            return None
        mb = mailbox.mbox(self.mbfile)
        try:
            # Iterate on the mailbox itself: values() would build the
            # list of all the messages in memory first.
            for msg in mb:
                print("Indexing message %d" % self.msgnum, file=sys.stderr)
                self.index_message(msg)
                self.msgnum += 1
        finally:
            mb.close()
        return None

    def getdata(self, ipath):
        """Implements the 'fetch' data access interface (called at
        query time from the command line).

        ipath -- 1-based message number, as text

        Returns the text extracted from the message, or "" if ipath is
        not a number or if there is no message with this number.
        """
        try:
            wanted = int(ipath)
        except (TypeError, ValueError):
            print("Bad ipath: %r" % (ipath,), file=sys.stderr)
            return ""
        mb = mailbox.mbox(self.mbfile)
        try:
            for num, msg in enumerate(mb, 1):
                if num == wanted:
                    return extract_text(msg)
        finally:
            mb.close()
        return ""

    def _date_to_mtime(self, dte):
        """Convert a Date header value to an epoch time string, using the
        mbox file mtime when the date is missing, unparseable or out of
        range."""
        tm = email.utils.parsedate_tz(dte)
        if tm is not None:
            try:
                return str(email.utils.mktime_tz(tm))
            except (ValueError, OverflowError):
                # Syntactically valid date with nonsensical values: do
                # not let a single message abort the indexing run.
                pass
        return str(self.fmtime)

    def index_message(self, msg):
        """Build the recoll document for message 'msg', numbered
        self.msgnum, and add it to (or update it in) the index."""
        doc = recoll.Doc()

        # Misc standard recoll fields
        doc.author = header_value(msg, "From")
        doc.recipient = header_value(msg, "To") + " " + header_value(msg, "Cc")
        doc.mtime = self._date_to_mtime(header_value(msg, "Date"))
        doc.title = header_value(msg, "Subject")
        doc.fbytes = str(self.fbytes)

        # Custom field
        doc.myfield = "some value"

        # Main document text and MIME type
        text = extract_text(msg)
        doc.text = text
        doc.dbytes = str(len(text.encode('UTF-8')))
        doc.mimetype = "text/plain"

        # Store data for later "up to date" checks
        doc.sig = self.sig()

        # The rclbes field is the link between the index data and this
        # script when used at query time
        doc.rclbes = "MBOX"

        # These get stored inside the index, and returned at query
        # time, but the main identifier is the condensed 'udi'
        doc.url = "file://" + self.mbfile
        doc.ipath = str(self.msgnum)
        # The udi is the unique document identifier, later used if we
        # want to e.g. delete the document index data (and other ops).
        udi = self.udi(self.msgnum)

        self.db.addOrUpdate(udi, doc)
|
|
|
|
# Index a directory containing mbox files
|
|
def index_mboxdir(dir):
    """Index the mbox files found directly inside directory 'dir'.

    Opens the recoll index configured by 'rclconf' for writing, runs an
    mbox_indexer on every regular file of the directory whose name
    contains no dot, then calls db.purge() (presumably this removes index
    data for documents not seen during this pass -- confirm against the
    Recoll Python API documentation).
    """
    db = recoll.connect(confdir=rclconf, writable=1)
    for path in glob.glob(dir + "/*"):
        # Our mboxes have no extensions: names with a dot (.log etc.)
        # are not mail folders.
        has_no_ext = '.' not in os.path.basename(path)
        if has_no_ext and os.path.isfile(path):
            print("Processing %s" % path)
            mbox_indexer(db, path).index()
    db.purge()
|
|
|
|
# Help text for the two ways of running the script: without arguments
# (indexing) or with a query-time command.
usage_string = '''Usage:
rclmbox.py
Index the directory (the path is hard-coded inside the script)
rclmbox.py [fetch|makesig] udi url ipath
fetch subdoc data or make signature (query time)
'''


def usage():
    """Write the usage message to stderr and exit with status 1."""
    sys.stderr.write(usage_string + "\n")
    sys.exit(1)
|
|
|
|
if len(sys.argv) == 1:
    # No arguments: index the configured mbox directory.
    index_mboxdir(mbdir)
else:
    # Query time invocation: cmd [fetch|makesig] udi url ipath
    if len(sys.argv) != 5:
        usage()
    cmd, udi, url, ipath = sys.argv[1:5]
    # Reject unknown commands before touching the mbox file.
    if cmd not in ('fetch', 'makesig'):
        usage()

    # Only strip the leading scheme: a 'file://' sequence occurring
    # later in the path is part of the file name.
    if url.startswith('file://'):
        mbfile = url[len('file://'):]
    else:
        mbfile = url
    # no need for a db for getdata or makesig.
    mbidx = mbox_indexer(None, mbfile)

    if cmd == 'fetch':
        data = mbidx.getdata(ipath).encode('UTF-8')
        # Output the raw UTF-8 bytes. On Python 3 printing a bytes object
        # would emit its b'...' representation, so write to the binary
        # layer of stdout when there is one.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
        out.write(data)
        out.flush()
    else:
        print(mbidx.sig(), end="")

sys.exit(0)
|