index: support webarchive (.war) and mimehtml (.mhtml) formats

This commit is contained in:
Jean-Francois Dockes 2011-03-26 17:29:04 +01:00
parent 22788b8900
commit d0cb158d26
3 changed files with 61 additions and 1 deletions

54
src/filters/rclwar Executable file
View File

@ -0,0 +1,54 @@
#!/usr/bin/env python
# WAR web archive filter for recoll. WAR files are gzipped tar files.
import rclexecm
import tarfile
class WarExtractor:
    """Recoll execm handler returning the members of a WAR web archive.

    The archive is opened with the tarfile module. Each member is returned
    as one document, using the member name inside the archive as its ipath.
    """

    def __init__(self, em):
        # em: protocol object from rclexecm; only its rclog() method is
        # used by this class.
        self.em = em
        # Archive currently being processed; None until openfile() succeeds.
        self.tar = None

    def extractone(self, tarinfo):
        """Read the data of one archive member.

        Returns an (ok, data, ipath, eof) tuple. ok is False and data is ""
        when the member could not be read; eof is always
        rclexecm.RclExecM.noteof.
        """
        docdata = ""
        ok = False
        try:
            member = self.tar.extractfile(tarinfo)
            if member is None:
                # extractfile() returns None for members which are neither
                # regular files nor links (e.g. directories): report it
                # explicitly instead of failing on None.read().
                self.em.rclog("extractone: failed: [%s: not a regular file]"
                              % tarinfo.name)
            else:
                try:
                    docdata = member.read()
                    ok = True
                finally:
                    member.close()
        except Exception as err:
            self.em.rclog("extractone: failed: [%s]" % err)
            ok = False
        return (ok, docdata, tarinfo.name, rclexecm.RclExecM.noteof)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Open the archive named by params["filename:"].

        Any archive left open by a previous call is closed first, so that
        handles do not accumulate over the life of the process. Returns
        True on success, False (after logging the error) on failure.
        """
        self.currentindex = 0
        if self.tar is not None:
            try:
                self.tar.close()
            except Exception as err:
                self.em.rclog(str(err))
            self.tar = None
        try:
            self.tar = tarfile.open(params["filename:"])
            return True
        except Exception as err:
            self.em.rclog(str(err))
            return False

    def getipath(self, params):
        """Return the document for the member named by params["ipath:"].

        Returns (False, "", ipath, noteof) if the member cannot be looked
        up in the open archive, else the result of extractone().
        """
        ipath = params["ipath:"]
        try:
            tarinfo = self.tar.getmember(ipath)
        except Exception as err:
            self.em.rclog(str(err))
            return (False, "", ipath, rclexecm.RclExecM.noteof)
        return self.extractone(tarinfo)

    def getnext(self, params):
        """Return the document for the next member of the open archive.

        Returns (False, "", "", eofnow) once the archive is exhausted.
        """
        tarinfo = self.tar.next()
        if tarinfo is None:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        return self.extractone(tarinfo)
# Main program: create the rclexecm protocol handler, create a WAR
# extractor bound to it, then pass both to rclexecm.main().
proto = rclexecm.RclExecM()
extract = WarExtractor(proto)
rclexecm.main(proto, extract)

View File

@ -71,12 +71,13 @@ application/x-flac = execm rclaudio
application/x-gnuinfo = execm rclinfo
application/x-kword = exec rclkwd
application/x-lyx = exec rcllyx
application/x-mimehtml = internal message/rfc822
application/x-perl = internal text/plain
application/x-scribus = exec rclscribus
application/x-shellscript = internal text/plain
application/x-tex = exec rcltex
text/x-tex = exec rcltex
application/x-chm = execm rclchm
application/x-webarchive = execm rclwar
application/zip = execm rclzip;charset=default
audio/mpeg = execm rclaudio
audio/x-karaoke = execm rclkar
@ -104,6 +105,7 @@ text/x-purple-log = exec rclpurple
text/x-purple-html-log = internal text/html
text/x-python = exec rclpython
text/x-shellscript = internal text/plain
text/x-tex = exec rcltex
## #############################################
# Icons to be used in the result list if required by gui config

View File

@ -112,6 +112,10 @@
.fb2 = text/x-fictionbook
.war = application/x-webarchive
.mht = application/x-mimehtml
.mhtml = application/x-mimehtml
# A list of suffixes (name endings) that we don't want to touch at all.
# Having these explicitly listed speeds things up a bit by avoiding
# unneeded decompression or 'file' calls. File names still get indexed if