index: support webarchive (.war) and mimehtml (.mhtml) formats
This commit is contained in:
parent
22788b8900
commit
d0cb158d26
54
src/filters/rclwar
Executable file
54
src/filters/rclwar
Executable file
@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# WAR web archive filter for recoll. WAR files are gzipped tar files
|
||||
|
||||
import rclexecm
|
||||
import tarfile
|
||||
|
||||
class WarExtractor:
    """File type handler exposing the members of a WAR web archive.

    The archive is read with the standard ``tarfile`` module. Each member
    is returned as one document whose ipath is the member name inside the
    archive. Every data-returning method answers with a 4-tuple
    ``(ok, data, ipath, eof_flag)`` where ``eof_flag`` is one of the
    ``rclexecm.RclExecM`` constants.
    """

    def __init__(self, em):
        # em: the protocol object; only its rclog() method is used here.
        self.em = em
        # Currently opened archive, or None when no archive is open.
        self.tar = None

    def _closetar(self):
        """Close the currently opened archive, if any, and forget it."""
        tar = getattr(self, "tar", None)
        self.tar = None
        if tar is not None:
            try:
                tar.close()
            except Exception as err:
                # Best effort: a close failure must not prevent the next open.
                self.em.rclog(str(err))

    def extractone(self, tarinfo):
        """Read the data of one archive member.

        Returns ``(ok, data, member name, RclExecM.noteof)``. ``ok`` is
        False and ``data`` is empty when the member could not be read.
        """
        docdata = ""
        ok = False
        try:
            member = self.tar.extractfile(tarinfo)
            if member is None:
                # extractfile() returns None for members which carry no
                # data (directories for example).
                self.em.rclog("extractone: failed: [%s: no data to extract]"
                              % tarinfo.name)
            else:
                try:
                    docdata = member.read()
                finally:
                    member.close()
                ok = True
        except Exception as err:
            self.em.rclog("extractone: failed: [%s]" % err)
        return (ok, docdata, tarinfo.name, rclexecm.RclExecM.noteof)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Open the archive named by ``params["filename:"]``.

        Any previously opened archive is closed first, so that a failed
        open never leaves the handler pointing at the previous file.
        Returns True on success, False (after logging) on failure.
        """
        self.currentindex = 0
        self._closetar()
        try:
            self.tar = tarfile.open(params["filename:"])
            return True
        except Exception as err:
            self.em.rclog(str(err))
            return False

    def getipath(self, params):
        """Return the member designated by ``params["ipath:"]``.

        Answers ``(False, "", ipath, RclExecM.noteof)`` when the member
        cannot be found (or when no archive is open).
        """
        ipath = params["ipath:"]
        try:
            tarinfo = self.tar.getmember(ipath)
        except Exception as err:
            self.em.rclog(str(err))
            return (False, "", ipath, rclexecm.RclExecM.noteof)
        return self.extractone(tarinfo)

    def getnext(self, params):
        """Return the next member of the archive in sequential order.

        Answers ``(False, "", "", RclExecM.eofnow)`` once the archive is
        exhausted, or when no archive is currently open.
        """
        tarinfo = self.tar.next() if self.tar is not None else None
        if tarinfo is None:
            #self.em.rclog("getnext: EOF hit")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        return self.extractone(tarinfo)
|
||||
|
||||
# Script entry point: build the protocol handler, attach a WAR extractor
# to it, then pass both to rclexecm.main().
protocol = rclexecm.RclExecM()
rclexecm.main(protocol, WarExtractor(protocol))
|
||||
@ -71,12 +71,13 @@ application/x-flac = execm rclaudio
|
||||
application/x-gnuinfo = execm rclinfo
|
||||
application/x-kword = exec rclkwd
|
||||
application/x-lyx = exec rcllyx
|
||||
application/x-mimehtml = internal message/rfc822
|
||||
application/x-perl = internal text/plain
|
||||
application/x-scribus = exec rclscribus
|
||||
application/x-shellscript = internal text/plain
|
||||
application/x-tex = exec rcltex
|
||||
text/x-tex = exec rcltex
|
||||
application/x-chm = execm rclchm
|
||||
application/x-webarchive = execm rclwar
|
||||
application/zip = execm rclzip;charset=default
|
||||
audio/mpeg = execm rclaudio
|
||||
audio/x-karaoke = execm rclkar
|
||||
@ -104,6 +105,7 @@ text/x-purple-log = exec rclpurple
|
||||
text/x-purple-html-log = internal text/html
|
||||
text/x-python = exec rclpython
|
||||
text/x-shellscript = internal text/plain
|
||||
text/x-tex = exec rcltex
|
||||
|
||||
## #############################################
|
||||
# Icons to be used in the result list if required by gui config
|
||||
|
||||
@ -112,6 +112,10 @@
|
||||
|
||||
.fb2 = text/x-fictionbook
|
||||
|
||||
.war = application/x-webarchive
|
||||
.mht = application/x-mimehtml
|
||||
.mhtml = application/x-mimehtml
|
||||
|
||||
# A list of suffixes (name endings) that we don't want to touch at all.
|
||||
# Having these explicitly listed speeds things up a bit by avoiding
|
||||
# unneeded decompression or 'file' calls. File names still get indexed if
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user