diff --git a/src/filters/rclwar b/src/filters/rclwar
new file mode 100755
index 00000000..f102d5b5
--- /dev/null
+++ b/src/filters/rclwar
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+
+# WAR web archive filter for recoll. WAR files are gzipped tar files.
+
+import rclexecm
+import tarfile
+
+class WarExtractor:
+    """Handler returning the members of a WAR (tar) archive one by one.
+
+    The methods return (ok, data, ipath, eof) tuples for rclexecm; the
+    archive member name is used as ipath.
+    """
+
+    def __init__(self, em):
+        # Protocol object created by the main program; used for logging
+        self.em = em
+        # Archive being processed: set by openfile(), None until then
+        self.tar = None
+
+    def extractone(self, tarinfo):
+        """Read the data of one archive member.
+
+        Failures are logged and reported through ok == False; the eof
+        element is always RclExecM.noteof.
+        """
+        docdata = ""
+        try:
+            member = self.tar.extractfile(tarinfo)
+            if member is None:
+                # extractfile() returns None for members which are not
+                # regular files or links (e.g. directories)
+                raise ValueError("no data for member %s" % tarinfo.name)
+            docdata = member.read()
+            ok = True
+        except Exception as err:
+            self.em.rclog("extractone: failed: [%s]" % err)
+            ok = False
+        return (ok, docdata, tarinfo.name, rclexecm.RclExecM.noteof)
+
+    ###### File type handler api, used by rclexecm ---------->
+    def openfile(self, params):
+        """Open the archive named by params["filename:"].
+
+        Returns True on success, False (after logging) on failure.
+        """
+        self.currentindex = 0
+        if self.tar is not None:
+            # Do not leak the previous archive when switching files
+            self.tar.close()
+            self.tar = None
+        try:
+            self.tar = tarfile.open(params["filename:"])
+            return True
+        except Exception as err:
+            self.em.rclog(str(err))
+            return False
+
+    def getipath(self, params):
+        """Return the member whose name is params["ipath:"]."""
+        ipath = params["ipath:"]
+        try:
+            tarinfo = self.tar.getmember(ipath)
+        except Exception as err:
+            self.em.rclog(str(err))
+            return (False, "", ipath, rclexecm.RclExecM.noteof)
+        return self.extractone(tarinfo)
+
+    def getnext(self, params):
+        """Return the next member, or an eofnow tuple at end of archive."""
+        tarinfo = self.tar.next()
+        if tarinfo is None:
+            return (False, "", "", rclexecm.RclExecM.eofnow)
+        return self.extractone(tarinfo)
+
+# Main program: create protocol handler and extractor and run them
+proto = rclexecm.RclExecM()
+extract = WarExtractor(proto)
+rclexecm.main(proto, extract)
diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf
index ee108a02..1c304ccc 100644
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@@ -71,12 +71,13 @@ application/x-flac = execm rclaudio
 application/x-gnuinfo = execm rclinfo
 application/x-kword = exec rclkwd
 application/x-lyx = exec rcllyx
+application/x-mimehtml = internal message/rfc822
 application/x-perl = internal text/plain
 application/x-scribus = exec rclscribus
 application/x-shellscript = internal text/plain
 application/x-tex = exec rcltex
-text/x-tex = exec rcltex
 application/x-chm = execm rclchm
+application/x-webarchive = execm rclwar
 application/zip = execm rclzip;charset=default
 audio/mpeg = execm rclaudio
 audio/x-karaoke = execm rclkar
@@ -104,6 +105,7 @@ text/x-purple-log = exec rclpurple
 text/x-purple-html-log = internal text/html
 text/x-python = exec rclpython
 text/x-shellscript = internal text/plain
+text/x-tex = exec rcltex
 
 ## #############################################
 # Icons to be used in the result list if required by gui config
diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap
index a2341dde..10a34a85 100644
--- a/src/sampleconf/mimemap
+++ b/src/sampleconf/mimemap
@@ -112,6 +112,10 @@
 
 .fb2 = text/x-fictionbook
 
+.war = application/x-webarchive
+.mht = application/x-mimehtml
+.mhtml = application/x-mimehtml
+
 # A list of suffixes (name endings) that we don't want to touch at all.
 # Having these explicitely listed speeds things up a bit by avoiding
 # unneeded decompression or 'file' calls. File names still get indexed if