#!/usr/bin/env python2 # WAR web archive filter for recoll. War file are gzipped tar files from __future__ import print_function import rclexecm import tarfile class WarExtractor: def __init__(self, em): self.em = em def extractone(self, tarinfo): docdata = "" try: member = self.tar.extractfile(tarinfo) docdata = member.read() ok = True except Exception as err: self.em.rclog("extractone: failed: [%s]" % err) ok = False return (ok, docdata, tarinfo.name, rclexecm.RclExecM.noteof) ###### File type handler api, used by rclexecm ----------> def openfile(self, params): self.currentindex = -1 try: self.tar = tarfile.open(params["filename:"]) return True except Exception as err: self.em.rclog(str(err)) return False def getipath(self, params): ipath = params["ipath:"] try: tarinfo = self.tar.getmember(ipath) except Exception as err: self.em.rclog(str(err)) return (False, "", ipath, rclexecm.RclExecM.noteof) return self.extractone(tarinfo) def getnext(self, params): if self.currentindex == -1: # Return "self" doc self.currentindex = 0 return (True, "", "", rclexecm.RclExecM.noteof) tarinfo = self.tar.next() if tarinfo is None: #self.em.rclog("getnext: EOF hit") return (False, "", "", rclexecm.RclExecM.eofnow) else: ret = self.extractone(tarinfo) return ret # Main program: create protocol handler and extractor and run them proto = rclexecm.RclExecM() extract = WarExtractor(proto) rclexecm.main(proto, extract)