rcl7z: use py7zr if available, rather than pylzma, which does not work on some archives

This commit is contained in:
Jean-Francois Dockes 2020-12-25 12:30:18 +01:00
parent 3479e7cd85
commit 53edd7b213

View File

@@ -4,18 +4,27 @@
# Thanks to Recoll user Martin Ziegler # Thanks to Recoll user Martin Ziegler
# This is a modified version of rclzip, with some help from rcltar # This is a modified version of rclzip, with some help from rcltar
# Python pylzma library required. See http://www.joachim-bauch.de/projects/pylzma/ #
# Normally using py7zr https://github.com/miurahr/py7zr
#
# Else, but it does not work on all archives, may use:
# Python pylzma library required. See http://www.joachim-bauch.de/projects/pylzma/
import sys import sys
import os import os
import fnmatch import fnmatch
import rclexecm import rclexecm
usingpy7zr = False
try: try:
from py7zlib import Archive7z from py7zr import SevenZipFile as Archive7z
usingpy7zr = True
except: except:
print("RECFILTERROR HELPERNOTFOUND python3:pylzma") try:
sys.exit(1); from py7zlib import Archive7z
except:
print("RECFILTERROR HELPERNOTFOUND python3:py7zr or python3:pylzma")
sys.exit(1);
try: try:
from recoll import rclconfig from recoll import rclconfig
@@ -41,14 +50,18 @@ class SevenZipExtractor:
def extractone(self, ipath): def extractone(self, ipath):
#self.em.rclog("extractone: [%s]" % ipath) #self.em.rclog("extractone: [%s]" % ipath)
docdata = b'' docdata = b''
ok = False
try: try:
docdata = self.sevenzip.getmember(ipath).read() if usingpy7zr:
docdata = self.sevenzdic[ipath].read()
else:
docdata = self.sevenzip.getmember(ipath).read()
ok = True ok = True
except Exception as err: except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err) self.em.rclog("extractone: failed: [%s]" % err)
ok = False
iseof = rclexecm.RclExecM.noteof iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.sevenzip.getnames()) -1: if self.currentindex >= len(self.names) -1:
iseof = rclexecm.RclExecM.eofnext iseof = rclexecm.RclExecM.eofnext
return (ok, docdata, rclexecm.makebytes(ipath), iseof) return (ok, docdata, rclexecm.makebytes(ipath), iseof)
@@ -72,6 +85,11 @@ class SevenZipExtractor:
try: try:
self.fp = open(filename, 'rb') self.fp = open(filename, 'rb')
self.sevenzip = Archive7z(self.fp) self.sevenzip = Archive7z(self.fp)
if usingpy7zr:
self.sevenzdic = self.sevenzip.readall()
self.names = [k[0] for k in self.sevenzdic.items()]
else:
self.names = self.sevenzip.getnames()
return True return True
except Exception as err: except Exception as err:
self.em.rclog("openfile: failed: [%s]" % err) self.em.rclog("openfile: failed: [%s]" % err)
@@ -94,40 +112,40 @@ class SevenZipExtractor:
# Return "self" doc # Return "self" doc
self.currentindex = 0 self.currentindex = 0
self.em.setmimetype('text/plain') self.em.setmimetype('text/plain')
if len(self.sevenzip.getnames()) == 0: if len(self.names) == 0:
self.closefile() self.closefile()
eof = rclexecm.RclExecM.eofnext eof = rclexecm.RclExecM.eofnext
else: else:
eof = rclexecm.RclExecM.noteof eof = rclexecm.RclExecM.noteof
return (True, "", "", eof) return (True, "", "", eof)
if self.currentindex >= len(self.sevenzip.getnames()): if self.currentindex >= len(self.names):
#self.em.rclog("getnext: EOF hit") #self.em.rclog("getnext: EOF hit")
self.closefile() self.closefile()
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
else:
entryname = self.sevenzip.getnames()[self.currentindex]
if hasrclconfig and len(self.skiplist) != 0: entryname = self.names[self.currentindex]
while self.currentindex < len(self.sevenzip.getnames()):
entryname = self.sevenzip.getnames()[self.currentindex] if hasrclconfig and len(self.skiplist) != 0:
for pat in self.skiplist: while self.currentindex < len(self.names):
if fnmatch.fnmatch(entryname, pat): entryname = self.names[self.currentindex]
entryname = None for pat in self.skiplist:
break if fnmatch.fnmatch(entryname, pat):
if entryname is not None: entryname = None
break break
self.currentindex += 1 if entryname is not None:
if entryname is None: break
self.closefile() self.currentindex += 1
return (False, "", "", rclexecm.RclExecM.eofnow) if entryname is None:
ret = self.extractone(entryname)
if ret[3] == rclexecm.RclExecM.eofnext or \
ret[3] == rclexecm.RclExecM.eofnow:
self.closefile() self.closefile()
self.currentindex += 1 return (False, "", "", rclexecm.RclExecM.eofnow)
return ret
ret = self.extractone(entryname)
if ret[3] == rclexecm.RclExecM.eofnext or \
ret[3] == rclexecm.RclExecM.eofnow:
self.closefile()
self.currentindex += 1
return ret
# Main program: create protocol handler and extractor and run them # Main program: create protocol handler and extractor and run them
proto = rclexecm.RclExecM() proto = rclexecm.RclExecM()