recoll/src/filters/rclrar
Jean-Francois Dockes ee1e84b2f3 comments
2020-12-25 17:35:08 +01:00

179 lines
6.6 KiB
Python
Executable File

#!/usr/bin/env python3
# Rar file filter for Recoll
# Adapted from the Zip archive filter by mroark.
# Copyright (C) 2004 J.F.Dockes + mroark for rar bits
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from __future__ import print_function
import sys
import rclexecm
import os
# We can use two different unrar python modules. Either python3-rarfile
# which is a wrapper over the unrar command line, or python3-unrar
# which is a ctypes wrapper of the unrar lib.
#
# Python3-rarfile is the one commonly packaged on linux.
#
# Python3-unrar needs the libunrar library, built from unrar source
# code (https://www.rarlab.com/rar_add.htm), which is not packaged on
# Debian or Fedora, probably because of licensing issues. It is
# packaged as libunrar5 on Ubuntu, and there are Fedora packages on
# RPM Fusion, and it's probably easy to build the Ubuntu package on
# Debian. Python-unrar works much better and is the right choice if
# the licensing is not an issue.
#
# The interfaces are similar, but python-unrar uses forward slashes
# in internal file paths while python-rarfile uses backslashes (ipaths
# are opaque anyway).
# Backend selection. using_unrar tells the extractor class which of the
# two modules was loaded: True for python3-unrar (module 'rarfile' from
# the 'unrar' package), False for python3-rarfile (class 'RarFile').
using_unrar = False
try:
    # Tried first: see the comments above about why it is preferred.
    from unrar import rarfile
    using_unrar = True
except Exception:
    # Broad catch on purpose: whatever goes wrong while loading
    # python3-unrar, we fall back to python3-rarfile.
    try:
        from rarfile import RarFile
    except Exception:
        # Neither module is usable: report the missing helper on stdout
        # and give up. 'except Exception' rather than a bare 'except' so
        # that KeyboardInterrupt/SystemExit are not mistaken for a
        # missing helper.
        print("RECFILTERROR HELPERNOTFOUND python3:rarfile/python3:unrar")
        sys.exit(1)
# Requires RarFile python module. Try "sudo pip install rarfile" or
# install it with the system package manager
#
# Also for many files, you will need the non-free version of unrar
# (https://www.rarlab.com/rar_add.htm). The unrar-free version fails
# with the message "Failed the read enough data"
#
# This is identical to rclzip except I did a search/replace from zip
# to rar, and changed this comment.
class RarExtractor:
    """Archive member extractor for RAR files, driven by rclexecm.

    The same code works with both supported backends; the module-level
    using_unrar flag says which one was imported (python3-unrar when
    True, python3-rarfile when False).

    Methods which produce a document return an (ok, data, ipath, eof)
    tuple, where eof is one of the rclexecm.RclExecM noteof / eofnext /
    eofnow values.
    """

    def __init__(self, em):
        """Store the protocol handler.

        em is used here for logging (rclog), for the member size limit
        (maxmembersize) and for setting the document MIME type
        (setmimetype).
        """
        self.currentindex = 0
        self.em = em
        # Archive object: set by openfile(), dropped by closefile().
        self.rar = None
        # File object opened by openfile() for the python3-rarfile
        # backend. We opened it, so we have to close it.
        self._fileobj = None
        # Member names of the current archive, fetched once on demand.
        self._names = None

    def _membernames(self):
        """Return the member name list of the open archive.

        The backend is only asked once per archive: calling namelist()
        for every use made walking an archive needlessly quadratic.
        """
        if self._names is None:
            self._names = self.rar.namelist()
        return self._names

    def extractone(self, ipath):
        """Extract the archive member named by ipath.

        Returns (ok, data, ipath, eof):
         - If the member lookup (getinfo) fails, the error is logged and
           (True, "", ipath, False) is returned: ok is True and the
           ipath is returned as it stands at that point.
         - Directories yield empty data and get the
           "application/x-fsdirectory" MIME type.
         - Members bigger than em.maxmembersize are logged and yield
           empty data with ok True.
         - If reading the member fails, the error is logged and ok is
           False.
        Except in the lookup failure case, the returned ipath goes
        through rclexecm.makebytes(), and eof is eofnext when
        currentindex is at or past the last member, else noteof.
        """
        #self.em.rclog("extractone: [%s]" % ipath)
        try:
            if using_unrar:
                # python3-unrar is given str member names
                if isinstance(ipath, bytes):
                    ipath = ipath.decode('UTF-8')
                rarinfo = self.rar.getinfo(ipath)
                # dll.hpp RHDF_DIRECTORY: 0x20
                isdir = ((rarinfo.flag_bits & 0x20) != 0)
            else:
                rarinfo = self.rar.getinfo(ipath)
                isdir = rarinfo.isdir()
        except Exception as err:
            self.em.rclog("extractone: using_unrar %d rar.getinfo failed: [%s]"
                          % (using_unrar,err))
            return (True, "", ipath, False)

        docdata = ""
        ok = True
        if isdir:
            self.em.setmimetype("application/x-fsdirectory")
        else:
            try:
                if rarinfo.file_size > self.em.maxmembersize:
                    # Too big: keep the entry but with empty data
                    self.em.rclog("extractone: entry %s size %d too big" %
                                  (ipath, rarinfo.file_size))
                else:
                    docdata = self.rar.read(ipath)
            except Exception as err:
                self.em.rclog("extractone: rar.read failed: [%s]" % err)
                ok = False

        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self._membernames()) - 1:
            iseof = rclexecm.RclExecM.eofnext
        return (ok, docdata, rclexecm.makebytes(ipath), iseof)

    def closefile(self):
        """Forget the current archive and release what we hold for it.

        Can be called any number of times, with or without an open
        archive.
        """
        self.rar = None
        self._names = None
        # Close the file object which openfile() may have opened for
        # the python3-rarfile backend, instead of leaving it to the
        # garbage collector.
        fileobj, self._fileobj = self._fileobj, None
        if fileobj is not None:
            try:
                fileobj.close()
            except Exception as err:
                self.em.rclog("RarFile: close failed: %s" % err)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Open the archive named by params["filename"] (bytes).

        Returns True on success. On failure, the error is logged and
        False is returned. currentindex is reset to -1, so that the
        next getnext() call returns the "self" document.
        """
        self.currentindex = -1
        # Drop any state left over from a previous archive (open file,
        # cached name list).
        self.closefile()
        try:
            if using_unrar:
                # There might be a way to avoid the decoding which is
                # wrong on Unix, but I'd have to dig further in the
                # lib than I wish to. This is used on Windows anyway,
                # where all Recoll paths are utf-8
                fn = params["filename"].decode("UTF-8")
                self.rar = rarfile.RarFile(fn, 'rb')
            else:
                # The previous versions passed the file name to
                # RarFile. But the py3 version of this wants an str as
                # input, which is wrong of course, as filenames are
                # binary. Circumvented by passing the open file
                f = open(params["filename"], 'rb')
                try:
                    self.rar = RarFile(f)
                except Exception:
                    # Not a usable archive: don't leak the open file
                    f.close()
                    raise
                self._fileobj = f
            return True
        except Exception as err:
            self.em.rclog("RarFile: %s"%err)
            return False

    def getipath(self, params):
        """Extract the member designated by params["ipath"].

        If the first extraction attempt reports failure, a second one
        is made with the ipath decoded from UTF-8. If the decoding or
        the second attempt raises, the result of the first attempt is
        returned, except that its ipath element is the decoded one if
        the decoding itself had succeeded.
        """
        ipath = params["ipath"]
        ok, data, ipath, eof = self.extractone(ipath)
        if ok:
            return (ok, data, ipath, eof)
        # Not found. Maybe we need to decode the path?
        try:
            ipath = ipath.decode("utf-8")
            return self.extractone(ipath)
        except Exception:
            return (ok, data, ipath, eof)

    def getnext(self, params):
        """Return the next document for the open archive.

        The first call after openfile() returns an empty text/plain
        "self" document for the archive. The following calls return the
        members in name list order. The archive is closed as soon as an
        eofnext or eofnow status is about to be returned.
        """
        if self.currentindex == -1:
            # Return "self" doc
            self.currentindex = 0
            self.em.setmimetype('text/plain')
            if not self._membernames():
                # Empty archive: nothing will follow
                self.closefile()
                eof = rclexecm.RclExecM.eofnext
            else:
                eof = rclexecm.RclExecM.noteof
            return (True, "", "", eof)

        names = self._membernames()
        if self.currentindex >= len(names):
            #self.em.rclog("getnext: EOF hit")
            self.closefile()
            return (False, "", "", rclexecm.RclExecM.eofnow)

        ret = self.extractone(names[self.currentindex])
        self.currentindex += 1
        if ret[3] in (rclexecm.RclExecM.eofnext, rclexecm.RclExecM.eofnow):
            self.closefile()
        return ret
# Main program: create protocol handler and extractor and run them.
# The extractor is given the protocol handler, and both objects are
# then handed to rclexecm.main(), which is defined in the rclexecm
# module, not in this file.
proto = rclexecm.RclExecM()
extract = RarExtractor(proto)
rclexecm.main(proto, extract)