xls filter: catch HTML files disguised as XLS

This commit is contained in:
Jean-Francois Dockes 2016-02-26 09:35:23 +01:00
parent 110ee2d809
commit 92bb5bfc43
2 changed files with 50 additions and 23 deletions

View File

@ -46,30 +46,43 @@ class Executor:
def runCmd(self, cmd, filename, postproc, opt):
    '''Run an extraction command on filename and post-process its output.

    cmd is normally a list holding the command name and its arguments;
    filename is appended to it and the result is executed. For the
    special value "cat", no command is run: the file is read directly.

    Every line obtained is passed to postproc.takeLine(), and the value
    of postproc.wrapData() is returned as the text.

    Returns an (ok, text) pair. ok is False when the file cannot be read,
    when the command cannot be started, or when the command exits with a
    non-zero status and the self.opt_ignxval bit is not set in opt.
    '''
    if cmd == "cat":
        try:
            # Context manager: the descriptor is released even if
            # read() fails.
            with open(filename, 'rb') as f:
                data = f.read()
        except Exception as err:
            # Deliberately broad: any read failure is logged and
            # reported as an extraction failure.
            self.em.rclog("runCmd: error reading %s: %s"%(filename, err))
            return (False, "")
        # The file was read in binary mode, so split on a bytes
        # separator (a str separator raises TypeError on Python 3).
        for line in data.split(b'\n'):
            postproc.takeLine(line)
        return True, postproc.wrapData()

    # Built outside the try block so that the handlers below can always
    # refer to it.
    fullcmd = cmd + [filename]
    try:
        proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE)
    except subprocess.CalledProcessError as err:
        self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
        return (False, "")
    except OSError as err:
        self.em.rclog("extractone: Popen(%s) OS error: %s" %
                      (fullcmd, err))
        return (False, "")

    for line in proc.stdout:
        postproc.takeLine(line.strip())
    proc.wait()
    # The pipe has been read to EOF: release it explicitly.
    proc.stdout.close()

    if (opt & self.opt_ignxval) == 0 and proc.returncode:
        self.em.rclog("extractone: [%s] returncode %d" %
                      (filename, proc.returncode))
        return False, postproc.wrapData()
    return True, postproc.wrapData()
def extractone(self, params):
#self.em.rclog("extractone %s %s" % (params["filename:"], \

View File

@ -13,13 +13,17 @@ import os
import xml.sax
class XLSProcessData:
def __init__(self, em):
def __init__(self, em, ishtml = False):
self.em = em
self.out = ""
self.gotdata = 0
self.xmldata = ""
self.ishtml = ishtml
def takeLine(self, line):
if self.ishtml:
self.out += line + "\n"
return
if not self.gotdata:
self.out += '''<html><head>''' + \
'''<meta http-equiv="Content-Type" ''' + \
@ -29,6 +33,8 @@ class XLSProcessData:
self.xmldata += line
def wrapData(self):
if self.ishtml:
return self.out
handler = xlsxmltocsv.XlsXmlHandler()
data = xml.sax.parseString(self.xmldata, handler)
self.out += self.em.htmlescape(handler.output)
@ -47,6 +53,14 @@ class XLSFilter:
if self.ntry:
return ([], None)
self.ntry = 1
# Some HTML files masquerade as XLS
try:
data = open(fn, 'rb').read(512)
if data.find('html') != -1 or data.find('HTML') != -1:
return ("cat", XLSProcessData(self.em, True))
except Exception as err:
self.em.rclog("Error reading %s:%s" % (fn, str(err)))
pass
cmd = rclexecm.which("xls-dump.py")
if cmd:
# xls-dump.py often exits 1 with valid data. Ignore exit value