xls filter: catch HTML files disguised as XLS

This commit is contained in:
Jean-Francois Dockes 2016-02-26 09:35:23 +01:00
parent 110ee2d809
commit 92bb5bfc43
2 changed files with 50 additions and 23 deletions

View File

@ -46,30 +46,43 @@ class Executor:
def runCmd(self, cmd, filename, postproc, opt):
    '''Run an extraction command on a file and post-process its output.

    cmd is a list holding the command name and its arguments; filename
    is appended to it before execution. The special value "cat" means
    that no command is run and the file contents are read directly.

    Every line is handed to postproc.takeLine(): unchanged for "cat",
    stripped of surrounding whitespace for a real command.

    Returns an (ok, data) pair. data is postproc.wrapData(), or "" when
    the file could not be read or the command could not be started.
    ok is False in these two cases, and also when the command exits with
    a non-zero status while the self.opt_ignxval bit is not set in opt.'''
    if cmd == "cat":
        try:
            # Context manager: the file is closed even if read() fails.
            with open(filename, 'rb') as f:
                data = f.read()
        except Exception as err:
            self.em.rclog("runCmd: error reading %s: %s" % (filename, err))
            return (False, "")
        # The data was read in binary mode, so split on a bytes
        # separator: this keeps working where bytes and str differ.
        for line in data.split(b'\n'):
            postproc.takeLine(line)
        return True, postproc.wrapData()

    # Built outside the try block so that the error messages below can
    # always refer to it.
    fullcmd = cmd + [filename]
    try:
        proc = subprocess.Popen(fullcmd, stdout=subprocess.PIPE)
    except subprocess.CalledProcessError as err:
        self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
        return (False, "")
    except OSError as err:
        self.em.rclog("extractone: Popen(%s) OS error: %s" %
                      (fullcmd, err))
        return (False, "")

    try:
        for line in proc.stdout:
            postproc.takeLine(line.strip())
    finally:
        # Release the pipe and reap the child even if the postprocessor
        # raised while consuming the output.
        proc.stdout.close()
        proc.wait()

    if (opt & self.opt_ignxval) == 0 and proc.returncode:
        self.em.rclog("extractone: [%s] returncode %d" %
                      (filename, proc.returncode))
        return False, postproc.wrapData()
    return True, postproc.wrapData()
def extractone(self, params): def extractone(self, params):
#self.em.rclog("extractone %s %s" % (params["filename:"], \ #self.em.rclog("extractone %s %s" % (params["filename:"], \

View File

@ -13,13 +13,17 @@ import os
import xml.sax import xml.sax
class XLSProcessData: class XLSProcessData:
def __init__(self, em): def __init__(self, em, ishtml = False):
self.em = em self.em = em
self.out = "" self.out = ""
self.gotdata = 0 self.gotdata = 0
self.xmldata = "" self.xmldata = ""
self.ishtml = ishtml
def takeLine(self, line): def takeLine(self, line):
if self.ishtml:
self.out += line + "\n"
return
if not self.gotdata: if not self.gotdata:
self.out += '''<html><head>''' + \ self.out += '''<html><head>''' + \
'''<meta http-equiv="Content-Type" ''' + \ '''<meta http-equiv="Content-Type" ''' + \
@ -29,6 +33,8 @@ class XLSProcessData:
self.xmldata += line self.xmldata += line
def wrapData(self): def wrapData(self):
if self.ishtml:
return self.out
handler = xlsxmltocsv.XlsXmlHandler() handler = xlsxmltocsv.XlsXmlHandler()
data = xml.sax.parseString(self.xmldata, handler) data = xml.sax.parseString(self.xmldata, handler)
self.out += self.em.htmlescape(handler.output) self.out += self.em.htmlescape(handler.output)
@ -47,6 +53,14 @@ class XLSFilter:
if self.ntry: if self.ntry:
return ([], None) return ([], None)
self.ntry = 1 self.ntry = 1
# Some HTML files masquerade as XLS
try:
data = open(fn, 'rb').read(512)
if data.find('html') != -1 or data.find('HTML') != -1:
return ("cat", XLSProcessData(self.em, True))
except Exception as err:
self.em.rclog("Error reading %s:%s" % (fn, str(err)))
pass
cmd = rclexecm.which("xls-dump.py") cmd = rclexecm.which("xls-dump.py")
if cmd: if cmd:
# xls-dump.py often exits 1 with valid data. Ignore exit value # xls-dump.py often exits 1 with valid data. Ignore exit value