xls filter: catch HTML files disguised as XLS

This commit is contained in:
Jean-Francois Dockes 2016-02-26 09:35:23 +01:00
parent 110ee2d809
commit 92bb5bfc43
2 changed files with 50 additions and 23 deletions

View File

@ -46,8 +46,20 @@ class Executor:
def runCmd(self, cmd, filename, postproc, opt): def runCmd(self, cmd, filename, postproc, opt):
''' Substitute parameters and execute command, process output ''' Substitute parameters and execute command, process output
with the specific postprocessor and return the complete text. with the specific postprocessor and return the complete text.
We expect cmd as a list of command name + arguments''' We expect cmd as a list of command name + arguments, except that, for
the special value "cat", we just read the file'''
if cmd == "cat":
try:
data = open(filename, 'rb').read()
ok = True
except Exception as err:
self.em.rclog("runCmd: error reading %s: %s"%(filename, err))
return(False, "")
for line in data.split('\n'):
postproc.takeLine(line)
return True, postproc.wrapData()
else:
try: try:
fullcmd = cmd + [filename] fullcmd = cmd + [filename]
proc = subprocess.Popen(fullcmd, proc = subprocess.Popen(fullcmd,
@ -57,7 +69,8 @@ class Executor:
self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err)) self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
return (False, "") return (False, "")
except OSError as err: except OSError as err:
self.em.rclog("extractone: Popen(%s) OS error: %s" % (fullcmd, err)) self.em.rclog("extractone: Popen(%s) OS error: %s" %
(fullcmd, err))
return (False, "") return (False, "")
for line in stdout: for line in stdout:

View File

@ -13,13 +13,17 @@ import os
import xml.sax import xml.sax
class XLSProcessData: class XLSProcessData:
def __init__(self, em): def __init__(self, em, ishtml = False):
self.em = em self.em = em
self.out = "" self.out = ""
self.gotdata = 0 self.gotdata = 0
self.xmldata = "" self.xmldata = ""
self.ishtml = ishtml
def takeLine(self, line): def takeLine(self, line):
if self.ishtml:
self.out += line + "\n"
return
if not self.gotdata: if not self.gotdata:
self.out += '''<html><head>''' + \ self.out += '''<html><head>''' + \
'''<meta http-equiv="Content-Type" ''' + \ '''<meta http-equiv="Content-Type" ''' + \
@ -29,6 +33,8 @@ class XLSProcessData:
self.xmldata += line self.xmldata += line
def wrapData(self): def wrapData(self):
if self.ishtml:
return self.out
handler = xlsxmltocsv.XlsXmlHandler() handler = xlsxmltocsv.XlsXmlHandler()
data = xml.sax.parseString(self.xmldata, handler) data = xml.sax.parseString(self.xmldata, handler)
self.out += self.em.htmlescape(handler.output) self.out += self.em.htmlescape(handler.output)
@ -47,6 +53,14 @@ class XLSFilter:
if self.ntry: if self.ntry:
return ([], None) return ([], None)
self.ntry = 1 self.ntry = 1
# Some HTML files masquerade as XLS
try:
data = open(fn, 'rb').read(512)
if data.find('html') != -1 or data.find('HTML') != -1:
return ("cat", XLSProcessData(self.em, True))
except Exception as err:
self.em.rclog("Error reading %s:%s" % (fn, str(err)))
pass
cmd = rclexecm.which("xls-dump.py") cmd = rclexecm.which("xls-dump.py")
if cmd: if cmd:
# xls-dump.py often exits 1 with valid data. Ignore exit value # xls-dump.py often exits 1 with valid data. Ignore exit value