xls filter: catch HTML files disguised as XLS
This commit is contained in:
parent
110ee2d809
commit
92bb5bfc43
@ -46,30 +46,43 @@ class Executor:
|
||||
def runCmd(self, cmd, filename, postproc, opt):
|
||||
''' Substitute parameters and execute command, process output
|
||||
with the specific postprocessor and return the complete text.
|
||||
We expect cmd as a list of command name + arguments'''
|
||||
We expect cmd as a list of command name + arguments, except that, for
|
||||
the special value "cat", we just read the file'''
|
||||
|
||||
try:
|
||||
fullcmd = cmd + [filename]
|
||||
proc = subprocess.Popen(fullcmd,
|
||||
stdout = subprocess.PIPE)
|
||||
stdout = proc.stdout
|
||||
except subprocess.CalledProcessError as err:
|
||||
self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
|
||||
return (False, "")
|
||||
except OSError as err:
|
||||
self.em.rclog("extractone: Popen(%s) OS error: %s" % (fullcmd, err))
|
||||
return (False, "")
|
||||
|
||||
for line in stdout:
|
||||
postproc.takeLine(line.strip())
|
||||
|
||||
proc.wait()
|
||||
if (opt & self.opt_ignxval) == 0 and proc.returncode:
|
||||
self.em.rclog("extractone: [%s] returncode %d" % \
|
||||
(filename, proc.returncode))
|
||||
return False, postproc.wrapData()
|
||||
else:
|
||||
if cmd == "cat":
|
||||
try:
|
||||
data = open(filename, 'rb').read()
|
||||
ok = True
|
||||
except Exception as err:
|
||||
self.em.rclog("runCmd: error reading %s: %s"%(filename, err))
|
||||
return(False, "")
|
||||
for line in data.split('\n'):
|
||||
postproc.takeLine(line)
|
||||
return True, postproc.wrapData()
|
||||
else:
|
||||
try:
|
||||
fullcmd = cmd + [filename]
|
||||
proc = subprocess.Popen(fullcmd,
|
||||
stdout = subprocess.PIPE)
|
||||
stdout = proc.stdout
|
||||
except subprocess.CalledProcessError as err:
|
||||
self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
|
||||
return (False, "")
|
||||
except OSError as err:
|
||||
self.em.rclog("extractone: Popen(%s) OS error: %s" %
|
||||
(fullcmd, err))
|
||||
return (False, "")
|
||||
|
||||
for line in stdout:
|
||||
postproc.takeLine(line.strip())
|
||||
|
||||
proc.wait()
|
||||
if (opt & self.opt_ignxval) == 0 and proc.returncode:
|
||||
self.em.rclog("extractone: [%s] returncode %d" % \
|
||||
(filename, proc.returncode))
|
||||
return False, postproc.wrapData()
|
||||
else:
|
||||
return True, postproc.wrapData()
|
||||
|
||||
def extractone(self, params):
|
||||
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
||||
|
||||
@ -13,13 +13,17 @@ import os
|
||||
import xml.sax
|
||||
|
||||
class XLSProcessData:
|
||||
def __init__(self, em):
|
||||
def __init__(self, em, ishtml = False):
|
||||
self.em = em
|
||||
self.out = ""
|
||||
self.gotdata = 0
|
||||
self.xmldata = ""
|
||||
self.ishtml = ishtml
|
||||
|
||||
def takeLine(self, line):
|
||||
if self.ishtml:
|
||||
self.out += line + "\n"
|
||||
return
|
||||
if not self.gotdata:
|
||||
self.out += '''<html><head>''' + \
|
||||
'''<meta http-equiv="Content-Type" ''' + \
|
||||
@ -29,6 +33,8 @@ class XLSProcessData:
|
||||
self.xmldata += line
|
||||
|
||||
def wrapData(self):
|
||||
if self.ishtml:
|
||||
return self.out
|
||||
handler = xlsxmltocsv.XlsXmlHandler()
|
||||
data = xml.sax.parseString(self.xmldata, handler)
|
||||
self.out += self.em.htmlescape(handler.output)
|
||||
@ -47,6 +53,14 @@ class XLSFilter:
|
||||
if self.ntry:
|
||||
return ([], None)
|
||||
self.ntry = 1
|
||||
# Some HTML files masquerade as XLS
|
||||
try:
|
||||
data = open(fn, 'rb').read(512)
|
||||
if data.find('html') != -1 or data.find('HTML') != -1:
|
||||
return ("cat", XLSProcessData(self.em, True))
|
||||
except Exception as err:
|
||||
self.em.rclog("Error reading %s:%s" % (fn, str(err)))
|
||||
pass
|
||||
cmd = rclexecm.which("xls-dump.py")
|
||||
if cmd:
|
||||
# xls-dump.py often exits 1 with valid data. Ignore exit value
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user