xls filter: catch HTML files masquerading as XLS
This commit is contained in:
parent
110ee2d809
commit
92bb5bfc43
@ -46,30 +46,43 @@ class Executor:
|
|||||||
def runCmd(self, cmd, filename, postproc, opt):
    ''' Feed the content produced for filename to postproc, line by line,
    and return the wrapped result.

    cmd is normally a list holding the command name and its arguments;
    filename is appended to it and the command output is read from a
    pipe. For the special value "cat", no command is run: the file is
    read directly.

    postproc must provide takeLine(line) and wrapData(). Lines are passed
    as read from the file or pipe (binary mode).

    opt is a bit mask: unless self.opt_ignxval is set in it, a non-zero
    command exit status makes the call fail.

    Returns a (status, data) pair. data is postproc.wrapData(), or ""
    when the file could not be read or the command could not be started.
    '''

    if cmd == "cat":
        try:
            # Context manager so that the handle is released even if
            # read() fails.
            with open(filename, 'rb') as inp:
                data = inp.read()
        except Exception as err:
            self.em.rclog("runCmd: error reading %s: %s"%(filename, err))
            return (False, "")
        # The data was read in binary mode: split on a bytes separator.
        # This is the same as '\n' with Python 2, and avoids a TypeError
        # with Python 3.
        for line in data.split(b'\n'):
            postproc.takeLine(line)
        return True, postproc.wrapData()

    # Built outside the try block so that it is always defined when the
    # error handlers format their messages.
    fullcmd = cmd + [filename]
    try:
        proc = subprocess.Popen(fullcmd,
                                stdout = subprocess.PIPE)
    except subprocess.CalledProcessError as err:
        # Kept from the original code for safety.
        self.em.rclog("extractone: Popen(%s) error: %s" % (fullcmd, err))
        return (False, "")
    except OSError as err:
        self.em.rclog("extractone: Popen(%s) OS error: %s" %
                      (fullcmd, err))
        return (False, "")

    for line in proc.stdout:
        postproc.takeLine(line.strip())

    proc.wait()
    # A non-zero exit status is an error unless the caller asked to
    # ignore it. The collected data is returned in both cases.
    if (opt & self.opt_ignxval) == 0 and proc.returncode:
        self.em.rclog("extractone: [%s] returncode %d" % \
                      (filename, proc.returncode))
        return False, postproc.wrapData()
    return True, postproc.wrapData()
|
||||||
|
|
||||||
def extractone(self, params):
|
def extractone(self, params):
|
||||||
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
||||||
|
|||||||
@ -13,13 +13,17 @@ import os
|
|||||||
import xml.sax
|
import xml.sax
|
||||||
|
|
||||||
class XLSProcessData:
|
class XLSProcessData:
|
||||||
def __init__(self, em):
|
def __init__(self, em, ishtml = False):
|
||||||
self.em = em
|
self.em = em
|
||||||
self.out = ""
|
self.out = ""
|
||||||
self.gotdata = 0
|
self.gotdata = 0
|
||||||
self.xmldata = ""
|
self.xmldata = ""
|
||||||
|
self.ishtml = ishtml
|
||||||
|
|
||||||
def takeLine(self, line):
|
def takeLine(self, line):
|
||||||
|
if self.ishtml:
|
||||||
|
self.out += line + "\n"
|
||||||
|
return
|
||||||
if not self.gotdata:
|
if not self.gotdata:
|
||||||
self.out += '''<html><head>''' + \
|
self.out += '''<html><head>''' + \
|
||||||
'''<meta http-equiv="Content-Type" ''' + \
|
'''<meta http-equiv="Content-Type" ''' + \
|
||||||
@ -29,6 +33,8 @@ class XLSProcessData:
|
|||||||
self.xmldata += line
|
self.xmldata += line
|
||||||
|
|
||||||
def wrapData(self):
|
def wrapData(self):
|
||||||
|
if self.ishtml:
|
||||||
|
return self.out
|
||||||
handler = xlsxmltocsv.XlsXmlHandler()
|
handler = xlsxmltocsv.XlsXmlHandler()
|
||||||
data = xml.sax.parseString(self.xmldata, handler)
|
data = xml.sax.parseString(self.xmldata, handler)
|
||||||
self.out += self.em.htmlescape(handler.output)
|
self.out += self.em.htmlescape(handler.output)
|
||||||
@ -47,6 +53,14 @@ class XLSFilter:
|
|||||||
if self.ntry:
|
if self.ntry:
|
||||||
return ([], None)
|
return ([], None)
|
||||||
self.ntry = 1
|
self.ntry = 1
|
||||||
|
# Some HTML files masquerade as XLS
|
||||||
|
try:
|
||||||
|
data = open(fn, 'rb').read(512)
|
||||||
|
if data.find('html') != -1 or data.find('HTML') != -1:
|
||||||
|
return ("cat", XLSProcessData(self.em, True))
|
||||||
|
except Exception as err:
|
||||||
|
self.em.rclog("Error reading %s:%s" % (fn, str(err)))
|
||||||
|
pass
|
||||||
cmd = rclexecm.which("xls-dump.py")
|
cmd = rclexecm.which("xls-dump.py")
|
||||||
if cmd:
|
if cmd:
|
||||||
# xls-dump.py often exits 1 with valid data. Ignore exit value
|
# xls-dump.py often exits 1 with valid data. Ignore exit value
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user