This commit is contained in:
dockes 2009-10-24 06:37:00 +00:00
parent 96855e3aea
commit 63ac7f6458
3 changed files with 62 additions and 19 deletions

View File

@ -1,60 +1,98 @@
#!/usr/bin/env python
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
import sys
import os
import rclexecm
from chm import chm,chmlib
from HTMLParser import HTMLParser
class ChmTopicsParser(HTMLParser):
"""Use HTMLParser to parse the chm's Topic file which is basically
"""Parse the chm's Topic file which is basically
a listing of internal nodes (html files mostly). Build a list of
all nodes (self.contents), which will then be used to walk and index
the chm.
Most nodes in the Topic file look like the following:
<LI> <OBJECT type="text/sitemap">
<param name="Name" value="Global Module Index">
<param name="Local" value="modindex.html">
</OBJECT>
Maybe we should filter out non "text/sitemap" Objects, and maybe there are
things of interest whose name is not Local, but for now, we just take
all values for parameters named "Local", and this seems to work ok.
all values for parameters named "Local" (with some filtering/massaging),
until proven wrong
"""
def __init__(self):
HTMLParser.__init__(self)
self.contents = []
def handle_starttag(self, tag, attrs):
#print >> sys.stderr, "Encountered the beginning of a %s tag" % tag
# If this is a param tag with name Local, we're interested in
# the value which lists an internal file. Discard those with #
# the value which lists a file ref. Discard those with #
# in them (references inside files)
if tag == 'param':
name = ''
for (nm,val) in attrs:
if nm == 'name':
name = val
if nm == 'value':
value = val.encode('utf-8')
if name == 'Local':
if value.find("#") == -1:
self.contents.append(value)
# Sometimes refs look like Vendor:filename::path. In that case we keep
# only the path, and only if filename matches the chm file being indexed
if tag != 'param':
return
name = ''
value = ''
for (nm,val) in attrs:
if nm == 'name':
name = val
if nm == 'value':
value = val
if name != 'Local' or value == '':
return
localpath = ""
ll = value.split(":")
if len(ll) == 1:
localpath = value
elif len(ll) == 4 and ll[-1] and ll[-3]:
#print >>sys.stderr, "File: %s" % ll[-3]
if ll[-3] == self.fname:
localpath = ll[-1]
else:
#print >> sys.stderr, "SKIPPING %s" % ll[-3]
pass
if len(localpath) != 0 and localpath.find("#") == -1:
if localpath[0] != '/':
localpath = "/" + localpath
self.contents.append(localpath)
def reset(self):
self.contents = []
self.fname = ""
HTMLParser.reset(self)
def setname(self, name):
self.fname = name
class rclCHM:
"""RclExecM slave worker for extracting all files from an Msoft chm (.ics)
"""RclExecM slave worker for extracting all files from an Msoft chm
file. We first extract the list of internal nodes, and then return them
one by one. The ipath is the node path"""
def __init__(self, em):
self.chm = chm.CHMFile()
self.tp = ChmTopicsParser()
self.currentindex = 0
self.em = em
def extractone(self, path):
"""Extract one path-named internal file from the chm file"""
#self.em.rclog("extractone: [%s]"%(path))
eof = (self.currentindex >= len(self.tp.contents) -1)
res, ui = self.chm.ResolveObject("/" + path)
res, ui = self.chm.ResolveObject(path)
#self.em.rclog("extract: ResolveO: %d [%s]" % (res, ui))
if res != chmlib.CHM_RESOLVE_SUCCESS:
return (False, "", path, eof)
@ -68,15 +106,21 @@ class rclCHM:
def openfile(self, params):
"""Open the chm file and build the contents list by extracting and
parsing the Topics object"""
self.chm.LoadCHM(params["filename:"])
self.currentindex = 0
self.tp.reset()
filename = params["filename:"]
self.chm.LoadCHM(filename)
self.chm.GetArchiveInfo()
self.topics = self.chm.GetTopicsTree()
if self.topics == None:
return False
#self.em.rclog(self.topics)
# Parse Topics file and extract list of internal nodes
self.tp.setname(os.path.basename(filename))
self.tp.feed(self.topics)
self.tp.close()
#self.em.rclog("Contents size %d" % len(self.tp.contents))
return True
def getipath(self, params):

View File

@ -3,10 +3,10 @@
import rclexecm
from icalendar import Calendar, Event
class IcalExtractor:
def __init__(self, em):
self.file = ""
self.contents = []
self.em = em
em.setmimetype("text/plain")

View File

@ -2,7 +2,6 @@
# Zip file filter for Recoll
import os
import rclexecm
from zipfile import ZipFile, error