cleanup
This commit is contained in:
parent
96855e3aea
commit
63ac7f6458
@ -1,60 +1,98 @@
|
||||
#!/usr/bin/env python
|
||||
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)
|
||||
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import rclexecm
|
||||
from chm import chm,chmlib
|
||||
from HTMLParser import HTMLParser
|
||||
|
||||
class ChmTopicsParser(HTMLParser):
|
||||
"""Use HTMLParser to parse the chm's Topic file which is basically
|
||||
"""Parse the chm's Topic file which is basically
|
||||
a listing of internal nodes (html files mostly). Build a list of
|
||||
all nodes (self.contents), which will then be used to walk and index
|
||||
the chm.
|
||||
|
||||
Most nodes in the Topic file look like the following:
|
||||
<LI> <OBJECT type="text/sitemap">
|
||||
<param name="Name" value="Global Module Index">
|
||||
<param name="Local" value="modindex.html">
|
||||
</OBJECT>
|
||||
|
||||
Maybe we should filter out non "text/sitemap" Objects, and maybe there are
|
||||
things of interest whose name is not Local, but for now, we just take
|
||||
all values for parameters named "Local", and this seems to work ok.
|
||||
all values for parameters named "Local" (with some filtering/massaging),
|
||||
until proven wrong
|
||||
"""
|
||||
def __init__(self):
|
||||
HTMLParser.__init__(self)
|
||||
self.contents = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
#print >> sys.stderr, "Encountered the beginning of a %s tag" % tag
|
||||
# If this is a param tag with name Local, we're interested in
|
||||
# the value which lists an internal file. Discard those with #
|
||||
# the value which lists a file ref. Discard those with #
|
||||
# in them (references inside files)
|
||||
if tag == 'param':
|
||||
name = ''
|
||||
for (nm,val) in attrs:
|
||||
if nm == 'name':
|
||||
name = val
|
||||
if nm == 'value':
|
||||
value = val.encode('utf-8')
|
||||
if name == 'Local':
|
||||
if value.find("#") == -1:
|
||||
self.contents.append(value)
|
||||
# Sometimes it seems that refs are like Vendor:filename::path,
|
||||
# we only keep the path, and only if the file matches
|
||||
|
||||
if tag != 'param':
|
||||
return
|
||||
|
||||
name = ''
|
||||
value = ''
|
||||
for (nm,val) in attrs:
|
||||
if nm == 'name':
|
||||
name = val
|
||||
if nm == 'value':
|
||||
value = val
|
||||
|
||||
if name != 'Local' or value == '':
|
||||
return
|
||||
|
||||
localpath = ""
|
||||
ll = value.split(":")
|
||||
if len(ll) == 1:
|
||||
localpath = value
|
||||
elif len(ll) == 4 and ll[-1] and ll[-3]:
|
||||
#print >>sys.stderr, "File: %s" % ll[-3]
|
||||
if ll[-3] == self.fname:
|
||||
localpath = ll[-1]
|
||||
else:
|
||||
#print >> sys.stderr, "SKIPPING %s" % ll[-3]
|
||||
pass
|
||||
|
||||
if len(localpath) != 0 and localpath.find("#") == -1:
|
||||
if localpath[0] != '/':
|
||||
localpath = "/" + localpath
|
||||
self.contents.append(localpath)
|
||||
|
||||
def reset(self):
|
||||
self.contents = []
|
||||
self.fname = ""
|
||||
HTMLParser.reset(self)
|
||||
|
||||
def setname(self, name):
|
||||
self.fname = name
|
||||
|
||||
class rclCHM:
|
||||
"""RclExecM slave worker for extracting all files from an Msoft chm (.ics)
|
||||
"""RclExecM slave worker for extracting all files from an Msoft chm
|
||||
file. We first extract the list of internal nodes, and then return them
|
||||
one by one. The ipath is the node path"""
|
||||
|
||||
def __init__(self, em):
|
||||
self.chm = chm.CHMFile()
|
||||
self.tp = ChmTopicsParser()
|
||||
self.currentindex = 0
|
||||
self.em = em
|
||||
|
||||
|
||||
def extractone(self, path):
|
||||
"""Extract one path-named internal file from the chm file"""
|
||||
|
||||
#self.em.rclog("extractone: [%s]"%(path))
|
||||
eof = (self.currentindex >= len(self.tp.contents) -1)
|
||||
res, ui = self.chm.ResolveObject("/" + path)
|
||||
res, ui = self.chm.ResolveObject(path)
|
||||
#self.em.rclog("extract: ResolveO: %d [%s]" % (res, ui))
|
||||
if res != chmlib.CHM_RESOLVE_SUCCESS:
|
||||
return (False, "", path, eof)
|
||||
@ -68,15 +106,21 @@ class rclCHM:
|
||||
def openfile(self, params):
|
||||
"""Open the chm file and build the contents list by extracting and
|
||||
parsing the Topics object"""
|
||||
self.chm.LoadCHM(params["filename:"])
|
||||
|
||||
self.currentindex = 0
|
||||
self.tp.reset()
|
||||
filename = params["filename:"]
|
||||
self.chm.LoadCHM(filename)
|
||||
self.chm.GetArchiveInfo()
|
||||
self.topics = self.chm.GetTopicsTree()
|
||||
if self.topics == None:
|
||||
return False
|
||||
#self.em.rclog(self.topics)
|
||||
# Parse Topics file and extract list of internal nodes
|
||||
self.tp.setname(os.path.basename(filename))
|
||||
self.tp.feed(self.topics)
|
||||
self.tp.close()
|
||||
#self.em.rclog("Contents size %d" % len(self.tp.contents))
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
|
||||
@ -3,10 +3,10 @@
|
||||
import rclexecm
|
||||
from icalendar import Calendar, Event
|
||||
|
||||
|
||||
class IcalExtractor:
|
||||
def __init__(self, em):
|
||||
self.file = ""
|
||||
self.contents = []
|
||||
self.em = em
|
||||
em.setmimetype("text/plain")
|
||||
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
# Zip file filter for Recoll
|
||||
|
||||
import os
|
||||
import rclexecm
|
||||
from zipfile import ZipFile, error
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user