From 2cd0171ce6c47acb2d0a5a199fd37b406c5c098f Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 23 Oct 2009 16:45:56 +0000 Subject: [PATCH] comments --- src/filters/rclchm | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/filters/rclchm b/src/filters/rclchm index a30ea113..7dc3a7df 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -6,12 +6,24 @@ from chm import chm,chmlib from HTMLParser import HTMLParser class ChmTopicsParser(HTMLParser): + """Use HTMLParser to parse the chm's Topic file which is basically + a listing of internal nodes (html files mostly). Build a list of + all nodes (self.contents), which will then be used to walk and index + the chm. + Most nodes in the Topic file look like the following: +
  <LI> <OBJECT type="text/sitemap"> + <param name="Name" value="Global Module Index"> + <param name="Local" value="modindex.html"> + </OBJECT> + Maybe we should filter out non "text/sitemap" Objects, and maybe there are + things of interest whose name is not Local, but for now, we just take + all values for parameters named "Local", and this seems to work ok. + """ def __init__(self): HTMLParser.__init__(self) self.contents = [] def handle_starttag(self, tag, attrs): - # print "Encountered the beginning of a %s tag" % tag # If this is a param tag with name Local, we're interested in # the value which lists an internal file. Discard those with # # in them (references inside files) @@ -25,41 +37,44 @@ class ChmTopicsParser(HTMLParser): if name == 'Local': if value.find("#") == -1: self.contents.append(value) - #print "nm: %s val %s"%(nm,uval) - - def handle_endtag(self, tag): - #print "Encountered the end of a %s tag" % tag - return None + class rclCHM: + """RclExecM slave worker for extracting all files from a Microsoft chm (.chm) + file. We first extract the list of internal nodes, and then return them + one by one. The ipath is the node path""" def __init__(self, em): - self.filename = "" self.chm = chm.CHMFile() self.tp = ChmTopicsParser() self.currentindex = 0 self.em = em + def extractone(self, path): - self.em.rclog("extractone: [%s]"%(path)) + """Extract one path-named internal file from the chm file""" + #self.em.rclog("extractone: [%s]"%(path)) eof = (self.currentindex >= len(self.tp.contents) -1) res, ui = self.chm.ResolveObject("/" + path) #self.em.rclog("extract: ResolveO: %d [%s]" % (res, ui)) if res != chmlib.CHM_RESOLVE_SUCCESS: return (False, "", path, eof) - # Retrieve object returns len,value + # RetrieveObject() returns len,value res, doc = self.chm.RetrieveObject(ui) - # self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc)) + #self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc)) if res > 0: return (True, doc, path, eof) return (False, "", path, eof) def openfile(self, params): - self.filename = params["filename:"] - self.chm.LoadCHM(self.filename) + """Open the 
chm file and build the contents list by extracting and + parsing the Topics object""" + self.chm.LoadCHM(params["filename:"]) self.chm.GetArchiveInfo() self.topics = self.chm.GetTopicsTree() if self.topics == None: return False + #self.em.rclog(self.topics) + # Parse Topics file and extract list of internal nodes self.tp.feed(self.topics) self.tp.close() return True