chm filter: handle files lacking a topics node

This commit is contained in:
Jean-Francois Dockes 2011-12-17 16:41:45 +01:00
parent 5fa720f23d
commit 502f7e783e
3 changed files with 109 additions and 19 deletions

View File

@ -4,7 +4,10 @@ Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
import sys import sys
import os import os
import urlparse
import rclexecm import rclexecm
try: try:
from chm import chm,chmlib from chm import chm,chmlib
except: except:
@ -85,6 +88,71 @@ class ChmTopicsParser(HTMLParser):
def setname(self, name): def setname(self, name):
self.fname = name self.fname = name
def getfile(chmfile, path):
    """Extract the content of an internal file from a chm object.

    chmfile: object providing ResolveObject(path) and RetrieveObject(ui)
             (presumably a chm.CHMFile instance -- confirm against callers).
    path: internal path of the document inside the archive.

    Returns the document data, or "" if the path could not be resolved or
    the data could not be retrieved.
    """
    res, ui = chmfile.ResolveObject(path)
    if res != chmlib.CHM_RESOLVE_SUCCESS:
        # Resolution failures are not reported, only signalled by the
        # empty result.
        return ""
    res, doc = chmfile.RetrieveObject(ui)
    if not res:
        # Report on stderr so that the message can't get mixed with data
        # written to stdout.
        # NOTE(review): stdout is presumably the rclexecm communication
        # channel -- confirm.
        sys.stderr.write("RetrieveObject failed %s\n" % path)
        return ""
    return doc
class ChmWalker(HTMLParser):
    """Links tree walker.

    Recursively follows all the internal links found in the documents
    reachable from the top node given as input, and augments the contents
    list with the path of every document visited.
    """

    def __init__(self, chm, path, contents, chmname=None):
        """Set up a walker for the document at path.

        chm: chm object from which linked documents are read (it is handed
             to getfile()).
        path: internal path of the document which will be fed to this
              walker. Its normalized form is appended to contents.
        contents: list of the paths already visited. The same list is
                  shared by all the walkers created while following links.
        chmname: optional name of the chm file. When set, links of the form
                 MS-ITS::somefile.chm:/some/path are only followed if
                 somefile.chm matches it (case-insensitive comparison).
                 When not set, the file name part of such links is ignored.
        """
        HTMLParser.__init__(self)
        self.chm = chm
        self.contents = contents
        self.chmname = chmname
        self.path = os.path.normpath(path)
        self.dir = os.path.dirname(self.path)
        contents.append(self.path)

    def _internalpath(self, href):
        """Translate a link target into a normalized internal path.

        Returns "" if href does not designate a document inside the chm
        file (other URL scheme, other chm file, unexpected format).
        """
        res = urlparse.urlparse(href)
        if res.scheme and res.scheme.lower() != "ms-its":
            return ""

        lpath = res.path.split(':')
        if len(lpath) == 3:
            # MS-ITS::somefile.chm:/some/path/file.htm ?
            if self.chmname and lpath[1].lower() != self.chmname.lower():
                return ""
            path = lpath[2]
        elif len(lpath) == 1:
            path = lpath[0]
        else:
            path = ""
        if not path:
            return ""

        # Relative links are resolved against the directory of the
        # document being walked.
        if path[0] == "/":
            return os.path.normpath(path)
        return os.path.normpath(os.path.join(self.dir, path))

    def handle_starttag(self, tag, attrs):
        """HTMLParser callback: follow the link held by an <a> tag.

        The target document is fetched with getfile() and walked in turn,
        unless the link is not internal or its path is already listed in
        the contents.
        """
        if tag != 'a':
            return

        href = ''
        for (nm, val) in attrs:
            if nm == 'href':
                href = val
        # No href attribute, or one without a value (val is None then).
        if not href:
            return

        npath = self._internalpath(href)
        if not npath or npath in self.contents:
            return

        text = getfile(self.chm, npath)
        if not text:
            return
        try:
            newwalker = ChmWalker(self.chm, npath, self.contents,
                                  self.chmname)
            newwalker.feed(text)
        except Exception:
            # Best effort: a document which can't be fully parsed stays
            # in the contents list, but the rest of its links is skipped.
            pass
class rclCHM: class rclCHM:
"""RclExecM slave worker for extracting all files from an Msoft chm """RclExecM slave worker for extracting all files from an Msoft chm
file. We first extract the list of internal nodes, and then return them file. We first extract the list of internal nodes, and then return them
@ -123,17 +191,39 @@ class rclCHM:
self.currentindex = 0 self.currentindex = 0
self.tp.reset() self.tp.reset()
filename = params["filename:"] filename = params["filename:"]
self.chm.LoadCHM(filename) if not self.chm.LoadCHM(filename):
self.chm.GetArchiveInfo() self.em.rclog("LoadCHM failed")
self.topics = self.chm.GetTopicsTree()
if self.topics == None:
return False return False
#self.em.rclog(self.topics) if not self.chm.GetArchiveInfo():
# Parse Topics file and extract list of internal nodes self.em.rclog("GetArchiveInfo failed")
self.tp.setname(os.path.basename(filename)) return False
self.tp.feed(self.topics)
self.tp.close() self.topics = self.chm.GetTopicsTree()
if self.topics:
# Parse Topics file and extract list of internal nodes
self.tp.setname(os.path.basename(filename))
self.tp.feed(self.topics)
self.tp.close()
else:
# No topics. If there is a home, let's try to walk the tree
#self.em.rclog("GetTopicsTree failed")
if not self.chm.home:
self.em.rclog("No topics and no home")
return False
home = self.chm.home
if home[0] != '/':
home = "/" + home
text = getfile(self.chm, home)
if not text:
self.em.rclog("No topics and no home content")
return False
walker = ChmWalker(self.chm, self.chm.home, self.tp.contents)
walker.feed(text)
walker.close()
#self.em.rclog("Contents size %d" % len(self.tp.contents)) #self.em.rclog("Contents size %d" % len(self.tp.contents))
uniq = set(self.tp.contents)
self.tp.contents = list(uniq)
return True return True
def getipath(self, params): def getipath(self, params):

View File

@ -5,14 +5,12 @@ topdir=`dirname $0`/..
initvariables $0 initvariables $0
recollq '"nokia ovi suite" wmdrm "windows media player version 11"' \ (
2> $mystderr | egrep -v '^Recoll query: ' > $mystdout recollq '"nokia ovi suite" wmdrm "windows media player version 11"'
recollq '"pour superposer mixer des fichiers son"'
recollq '"pour superposer mixer des fichiers son"' \ recollq '"Django comes with a user authentication system"'
2>> $mystderr | egrep -v '^Recoll query: ' >> $mystdout recollq '"establishment of a project cost accounting system of ledgers"'
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
recollq '"Django comes with a user authentication system"' \
2>> $mystderr | egrep -v '^Recoll query: ' >> $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1

View File

@ -4,3 +4,5 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/Nokia_Nseries_Hel
text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/soundrec.chm] [Superposer (mixer) des fichiers son] 35269 bytes text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/soundrec.chm] [Superposer (mixer) des fichiers son] 35269 bytes
1 results 1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/Django-1.1a1-r9905.chm] [User authentication in Django] 1731089 bytes text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/Django-1.1a1-r9905.chm] [User authentication in Django] 1731089 bytes
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/PMGlossary.chm] [Project Management Glossary: P09] 782892 bytes