chm filter: handle files lacking a topics node
This commit is contained in:
parent
5fa720f23d
commit
502f7e783e
@ -4,7 +4,10 @@ Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import urlparse
|
||||||
|
|
||||||
import rclexecm
|
import rclexecm
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from chm import chm,chmlib
|
from chm import chm,chmlib
|
||||||
except:
|
except:
|
||||||
@ -84,6 +87,71 @@ class ChmTopicsParser(HTMLParser):
|
|||||||
|
|
||||||
def setname(self, name):
|
def setname(self, name):
|
||||||
self.fname = name
|
self.fname = name
|
||||||
|
|
||||||
|
def getfile(chmfile, path):
    """Extract internal file text from chm object, given path.

    chmfile -- chm object providing ResolveObject() and RetrieveObject()
    path    -- internal path of the file inside the archive

    Returns the data handed back by RetrieveObject(), or "" if the path
    cannot be resolved or the object cannot be retrieved.
    """
    res, ui = chmfile.ResolveObject(path)
    if res != chmlib.CHM_RESOLVE_SUCCESS:
        # Resolution failures are deliberately silent: this returns ""
        # and leaves it to the caller to decide what to do.
        return ""
    res, doc = chmfile.RetrieveObject(ui)
    if not res:
        # Report on stderr rather than with a print statement.
        # NOTE(review): stdout is presumably reserved for the exchange with
        # the indexer (rclexecm) -- confirm.
        sys.stderr.write("RetrieveObject failed %s\n" % path)
        return ""
    return doc
|
||||||
|
|
||||||
|
|
||||||
|
class ChmWalker(HTMLParser):
    """Links tree walker.

    Recursively follows all internal links found in the HTML tree, starting
    from the top node given as input, and augments the contents list with
    the normalized path of every node visited.
    """

    def __init__(self, chm, path, contents, fname=None):
        """Record the start node and register it in the contents list.

        chm      -- chm archive object, handed to getfile() to fetch nodes
        path     -- internal path of the node this walker is going to parse
        contents -- list of internal paths; shared with the walkers created
                    for linked nodes and extended in place
        fname    -- optional name of the chm file. When given, links of the
                    form MS-ITS::somefile.chm:/some/path are only followed
                    if somefile.chm equals fname. When None (the default)
                    the archive name in such links is not checked.
        """
        HTMLParser.__init__(self)
        self.chm = chm
        self.contents = contents
        self.fname = fname
        self.path = os.path.normpath(path)
        self.dir = os.path.dirname(self.path)
        contents.append(self.path)

    def _link_target(self, href):
        """Return the normalized internal path designated by href.

        Returns "" when href is empty, uses a scheme other than ms-its,
        names another archive (only checked when fname was given), or
        cannot be split into a usable path.
        """
        if not href:
            return ""
        res = urlparse.urlparse(href)
        # lower must be *called*: comparing the bound method to a string is
        # always false, which made every ms-its link be ignored.
        if res.scheme and res.scheme.lower() != "ms-its":
            return ""
        lpath = res.path.split(':')
        if len(lpath) == 3:
            # MS-ITS::somefile.chm:/some/path/file.htm ?
            if self.fname is not None and lpath[1] != self.fname:
                return ""
            path = lpath[2]
        elif len(lpath) == 1:
            path = lpath[0]
        else:
            return ""
        if not path:
            return ""
        if path.startswith("/"):
            return os.path.normpath(path)
        # Relative links are resolved against the directory of this node.
        return os.path.normpath(os.path.join(self.dir, path))

    def handle_starttag(self, tag, attrs):
        """Follow the link of an <a> tag if its target was not seen yet."""
        if tag != 'a':
            return

        href = ''
        for (nm, val) in attrs:
            # val is None for a valueless attribute (<a href>): skip it.
            if nm == 'href' and val:
                href = val

        npath = self._link_target(href)
        if not npath or npath in self.contents:
            return

        text = getfile(self.chm, npath)
        if not text:
            return
        try:
            newwalker = ChmWalker(self.chm, npath, self.contents, self.fname)
            newwalker.feed(text)
            newwalker.close()
        except Exception:
            # Best effort: a node which cannot be parsed must not abort the
            # walk of the remaining links. Only Exception is caught so that
            # KeyboardInterrupt and SystemExit still propagate.
            pass
|
||||||
|
|
||||||
class rclCHM:
|
class rclCHM:
|
||||||
"""RclExecM slave worker for extracting all files from an Msoft chm
|
"""RclExecM slave worker for extracting all files from an Msoft chm
|
||||||
@ -123,19 +191,41 @@ class rclCHM:
|
|||||||
self.currentindex = 0
|
self.currentindex = 0
|
||||||
self.tp.reset()
|
self.tp.reset()
|
||||||
filename = params["filename:"]
|
filename = params["filename:"]
|
||||||
self.chm.LoadCHM(filename)
|
if not self.chm.LoadCHM(filename):
|
||||||
self.chm.GetArchiveInfo()
|
self.em.rclog("LoadCHM failed")
|
||||||
self.topics = self.chm.GetTopicsTree()
|
return False
|
||||||
if self.topics == None:
|
if not self.chm.GetArchiveInfo():
|
||||||
|
self.em.rclog("GetArchiveInfo failed")
|
||||||
return False
|
return False
|
||||||
#self.em.rclog(self.topics)
|
|
||||||
# Parse Topics file and extract list of internal nodes
|
|
||||||
self.tp.setname(os.path.basename(filename))
|
|
||||||
self.tp.feed(self.topics)
|
|
||||||
self.tp.close()
|
|
||||||
#self.em.rclog("Contents size %d" % len(self.tp.contents))
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
self.topics = self.chm.GetTopicsTree()
|
||||||
|
if self.topics:
|
||||||
|
# Parse Topics file and extract list of internal nodes
|
||||||
|
self.tp.setname(os.path.basename(filename))
|
||||||
|
self.tp.feed(self.topics)
|
||||||
|
self.tp.close()
|
||||||
|
else:
|
||||||
|
# No topics. If there is a home, let's try to walk the tree
|
||||||
|
#self.em.rclog("GetTopicsTree failed")
|
||||||
|
if not self.chm.home:
|
||||||
|
self.em.rclog("No topics and no home")
|
||||||
|
return False
|
||||||
|
home = self.chm.home
|
||||||
|
if home[0] != '/':
|
||||||
|
home = "/" + home
|
||||||
|
text = getfile(self.chm, home)
|
||||||
|
if not text:
|
||||||
|
self.em.rclog("No topics and no home content")
|
||||||
|
return False
|
||||||
|
walker = ChmWalker(self.chm, self.chm.home, self.tp.contents)
|
||||||
|
walker.feed(text)
|
||||||
|
walker.close()
|
||||||
|
|
||||||
|
#self.em.rclog("Contents size %d" % len(self.tp.contents))
|
||||||
|
uniq = set(self.tp.contents)
|
||||||
|
self.tp.contents = list(uniq)
|
||||||
|
return True
|
||||||
|
|
||||||
def getipath(self, params):
|
def getipath(self, params):
|
||||||
return self.extractone(params["ipath:"])
|
return self.extractone(params["ipath:"])
|
||||||
|
|
||||||
|
|||||||
@ -5,14 +5,12 @@ topdir=`dirname $0`/..
|
|||||||
|
|
||||||
initvariables $0
|
initvariables $0
|
||||||
|
|
||||||
recollq '"nokia ovi suite" wmdrm "windows media player version 11"' \
|
(
|
||||||
2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
recollq '"nokia ovi suite" wmdrm "windows media player version 11"'
|
||||||
|
recollq '"pour superposer mixer des fichiers son"'
|
||||||
recollq '"pour superposer mixer des fichiers son"' \
|
recollq '"Django comes with a user authentication system"'
|
||||||
2>> $mystderr | egrep -v '^Recoll query: ' >> $mystdout
|
recollq '"establishment of a project cost accounting system of ledgers"'
|
||||||
|
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||||
recollq '"Django comes with a user authentication system"' \
|
|
||||||
2>> $mystderr | egrep -v '^Recoll query: ' >> $mystdout
|
|
||||||
|
|
||||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||||
|
|
||||||
|
|||||||
@ -4,3 +4,5 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/Nokia_Nseries_Hel
|
|||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/soundrec.chm] [Superposer (mixer) des fichiers son] 35269 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/soundrec.chm] [Superposer (mixer) des fichiers son] 35269 bytes
|
||||||
1 results
|
1 results
|
||||||
text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/Django-1.1a1-r9905.chm] [User authentication in Django] 1731089 bytes
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/Django-1.1a1-r9905.chm] [User authentication in Django] 1731089 bytes
|
||||||
|
1 results
|
||||||
|
text/html [file:///home/dockes/projets/fulltext/testrecoll/chm/PMGlossary.chm] [Project Management Glossary: P09] 782892 bytes
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user