diff --git a/src/filters/rclchm b/src/filters/rclchm index 41dfe13f..f805b102 100755 --- a/src/filters/rclchm +++ b/src/filters/rclchm @@ -91,17 +91,20 @@ class rclCHM: """Extract one path-named internal file from the chm file""" #self.em.rclog("extractone: [%s]"%(path)) - eof = (self.currentindex >= len(self.tp.contents) -1) + iseof = rclexecm.RclExecM.noteof + if self.currentindex >= len(self.tp.contents) -1: + iseof = rclexecm.RclExecM.eofnext + res, ui = self.chm.ResolveObject(path) #self.em.rclog("extract: ResolveO: %d [%s]" % (res, ui)) if res != chmlib.CHM_RESOLVE_SUCCESS: - return (False, "", path, eof) + return (False, "", path, iseof) # RetrieveObject() returns len,value res, doc = self.chm.RetrieveObject(ui) #self.em.rclog("extract: RetrieveObject: %d [%s]" % (res, doc)) if res > 0: - return (True, doc, path, eof) - return (False, "", path, eof) + return (True, doc, path, iseof) + return (False, "", path, iseof) def openfile(self, params): """Open the chm file and build the contents list by extracting and @@ -128,7 +131,7 @@ class rclCHM: def getnext(self, params): if self.currentindex >= len(self.tp.contents): - return (False, "", "", 1) + return (False, "", "", rclexecm.RclExecM.eofnow) else: ret= self.extractone(self.tp.contents[self.currentindex]) self.currentindex += 1 diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py index 3a366dc3..dca20b89 100644 --- a/src/filters/rclexecm.py +++ b/src/filters/rclexecm.py @@ -6,6 +6,10 @@ import sys import os class RclExecM: + noteof = 0 + eofnext = 1 + eofnow = 2 + def __init__(self): self.myname = os.path.basename(sys.argv[0]) self.mimetype = "" @@ -45,7 +49,7 @@ class RclExecM: return (paramname, paramdata) # Send answer: document, ipath, possible eof. 
- def answer(self, docdata, ipath, iseof): + def answer(self, docdata, ipath, iseof = noteof): print "Document:", len(docdata) sys.stdout.write(docdata) @@ -59,8 +63,11 @@ class RclExecM: sys.stdout.write(self.mimetype) # If we're at the end of the contents, say so - if iseof: - print "Eof: 0" + if iseof == self.eofnow: + print "Eofnow: 0" + elif iseof == self.eofnext: + print "Eofnext: 0" + # End of message print sys.stdout.flush() diff --git a/src/filters/rclics b/src/filters/rclics index 116f0928..e6662475 100755 --- a/src/filters/rclics +++ b/src/filters/rclics @@ -15,8 +15,11 @@ class IcalExtractor: return(False, "", "", True) docdata = self.contents[index].as_string() #self.em.rclog(docdata) - eof = (self.currentindex >= len(self.contents) -1) - return (True, docdata, str(index), eof) + + iseof = rclexecm.RclExecM.noteof + if self.currentindex >= len(self.contents) -1: + iseof = rclexecm.RclExecM.eofnext + return (True, docdata, str(index), iseof) ###### File type handler api, used by rclexecm ----------> def openfile(self, params): @@ -40,7 +43,7 @@ class IcalExtractor: def getnext(self, params): if self.currentindex >= len(self.contents): #em.rclog("getnext: EOF hit") - return (False, "", "", 1) + return (False, "", "", rclexecm.RclExecM.eofnow) else: ret= self.extractone(self.currentindex) self.currentindex += 1 diff --git a/src/filters/rclimg b/src/filters/rclimg index a627924a..c8ceaefe 100755 --- a/src/filters/rclimg +++ b/src/filters/rclimg @@ -95,7 +95,7 @@ sub imgTagsToHtml { return $output; } -# Get one line from stdin, exit on eof +# Get one line from stdin (from recollindex), exit on eof sub readlineorexit { my $s = ; unless ($s) { @@ -131,7 +131,7 @@ sub readparam { } # -# Main program starts here +# Main program starts here. 
Talks the rclexecm protocol # # JFD: replaced the "use" call with a runtime load with error checking, @@ -159,9 +159,9 @@ while (1) { } unless (defined $params{"filename:"}) { print STDERR "RCLIMG: no filename ??\n"; - # Recoll is requesting next subdocument, but we have none, just say - # so: - print "Document: 0\n\n"; + # Recoll is requesting next subdocument (it shouldn't because we + # returned eofnext last time), but we have none, just say so: + print "Eofnow: 0\nDocument: 0\n\n"; next; } @@ -172,7 +172,7 @@ while (1) { # print STDERR "RCLIMG: writing $l bytes of data\n"; print $data; # Say we have no further documents for this file - print "Eof: 0\n"; + print "Eofnext: 0\n"; # End of output parameters: print empty line print "\n"; # print STDERR "RCLIMG: done writing data\n"; diff --git a/src/filters/rclzip b/src/filters/rclzip index 9562f9df..d6e95a31 100755 --- a/src/filters/rclzip +++ b/src/filters/rclzip @@ -22,8 +22,10 @@ class ZipExtractor: except error, err: self.em.rclog("extractone: failed: [%s]" % err) ok = False - eof = (self.currentindex >= len(self.zip.namelist()) -1) - return (ok, docdata, ipath, eof) + iseof = rclexecm.RclExecM.noteof + if self.currentindex >= len(self.zip.namelist()) -1: + iseof = rclexecm.RclExecM.eofnext + return (ok, docdata, ipath, iseof) ###### File type handler api, used by rclexecm ----------> def openfile(self, params): @@ -40,7 +42,7 @@ class ZipExtractor: def getnext(self, params): if self.currentindex >= len(self.zip.namelist()): #self.em.rclog("getnext: EOF hit") - return (False, "", "", 1) + return (False, "", "", rclexecm.RclExecM.eofnow) else: ret= self.extractone(self.zip.namelist()[self.currentindex]) self.currentindex += 1 diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index b6e42eda..8d3f0f35 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -166,7 +166,8 @@ bool MimeHandlerExecMultiple::next_document() // Read answer (multiple elements) 
LOGDEB1(("MHExecMultiple: reading answer\n")); - bool eof_received = false; + bool eofnext_received = false; + bool eofnow_received = false; string ipath; string mtype; for (int loop=0;;loop++) { @@ -176,9 +177,13 @@ bool MimeHandlerExecMultiple::next_document() } if (name.empty()) break; - if (!stringlowercmp("eof:", name)) { - LOGDEB(("MHExecMultiple: got EOF\n")); - eof_received = true; + if (!stringlowercmp("eofnext:", name)) { + LOGDEB(("MHExecMultiple: got EOFNEXT\n")); + eofnext_received = true; + } + if (!stringlowercmp("eofnow:", name)) { + LOGDEB(("MHExecMultiple: got EOFNOW\n")); + eofnow_received = true; } if (!stringlowercmp("ipath:", name)) { ipath = data; @@ -194,17 +199,21 @@ bool MimeHandlerExecMultiple::next_document() return false; } } - // The end of data can be signaled from the filter in two ways: - // either by returning an empty document (if the filter just hits - // eof while trying to read the doc), or with an "eof:" field - // accompanying a normal document (if the filter hit eof at the - // end of the current doc, which is the preferred way). - if (m_metaData["content"].length() == 0) { - LOGDEB(("MHExecMultiple: got empty document\n")); + + if (eofnow_received) { + // No more docs m_havedoc = false; return false; } + // It used to be that eof could be signalled just by an empty document, but + // this was wrong. Empty documents can be found ie in zip files and should + // not be interpreted as eof. + if (m_metaData["content"].length() == 0) { + LOGDEB0(("MHExecMultiple: got empty document inside [%s]: [%s]\n", + m_fn.c_str(), ipath.c_str())); + } + // If this has an ipath, it is an internal doc from a // multi-document file. In this case, either the filter supplies the // mimetype, or the ipath MUST be a filename-like string which we can use @@ -217,7 +226,10 @@ bool MimeHandlerExecMultiple::next_document() // mimetype() won't call idFile when there is no file. 
Do it mtype = idFileMem(m_metaData["content"]); if (mtype.empty()) { - LOGERR(("MHExecMultiple: cant guess mime type\n")); + // Note this happens for example for directory zip members + // We could recognize them by the end /, but wouldn't know + // what to do with them anyway. + LOGINFO(("MHExecMultiple: cant guess mime type\n")); mtype = "application/octet-stream"; } } @@ -238,7 +250,7 @@ bool MimeHandlerExecMultiple::next_document() } } - if (eof_received) + if (eofnext_received) m_havedoc = false; return true;