From 1fb94211638fdf7bf8187b536a9c638600ddc3fe Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Fri, 28 Feb 2020 09:22:03 +0100
Subject: [PATCH] OCR: small adjustments for Windows

---
NOTE(review): this copy of the patch was rebuilt from a whitespace-collapsed
dump. Every hunk body matches the line counts given in its @@ header and in
the diffstat below, but the placement of blank context lines and the
indentation depth of each line are a best-effort reconstruction. Regenerate
the patch from commit 1fb94211 if a byte-exact copy is required.

 src/filters/rclocr.py                | 27 +++++++++++++++++++++++++--
 src/filters/rclpdf.py                |  3 ++-
 src/python/recoll/recoll/conftree.py | 20 ++++++++++----------
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/src/filters/rclocr.py b/src/filters/rclocr.py
index c6b17165..9bcb1635 100755
--- a/src/filters/rclocr.py
+++ b/src/filters/rclocr.py
@@ -31,13 +31,32 @@ import importlib.util
 import rclconfig
 import rclocrcache
 
+_mswindows = (sys.platform == "win32")
 def _deb(s):
-    print("rclocr: %s" % s, file=sys.stderr)
+    if not _mswindows:
+        print("rclocr: %s" % s, file=sys.stderr)
 
 def Usage():
     _deb("Usage: rclocr.py ")
     sys.exit(1)
 
+def breakwrite(f, data):
+    # On Windows, writing big chunks can fail with a "not enough space"
+    # error. Seems a combined windows/python bug, depending on versions.
+    # See https://bugs.python.org/issue11395
+    # In any case, just break it up
+    total = len(data)
+    bs = 4*1024
+    offset = 0
+    while total > 0:
+        if total < bs:
+            tow = total
+        else:
+            tow = bs
+        f.write(data[offset:offset+tow])
+        offset += tow
+        total -= tow
+
 if len(sys.argv) != 2:
     Usage()
 
@@ -50,7 +69,11 @@ cache = rclocrcache.OCRCache(config)
 
 incache, data = cache.get(path)
 if incache:
-    sys.stdout.buffer.write(data)
+    try:
+        breakwrite(sys.stdout.buffer, data)
+    except Exception as e:
+        _deb("RCLOCR error writing: %s" % e)
+        sys.exit(1)
     sys.exit(0)
 
 #### Data not in cache
diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py
index 27fc339a..830b8e6c 100755
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@@ -410,7 +410,8 @@ class PDFExtractor:
                        self.filename]
                 data = subprocess.check_output(cmd)
                 html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
-            except:
+            except Exception as e:
+                self.em.rclog("%s failed: %s" % (cmd, e))
                 pass
 
         if self.extrameta:
diff --git a/src/python/recoll/recoll/conftree.py b/src/python/recoll/recoll/conftree.py
index 99e6eceb..53d445c7 100644
--- a/src/python/recoll/recoll/conftree.py
+++ b/src/python/recoll/recoll/conftree.py
@@ -195,22 +195,22 @@ class ConfTree(ConfSimple):
             raise TypeError("getbin: parameters must be binary not unicode")
         #_debug("ConfTree::getbin: nm [%s] sk [%s]" % (nm, sk))
 
-        if sk == b'' or sk[0] != b'/'[0]:
+        # Note the test for root. There does not seem to be a direct
+        # way to do this in os.path
+        if not sk:
             return ConfSimple.getbin(self, nm, sk)
 
-        if sk[len(sk)-1] == b'/'[0]:
-            sk = sk[:len(sk)-1]
-
         # Try all sk ancestors as submaps (/a/b/c-> /a/b/c, /a/b, /a, b'')
-        while sk:
+        while True:
             if sk in self.submaps:
                 return ConfSimple.getbin(self, nm, sk)
             if sk + b'/' in self.submaps:
-                return ConfSimple.getbin(self, nm, sk+b'/')
-            i = sk.rfind(b'/')
-            if i == -1:
-                break
-            sk = sk[:i]
+                return ConfSimple.getbin(self, nm, sk + b'/')
+            nsk = os.path.dirname(sk)
+            if nsk == sk:
+                # sk was already root, we're done.
+                break;
+            sk = nsk
 
         return ConfSimple.getbin(self, nm)
 