#!/usr/bin/env python3
#################################
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################
# Caching OCR'd data
#
# OCR is extremely slow, so caching the results is necessary.
#
# The cache stores 2 kinds of objects:
# - Path files are named from the hash of the image file path and
#   contain the image data hash, the modification time and size of the
#   image file at the time the OCR'd data was stored in the cache, and
#   the image path itself (the last is for purging only).
# - Data files are named with the hash of the image data and contain
#   the zlib-compressed OCR'd data.
#
# When retrieving data from the cache:
# - We first use the image file size and modification time: if an
#   entry exists for the imagepath/mtime/size triplet, and is up to
#   date, the corresponding data is obtained from the data file and
#   returned.
# - Else we use the image data: if an entry exists for the computed
#   hash of the data, it is returned. This allows moving files around
#   without needing to run OCR again, but of course, it is more
#   expensive than the first step.
#
# If we need to use the second step, as a side effect, a path file is
# created or updated so that the data will be found with the first
# step next time around.
#
# Purging the cache of obsolete data:
# - The cache path and data files are stored under 2 different
#   directories (objects, paths) to make purging easier.
# - Purging the paths tree just involves walking it, reading the
#   files, and checking the existence of the recorded paths.
# - There is no easy way to purge the data tree. The only possibility
#   is to input a list of possible source files (e.g. the result of a
#   find in the image files area) and compute all the hashes. Data
#   files whose names do not match one of the hashes are deleted.
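# For illustration (the hash values below are made up), the on-disk
# layout looks like this:
#
#   ocrcache/
#     paths/1a/2b3c...      named from the sha1 of the image path,
#                           split as 2 + 38 hex characters
#     objects/4d/5e6f...    named from the sha1 of the image data,
#                           split the same way
#
# and a path file contains a single line, as written by
# _updatepathfile() below:
#
#   <datahash[0:2]> <datahash[2:]> <mtime> <size> <url-quoted image path>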
import sys
import os
import hashlib
import urllib.parse
import zlib


def deb(s):
    print("%s" % s, file=sys.stderr)


class OCRCache(object):
    def __init__(self, conf):
        self.config = conf
        self.cachedir = conf.getConfParam("ocrcachedir")
        if not self.cachedir:
            self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
        self.objdir = os.path.join(self.cachedir, "objects")
        self.pathdir = os.path.join(self.cachedir, "paths")
        for dir in (self.objdir, self.pathdir):
            if not os.path.exists(dir):
                os.makedirs(dir)

    # Compute sha1 of the path, as two parts of 2 and 38 chars
    def _hashpath(self, data):
        if type(data) != type(b""):
            data = data.encode('utf-8')
        m = hashlib.sha1()
        m.update(data)
        h = m.hexdigest()
        return h[0:2], h[2:]

    # Compute sha1 of the file data contents, as two parts of 2 and 38 chars
    def _hashdata(self, path):
        #deb("Hashing DATA")
        m = hashlib.sha1()
        with open(path, "rb") as f:
            while True:
                d = f.read(8192)
                if not d:
                    break
                m.update(d)
        h = m.hexdigest()
        return h[0:2], h[2:]

    # Try to read the stored attributes for a given path: data hash,
    # modification time and size. If this fails, the path itself is
    # not cached (but the data still might be, maybe the file was moved)
    def _cachedpathattrs(self, path):
        pd, pf = self._hashpath(path)
        o = os.path.join(self.pathdir, pd, pf)
        if not os.path.exists(o):
            return False, None, None, None, None
        with open(o, "r") as f:
            line = f.read()
        dd, df, tm, sz, pth = line.split()
        tm = int(tm)
        sz = int(sz)
        return True, dd, df, tm, sz

    # Compute the path hash, and get the mtime and size for the given
    # path, for updating the cache path file
    def _newpathattrs(self, path):
        pd, pf = self._hashpath(path)
        tm = int(os.path.getmtime(path))
        sz = int(os.path.getsize(path))
        return pd, pf, tm, sz

    # Check if the cache appears up to date for a given path, only
    # using the modification time and size. Return the data file path
    # elements if we get a hit.
    def _pathincache(self, path):
        ret, od, of, otm, osz = self._cachedpathattrs(path)
        if not ret:
            return False, None, None
        pd, pf, ntm, nsz = self._newpathattrs(path)
        #deb(" tm %d sz %d" % (ntm, nsz))
        #deb("otm %d osz %d" % (otm, osz))
        if otm != ntm or osz != nsz:
            return False, None, None
        return True, od, of

    # Check if the cache appears up to date for path (no data check).
    # Return True/False
    def pathincache(self, path):
        ret, dd, df = self._pathincache(path)
        return ret

    # Compute the data file name for path. Expensive: we compute the data hash.
    # Return both the data file path and path elements (for storage in a path file)
    def _datafilename(self, path):
        d, f = self._hashdata(path)
        return os.path.join(self.objdir, d, f), d, f

    # Check if the data for path is in the cache: expensive, needs to
    # compute the hash for the path's data contents. Returns True/False
    def dataincache(self, path):
        return os.path.exists(self._datafilename(path)[0])

    # Create a path file with the given elements.
    def _updatepathfile(self, pd, pf, dd, df, tm, sz, path):
        dir = os.path.join(self.pathdir, pd)
        if not os.path.exists(dir):
            os.makedirs(dir)
        pfile = os.path.join(dir, pf)
        codedpath = urllib.parse.quote(path)
        with open(pfile, "w") as f:
            f.write("%s %s %d %d %s\n" % (dd, df, tm, sz, codedpath))
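
    # The two methods below are NOT part of the original module: they are
    # a minimal sketch of the purge operations described in the header
    # comment, assuming the path/data file formats are exactly those
    # produced by _updatepathfile() and store(). The method names are
    # made up for illustration.

    # Purge the paths tree: walk it, read each path file, unquote the
    # recorded image path, and delete the path file if the image no
    # longer exists.
    def purgepaths(self):
        for d in os.listdir(self.pathdir):
            dirpath = os.path.join(self.pathdir, d)
            for f in os.listdir(dirpath):
                pfile = os.path.join(dirpath, f)
                with open(pfile, "r") as fl:
                    line = fl.read()
                # The last field of a path file is the url-quoted image path
                path = urllib.parse.unquote(line.split()[4])
                if not os.path.exists(path):
                    os.unlink(pfile)

    # Purge the objects tree, given a list of all possible image file
    # paths (e.g. the result of a find in the image files area): compute
    # the data hash for each candidate, then delete any data file whose
    # name does not match one of the hashes.
    def purgedata(self, candidatepaths):
        valid = set()
        for p in candidatepaths:
            try:
                valid.add(self._hashdata(p))
            except OSError:
                # Unreadable candidate: just skip it
                pass
        for d in os.listdir(self.objdir):
            dirpath = os.path.join(self.objdir, d)
            for f in os.listdir(dirpath):
                if (d, f) not in valid:
                    os.unlink(os.path.join(dirpath, f))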

    # Store data for path. Only rewrite an existing data file if told
    # to do so: this is only useful if we are forcing an OCR re-run.
    def store(self, path, datatostore, force=False):
        dd, df = self._hashdata(path)
        pd, pf, tm, sz = self._newpathattrs(path)
        self._updatepathfile(pd, pf, dd, df, tm, sz, path)
        dir = os.path.join(self.objdir, dd)
        if not os.path.exists(dir):
            os.makedirs(dir)
        dfile = os.path.join(dir, df)
        if force or not os.path.exists(dfile):
            #deb("Storing data")
            cpressed = zlib.compress(datatostore)
            with open(dfile, "wb") as f:
                f.write(cpressed)
        return True

    # Retrieve cached OCR'd data for image path. Possibly update the
    # path file as a side effect (case where the image has moved, but
    # the data has not changed).
    def get(self, path):
        pincache, dd, df = self._pathincache(path)
        if pincache:
            dfn = os.path.join(self.objdir, dd, df)
        else:
            dfn, dd, df = self._datafilename(path)
        if not os.path.exists(dfn):
            return False, b""
        if not pincache:
            # File has moved: create/update the path file for next time
            deb("ocrcache::get file %s was moved, updating path data" % path)
            pd, pf, tm, sz = self._newpathattrs(path)
            self._updatepathfile(pd, pf, dd, df, tm, sz, path)
        with open(dfn, "rb") as f:
            cpressed = f.read()
        data = zlib.decompress(cpressed)
        return True, data


if __name__ == '__main__':
    import rclconfig

    conf = rclconfig.RclConfig()
    cache = OCRCache(conf)
    path = sys.argv[1]

    def trycache(p):
        deb("== CACHE tests for %s" % p)
        ret = cache.pathincache(p)
        s = "" if ret else " not"
        deb("path for %s%s in cache" % (p, s))
        if not ret:
            return False
        ret = cache.dataincache(p)
        s = "" if ret else " not"
        deb("data for %s%s in cache" % (p, s))
        return ret

    def trystore(p):
        deb("== STORE test for %s" % p)
        cache.store(p, b"my OCR'd text is one line\n", force=False)

    def tryget(p):
        deb("== GET test for %s" % p)
        incache, data = cache.get(p)
        if incache:
            deb("Data from cache [%s]" % data)
        else:
            deb("Data was not found in cache")
        return incache, data

    incache, data = tryget(path)
    if not incache:
        trystore(path)