ocrcache: implemented purge functions/script
This commit is contained in:
parent
8c36ea9853
commit
7bc70a30ae
@ -59,9 +59,10 @@ import os
|
|||||||
import hashlib
|
import hashlib
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import zlib
|
import zlib
|
||||||
|
import glob
|
||||||
|
|
||||||
def deb(s):
|
def _deb(s):
|
||||||
print("%s" %s, file=sys.stderr)
|
print("rclocrcache: %s" %s, file=sys.stderr)
|
||||||
|
|
||||||
class OCRCache(object):
|
class OCRCache(object):
|
||||||
def __init__(self, conf):
|
def __init__(self, conf):
|
||||||
@ -86,7 +87,7 @@ class OCRCache(object):
|
|||||||
|
|
||||||
# Compute sha1 of path data contents, as two parts of 2 and 38 chars
|
# Compute sha1 of path data contents, as two parts of 2 and 38 chars
|
||||||
def _hashdata(self, path):
|
def _hashdata(self, path):
|
||||||
#deb("Hashing DATA")
|
#_deb("Hashing DATA")
|
||||||
m = hashlib.sha1()
|
m = hashlib.sha1()
|
||||||
with open(path, "rb") as f:
|
with open(path, "rb") as f:
|
||||||
while True:
|
while True:
|
||||||
@ -97,19 +98,30 @@ class OCRCache(object):
|
|||||||
h = m.hexdigest()
|
h = m.hexdigest()
|
||||||
return h[0:2], h[2:]
|
return h[0:2], h[2:]
|
||||||
|
|
||||||
|
|
||||||
|
def _readpathfile(self, ppf):
|
||||||
|
'''Read path file and return values. We do not decode the image path
|
||||||
|
as this is only used for purging'''
|
||||||
|
with open(ppf, 'r') as f:
|
||||||
|
line = f.read()
|
||||||
|
dd,df,tm,sz,pth = line.split()
|
||||||
|
tm = int(tm)
|
||||||
|
sz = int(sz)
|
||||||
|
return dd,df,tm,sz,pth
|
||||||
|
|
||||||
# Try to read the stored attributes for a given path: data hash,
|
# Try to read the stored attributes for a given path: data hash,
|
||||||
# modification time and size. If this fails, the path itself is
|
# modification time and size. If this fails, the path itself is
|
||||||
# not cached (but the data still might be, maybe the file was moved)
|
# not cached (but the data still might be, maybe the file was moved)
|
||||||
def _cachedpathattrs(self, path):
|
def _cachedpathattrs(self, path):
|
||||||
pd,pf = self._hashpath(path)
|
pd,pf = self._hashpath(path)
|
||||||
o = os.path.join(self.pathdir, pd, pf)
|
pathfilepath = os.path.join(self.pathdir, pd, pf)
|
||||||
if not os.path.exists(o):
|
if not os.path.exists(pathfilepath):
|
||||||
|
return False, None, None, None, None
|
||||||
|
try:
|
||||||
|
dd, df, tm, sz, pth = self._readpathfile(pathfilepath)
|
||||||
|
return True, dd, df, tm, sz
|
||||||
|
except:
|
||||||
return False, None, None, None, None
|
return False, None, None, None, None
|
||||||
line = open(o, "r").read()
|
|
||||||
dd,df,tm,sz,pth = line.split()
|
|
||||||
tm = int(tm)
|
|
||||||
sz = int(sz)
|
|
||||||
return True, dd, df, tm, sz
|
|
||||||
|
|
||||||
# Compute the path hash, and get the mtime and size for given
|
# Compute the path hash, and get the mtime and size for given
|
||||||
# path, for updating the cache path file
|
# path, for updating the cache path file
|
||||||
@ -127,8 +139,8 @@ class OCRCache(object):
|
|||||||
if not ret:
|
if not ret:
|
||||||
return False, None, None
|
return False, None, None
|
||||||
pd, pf, ntm, nsz = self._newpathattrs(path)
|
pd, pf, ntm, nsz = self._newpathattrs(path)
|
||||||
#deb(" tm %d sz %d" % (ntm, nsz))
|
#_deb(" tm %d sz %d" % (ntm, nsz))
|
||||||
#deb("otm %d osz %d" % (otm, osz))
|
#_deb("otm %d osz %d" % (otm, osz))
|
||||||
if otm != ntm or osz != nsz:
|
if otm != ntm or osz != nsz:
|
||||||
return False, None, None
|
return False, None, None
|
||||||
return True, od, of
|
return True, od, of
|
||||||
@ -171,7 +183,7 @@ class OCRCache(object):
|
|||||||
os.makedirs(dir)
|
os.makedirs(dir)
|
||||||
dfile = os.path.join(dir, df)
|
dfile = os.path.join(dir, df)
|
||||||
if force or not os.path.exists(dfile):
|
if force or not os.path.exists(dfile):
|
||||||
#deb("Storing data")
|
#_deb("Storing data")
|
||||||
cpressed = zlib.compress(datatostore)
|
cpressed = zlib.compress(datatostore)
|
||||||
with open(dfile, "wb") as f:
|
with open(dfile, "wb") as f:
|
||||||
f.write(cpressed)
|
f.write(cpressed)
|
||||||
@ -192,7 +204,7 @@ class OCRCache(object):
|
|||||||
|
|
||||||
if not pincache:
|
if not pincache:
|
||||||
# File has moved. create/Update path file for next time
|
# File has moved. create/Update path file for next time
|
||||||
deb("ocrcache::get file %s was moved, updating path data" % path)
|
_deb("ocrcache::get file %s was moved, updating path data" % path)
|
||||||
pd, pf, tm, sz = self._newpathattrs(path)
|
pd, pf, tm, sz = self._newpathattrs(path)
|
||||||
self._updatepathfile(pd, pf, dd, df, tm, sz, path)
|
self._updatepathfile(pd, pf, dd, df, tm, sz, path)
|
||||||
|
|
||||||
@ -201,41 +213,115 @@ class OCRCache(object):
|
|||||||
data = zlib.decompress(cpressed)
|
data = zlib.decompress(cpressed)
|
||||||
return True, data
|
return True, data
|
||||||
|
|
||||||
|
def _pathstale(self, origpath, otm, osz):
|
||||||
|
'''Return True if the input path has been removed or modified'''
|
||||||
|
if not os.path.exists(origpath):
|
||||||
|
return True
|
||||||
|
ntm = int(os.path.getmtime(origpath))
|
||||||
|
nsz = int(os.path.getsize(origpath))
|
||||||
|
if ntm != otm or nsz != osz:
|
||||||
|
#_deb("Purgepaths otm %d ntm %d osz %d nsz %d"%(otm, ntm, osz, nsz))
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def purgepaths(self):
    '''Remove all stale pathfiles: source image does not exist or has
    been changed. Mostly useful for removed files, modified ones would be
    processed by recollindex.

    Path files which cannot be read or parsed are reported and left in
    place, and a failed removal is reported too: a single bad entry does
    not stop the purge.'''
    for pathfile in glob.glob(os.path.join(self.pathdir, "*", "*")):
        try:
            dd, df, tm, sz, orgpath = self._readpathfile(pathfile)
        except (OSError, ValueError) as exc:
            # Unreadable or malformed record: skip it instead of aborting
            # the purge of all the remaining entries.
            _deb("purgepaths: can't read %s: %s" % (pathfile, exc))
            continue
        # NOTE(review): orgpath is used exactly as stored in the path file
        # (_readpathfile does not decode it). If the writer encodes paths,
        # entries with encoded characters would always look stale here;
        # confirm against the code which writes the path files.
        if not self._pathstale(orgpath, tm, sz):
            continue
        _deb("purgepaths: removing %s (%s)" % (pathfile, orgpath))
        try:
            os.remove(pathfile)
        except OSError as exc:
            # E.g. already removed by someone else, or permission problem
            _deb("purgepaths: can't remove %s: %s" % (pathfile, exc))
|
||||||
|
|
||||||
|
def _walk(self, topdir, cb):
|
||||||
|
'''Specific fs walk: we know that our tree has 2 levels. Call cb with
|
||||||
|
the file path as parameter for each file'''
|
||||||
|
dlist = glob.glob(os.path.join(topdir, "*"))
|
||||||
|
for dir in dlist:
|
||||||
|
files = glob.glob(os.path.join(dir, "*"))
|
||||||
|
for f in files:
|
||||||
|
cb(f)
|
||||||
|
|
||||||
|
def _pgdt_pathcb(self, f):
|
||||||
|
'''Get a pathfile name, read it, and record datafile identifier
|
||||||
|
(concatenate data file subdir and file name)'''
|
||||||
|
#_deb("_pgdt_pathcb: %s" % f)
|
||||||
|
dd, df, tm, sz, orgpath = self._readpathfile(f)
|
||||||
|
self._pgdt_alldatafns.add(dd+df)
|
||||||
|
|
||||||
|
def _pgdt_datacb(self, datafn):
    '''Walk callback for the data tree: remove the data file datafn
    unless its identifier (name of its parent directory concatenated
    with its own name) was recorded in self._pgdt_alldatafns while
    reading the path files.'''
    parent, fname = os.path.split(datafn)
    dataid = os.path.basename(parent) + fname
    if dataid not in self._pgdt_alldatafns:
        _deb("purgedata: removing : %s" % datafn)
        os.remove(datafn)
        return
    _deb("purgedata: ok : %s" % datafn)
|
||||||
|
|
||||||
|
def purgedata(self):
    '''Remove the data files which are not referenced by any path file.

    First pass: walk the path tree and collect the identifiers of all
    referenced data files. Second pass: walk the data tree and remove
    every file whose identifier was not collected. This should only be
    run after an indexing pass, so that the path files are up to date.
    It is a relatively onerous operation as all the path files have to
    be read, and both trees walked.'''
    self._pgdt_alldatafns = set()
    passes = (
        (self.pathdir, self._pgdt_pathcb),
        (self.objdir, self._pgdt_datacb),
    )
    for topdir, callback in passes:
        self._walk(topdir, callback)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    import rclconfig

    def _Usage():
        '''Print the command synopsis on stderr and exit with status 1'''
        _deb("Usage: rclocrcache.py --purge")
        sys.exit(1)

    # The only accepted command line is a single "--purge" argument
    if len(sys.argv) != 2 or sys.argv[1] != "--purge":
        _Usage()

    conf = rclconfig.RclConfig()
    cache = OCRCache(conf)
    # Path files are purged first, then the data files are checked against
    # the remaining path files.
    cache.purgepaths()
    cache.purgedata()
    sys.exit(0)

    # Disabled manual test driver, kept for reference.
    # def trycache(p):
    #     _deb("== CACHE tests for %s"%p)
    #     ret = cache.pathincache(p)
    #     s = "" if ret else " not"
    #     _deb("path for %s%s in cache" % (p, s))
    #     if not ret:
    #         return False
    #     ret = cache.dataincache(p)
    #     s = "" if ret else " not"
    #     _deb("data for %s%s in cache" % (p, s))
    #     return ret
    # def trystore(p):
    #     _deb("== STORE test for %s" % p)
    #     cache.store(p, b"my OCR'd text is one line\n", force=False)
    # def tryget(p):
    #     _deb("== GET test for %s" % p)
    #     incache, data = cache.get(p)
    #     if incache:
    #         _deb("Data from cache [%s]" % data)
    #     else:
    #         _deb("Data was not found in cache")
    #     return incache, data
    # if False:
    #     path = sys.argv[1]
    #     incache, data = tryget(path)
    #     if not incache:
    #         trystore(path)
    #
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user