updated sample for python3

This commit is contained in:
Jean-Francois Dockes 2020-05-25 08:37:20 +02:00
parent 8ac74ca8f5
commit 1a5d081093

View File

@@ -1,5 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python3
from __future__ import print_function
import sys import sys
import xapian import xapian
@@ -13,30 +12,37 @@ def wrap_prefix(prefix):
if o_index_stripchars: if o_index_stripchars:
return prefix return prefix
else: else:
return ":" + prefix + ":" return b":" + prefix + b":"
def init_stripchars(xdb): def init_stripchars(xdb):
global o_index_stripchars global o_index_stripchars
global md5wpref global md5wpref
t = xdb.allterms() t = xdb.allterms()
t.skip_to(":") t.skip_to(b":")
for term in t: for term in t:
if term.term.find(":") == 0: if term.term.find(b":") == 0:
o_index_stripchars = False o_index_stripchars = False
break break
md5wpref = wrap_prefix("XM") md5wpref = wrap_prefix(b"XM")
# Retrieve named value from document data record. # Retrieve named value from document data record.
# The record format is a sequence of nm=value lines # The record format is a sequence of nm=value lines
def get_attribute(xdb, docid, fld): def get_attributes(xdb, docid, flds, decode=True):
doc = xdb.get_document(docid) doc = xdb.get_document(docid)
data = doc.get_data() data = doc.get_data()
s = data.find(fld+"=") res = []
if s == -1: for fld in flds:
return "" s = data.find(fld + b"=")
e = data.find("\n", s) if s == -1:
return data[s+len(fld)+1:e] res.append(None)
else:
e = data.find(b"\n", s)
if decode:
res.append(data[s+len(fld)+1:e].decode('UTF-8'))
else:
res.append(data[s+len(fld)+1:e])
return res
# Convenience: retrieve postings as Python list # Convenience: retrieve postings as Python list
def get_postlist(xdb, term): def get_postlist(xdb, term):
@@ -45,6 +51,7 @@ def get_postlist(xdb, term):
ret.append(posting.docid) ret.append(posting.docid)
return ret return ret
# Return list of docids having same md5 including self # Return list of docids having same md5 including self
def get_dups(xdb, docid): def get_dups(xdb, docid):
doc = xdb.get_document(int(docid)) doc = xdb.get_document(int(docid))
@@ -76,11 +83,11 @@ def find_all_dups(xdb):
alldups.append(dups) alldups.append(dups)
return alldups return alldups
# Print docid url ipath for list of docids # Print docid url ipath for list of docids
def print_urlipath(xdb, doclist): def print_urlipath(xdb, doclist):
for docid in doclist: for docid in doclist:
url = get_attribute(xdb, docid, "url") url,ipath = get_attributes(xdb, docid, [b"url", b"ipath"])
ipath = get_attribute(xdb, docid, "ipath")
print("%s %s %s" % (docid, url, ipath)) print("%s %s %s" % (docid, url, ipath))
def msg(s): def msg(s):
@@ -106,6 +113,7 @@ try:
if len(sys.argv) == 2: if len(sys.argv) == 2:
# No docid args, # No docid args,
alldups = find_all_dups(xdb) alldups = find_all_dups(xdb)
for dups in alldups: for dups in alldups:
print_urlipath(xdb, dups) print_urlipath(xdb, dups)
print("") print("")
@@ -116,5 +124,5 @@ try:
print_urlipath(xdb, dups) print_urlipath(xdb, dups)
except Exception as e: except Exception as e:
msg("Xapian error: %s" % str(e)) msg("Error: %s" % str(e))
sys.exit(1) sys.exit(1)