updated sample for python3
This commit is contained in:
parent
8ac74ca8f5
commit
1a5d081093
@ -1,5 +1,4 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python3
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import xapian
|
import xapian
|
||||||
@ -13,30 +12,37 @@ def wrap_prefix(prefix):
|
|||||||
if o_index_stripchars:
|
if o_index_stripchars:
|
||||||
return prefix
|
return prefix
|
||||||
else:
|
else:
|
||||||
return ":" + prefix + ":"
|
return b":" + prefix + b":"
|
||||||
|
|
||||||
def init_stripchars(xdb):
|
def init_stripchars(xdb):
|
||||||
global o_index_stripchars
|
global o_index_stripchars
|
||||||
global md5wpref
|
global md5wpref
|
||||||
t = xdb.allterms()
|
t = xdb.allterms()
|
||||||
t.skip_to(":")
|
t.skip_to(b":")
|
||||||
for term in t:
|
for term in t:
|
||||||
if term.term.find(":") == 0:
|
if term.term.find(b":") == 0:
|
||||||
o_index_stripchars = False
|
o_index_stripchars = False
|
||||||
break
|
break
|
||||||
md5wpref = wrap_prefix("XM")
|
md5wpref = wrap_prefix(b"XM")
|
||||||
|
|
||||||
|
|
||||||
# Retrieve named value from document data record.
|
# Retrieve named value from document data record.
|
||||||
# The record format is a sequence of nm=value lines
|
# The record format is a sequence of nm=value lines
|
||||||
def get_attribute(xdb, docid, fld):
|
def get_attributes(xdb, docid, flds, decode=True):
|
||||||
doc = xdb.get_document(docid)
|
doc = xdb.get_document(docid)
|
||||||
data = doc.get_data()
|
data = doc.get_data()
|
||||||
s = data.find(fld+"=")
|
res = []
|
||||||
if s == -1:
|
for fld in flds:
|
||||||
return ""
|
s = data.find(fld + b"=")
|
||||||
e = data.find("\n", s)
|
if s == -1:
|
||||||
return data[s+len(fld)+1:e]
|
res.append(None)
|
||||||
|
else:
|
||||||
|
e = data.find(b"\n", s)
|
||||||
|
if decode:
|
||||||
|
res.append(data[s+len(fld)+1:e].decode('UTF-8'))
|
||||||
|
else:
|
||||||
|
res.append(data[s+len(fld)+1:e])
|
||||||
|
return res
|
||||||
|
|
||||||
# Convenience: retrieve postings as Python list
|
# Convenience: retrieve postings as Python list
|
||||||
def get_postlist(xdb, term):
|
def get_postlist(xdb, term):
|
||||||
@ -45,6 +51,7 @@ def get_postlist(xdb, term):
|
|||||||
ret.append(posting.docid)
|
ret.append(posting.docid)
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
# Return list of docids having same md5 including self
|
# Return list of docids having same md5 including self
|
||||||
def get_dups(xdb, docid):
|
def get_dups(xdb, docid):
|
||||||
doc = xdb.get_document(int(docid))
|
doc = xdb.get_document(int(docid))
|
||||||
@ -76,11 +83,11 @@ def find_all_dups(xdb):
|
|||||||
alldups.append(dups)
|
alldups.append(dups)
|
||||||
return alldups
|
return alldups
|
||||||
|
|
||||||
|
|
||||||
# Print docid url ipath for list of docids
|
# Print docid url ipath for list of docids
|
||||||
def print_urlipath(xdb, doclist):
|
def print_urlipath(xdb, doclist):
|
||||||
for docid in doclist:
|
for docid in doclist:
|
||||||
url = get_attribute(xdb, docid, "url")
|
url,ipath = get_attributes(xdb, docid, [b"url", b"ipath"])
|
||||||
ipath = get_attribute(xdb, docid, "ipath")
|
|
||||||
print("%s %s %s" % (docid, url, ipath))
|
print("%s %s %s" % (docid, url, ipath))
|
||||||
|
|
||||||
def msg(s):
|
def msg(s):
|
||||||
@ -106,6 +113,7 @@ try:
|
|||||||
if len(sys.argv) == 2:
|
if len(sys.argv) == 2:
|
||||||
# No docid args,
|
# No docid args,
|
||||||
alldups = find_all_dups(xdb)
|
alldups = find_all_dups(xdb)
|
||||||
|
|
||||||
for dups in alldups:
|
for dups in alldups:
|
||||||
print_urlipath(xdb, dups)
|
print_urlipath(xdb, dups)
|
||||||
print("")
|
print("")
|
||||||
@ -116,5 +124,5 @@ try:
|
|||||||
print_urlipath(xdb, dups)
|
print_urlipath(xdb, dups)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
msg("Xapian error: %s" % str(e))
|
msg("Error: %s" % str(e))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user