Allow access to Xapian docid for Recoll document in recollq and Python API. Add sample Python program to find document duplicates, using MD5 terms
This commit is contained in:
parent
6574ff514a
commit
12acdc4faf
@ -403,7 +403,7 @@ PyDoc_STRVAR(doc_Doc_items,
|
|||||||
static PyObject *
|
static PyObject *
|
||||||
Doc_items(recoll_DocObject *self)
|
Doc_items(recoll_DocObject *self)
|
||||||
{
|
{
|
||||||
LOGDEB(("Doc_getbinurl\n"));
|
LOGDEB(("Doc_items\n"));
|
||||||
if (self->doc == 0 ||
|
if (self->doc == 0 ||
|
||||||
the_docs.find(self->doc) == the_docs.end()) {
|
the_docs.find(self->doc) == the_docs.end()) {
|
||||||
PyErr_SetString(PyExc_AttributeError, "doc");
|
PyErr_SetString(PyExc_AttributeError, "doc");
|
||||||
@ -448,12 +448,27 @@ Doc_get(recoll_DocObject *self, PyObject *args)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
string value;
|
string value;
|
||||||
if (self->doc->getmeta(key, 0)) {
|
bool found = false;
|
||||||
value = self->doc->meta[key];
|
|
||||||
|
//
|
||||||
|
if (!key.compare("xdocid")) {
|
||||||
|
char cid[30];
|
||||||
|
sprintf(cid, "%lu", (unsigned long)self->doc->xdocid);
|
||||||
|
value = cid;
|
||||||
|
found = true;
|
||||||
|
} else {
|
||||||
|
if (self->doc->getmeta(key, 0)) {
|
||||||
|
value = self->doc->meta[key];
|
||||||
|
found = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (found) {
|
||||||
return PyUnicode_Decode(value.c_str(),
|
return PyUnicode_Decode(value.c_str(),
|
||||||
value.size(),
|
value.size(),
|
||||||
"UTF-8", "replace");
|
"UTF-8", "replace");
|
||||||
}
|
}
|
||||||
|
|
||||||
Py_RETURN_NONE;
|
Py_RETURN_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -552,6 +567,14 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
|
|||||||
value = self->doc->text; found = true;
|
value = self->doc->text; found = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 'x':
|
||||||
|
if (!key.compare("xdocid")) {
|
||||||
|
char cid[30];
|
||||||
|
sprintf(cid, "%lu", (unsigned long)self->doc->xdocid);
|
||||||
|
value = cid;
|
||||||
|
found = true;
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!found) {
|
if (!found) {
|
||||||
|
|||||||
116
src/python/samples/docdups.py
Executable file
116
src/python/samples/docdups.py
Executable file
@ -0,0 +1,116 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import xapian
|
||||||
|
|
||||||
|
o_index_stripchars = True
|
||||||
|
md5wpref = "XM"
|
||||||
|
|
||||||
|
# Handle caps/diac-stripping option. If the db is raw the prefixes are
|
||||||
|
# wrapped with ":"
|
||||||
|
def wrap_prefix(prefix):
    """Return the term prefix in the form used by the current index.

    When the module-level flag ``o_index_stripchars`` is true the prefix
    is returned unchanged; otherwise (raw index) it is enclosed in
    colons.
    """
    return prefix if o_index_stripchars else ":" + prefix + ":"
|
||||||
|
|
||||||
|
def init_stripchars(xdb):
    """Detect whether the index is stripped or raw and set the MD5 prefix.

    Sets the module-level ``o_index_stripchars`` flag and recomputes
    ``md5wpref`` through ``wrap_prefix``.

    The index is considered raw when at least one term begins with ":".
    Terms are iterated in sorted order, so it is enough to look at the
    first term that is >= ":": it begins with ":" if and only if some
    term does. ``skip_to`` returns that entry, which avoids walking the
    rest of the term list.
    """
    global o_index_stripchars
    global md5wpref

    try:
        first = xdb.allterms().skip_to(":")
    except StopIteration:
        # No term sorts at or after ":" at all: nothing is wrapped.
        first = None

    # Assign in both directions so that a second call on another
    # database does not inherit the previous result.
    o_index_stripchars = first is None or not first.term.startswith(":")
    md5wpref = wrap_prefix("XM")
|
||||||
|
|
||||||
|
|
||||||
|
# Retrieve named value from document data record.
|
||||||
|
# The record format is a sequence of nm=value lines
|
||||||
|
def get_attribute(xdb, docid, fld):
    """Return the value of field ``fld`` from a document's data record.

    The record is a sequence of ``name=value`` lines. The first line
    whose name is exactly ``fld`` supplies the value; an empty string is
    returned when there is no such line.

    Matching is done per line on the full field name, so a field is not
    confused with a longer name ending in the same letters or with text
    inside another field's value, and a value on the last line is
    returned whole even without a trailing newline.
    """
    data = xdb.get_document(docid).get_data()
    wanted = fld + "="
    for line in data.split("\n"):
        if line.startswith(wanted):
            return line[len(wanted):]
    return ""
|
||||||
|
|
||||||
|
# Convenience: retrieve postings as Python list
|
||||||
|
def get_postlist(xdb, term):
    """Return the docids from the posting list of ``term`` as a list."""
    return [entry.docid for entry in xdb.postlist(term)]
|
||||||
|
|
||||||
|
# Return list of docids having same md5 including self
|
||||||
|
def get_dups(xdb, docid):
    """Return the docids sharing the MD5 term of ``docid``, itself included.

    ``docid`` is converted with ``int()``, so it may be a number or a
    numeric string. Returns ``None`` when the document has no term
    beginning with the MD5 prefix.
    """
    doc = xdb.get_document(int(docid))

    # It would be more efficient to retrieve the value, but it's
    # binary so we'd have to decode it
    try:
        md5term = doc.termlist().skip_to(md5wpref).term
    except StopIteration:
        # skip_to found no term at or after the prefix: no MD5 term.
        return None
    if not md5term.startswith(md5wpref):
        # Landed on a later, unrelated term: no MD5 term either.
        return None

    return get_postlist(xdb, md5term)
|
||||||
|
|
||||||
|
# Retrieve all sets of duplicates:
|
||||||
|
# walk the list of all MD5 terms, look up their posting lists, and
|
||||||
|
# store the docids where the list is longer than one.
|
||||||
|
def find_all_dups(xdb):
    """Return all sets of duplicate documents in the index.

    Walks every term beginning with the MD5 prefix, looks up its posting
    list, and keeps the lists holding more than one docid. The result is
    a list of docid lists.

    The terms are obtained with ``allterms(prefix)`` rather than
    ``allterms()`` followed by ``skip_to()``: per the Xapian Python
    bindings documentation ``skip_to`` returns the entry it lands on and
    a following ``for`` loop resumes at the next one, so the first MD5
    term would be skipped (and ``skip_to`` raises ``StopIteration`` when
    nothing matches).
    """
    alldups = []
    for entry in xdb.allterms(md5wpref):
        # More than one posting for the same MD5 term means duplicates.
        dups = get_postlist(xdb, entry.term)
        if len(dups) > 1:
            alldups.append(dups)
    return alldups
|
||||||
|
|
||||||
|
# Print docid url ipath for list of docids
|
||||||
|
def print_urlipath(xdb, doclist):
    """Print one ``docid url ipath`` line for each docid in ``doclist``.

    The url and ipath values are read from each document's data record
    with ``get_attribute``.
    """
    for docid in doclist:
        url = get_attribute(xdb, docid, "url")
        ipath = get_attribute(xdb, docid, "ipath")
        # Single pre-formatted argument: same output whether print is
        # the Python 2 statement or the Python 3 function.
        print("%s %s %s" % (docid, url, ipath))
|
||||||
|
|
||||||
|
########## Main program
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Usage: ``docdups.py /path/to/db [docid [docid ...]]``

    With only a database path, prints every set of duplicates, each set
    followed by an empty line. With docids, prints only the duplicates
    of each given document. Exits with status 1 on a usage error or when
    any step raises an exception.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s /path/to/db [docid [docid ...]]\n" %
                         sys.argv[0])
        sys.stderr.write(" will print all sets of dups if no docid is given\n")
        sys.stderr.write(" else only the duplicates for the given docids\n")
        sys.exit(1)

    xdbpath = sys.argv[1]

    try:
        # Opening the database is inside the handler so that a bad path
        # is reported like any other failure instead of as a traceback.
        xdb = xapian.Database(xdbpath)
        init_stripchars(xdb)

        if len(sys.argv) == 2:
            # No docid args: report every set of duplicates
            for dups in find_all_dups(xdb):
                print_urlipath(xdb, dups)
                print("")
        else:
            for docid in sys.argv[2:]:
                dups = get_dups(xdb, docid)
                if dups is not None and len(dups) > 1:
                    print_urlipath(xdb, dups)

    except Exception as e:
        sys.stderr.write("Xapian error: %s\n" % str(e))
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
@ -74,6 +74,10 @@ void output_fields(vector<string> fields, Rcl::Doc& doc,
|
|||||||
string abstract;
|
string abstract;
|
||||||
query.makeDocAbstract(doc, abstract);
|
query.makeDocAbstract(doc, abstract);
|
||||||
base64_encode(abstract, out);
|
base64_encode(abstract, out);
|
||||||
|
} else if (!it->compare("xdocid")) {
|
||||||
|
char cdocid[30];
|
||||||
|
sprintf(cdocid, "%lu", (unsigned long)doc.xdocid);
|
||||||
|
base64_encode(cdocid, out);
|
||||||
} else {
|
} else {
|
||||||
base64_encode(doc.meta[*it], out);
|
base64_encode(doc.meta[*it], out);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user