Allow access to Xapian docid for Recoll document in recollq and Python API. Add sample Python program to find document duplicates, using MD5 terms

2014-05-19 12:00:15 +02:00 · 2014-05-19 12:00:15 +02:00 · 12acdc4faf
commit 12acdc4faf
parent 6574ff514a
3 changed files with 146 additions and 3 deletions
--- a/src/python/recoll/pyrecoll.cpp
+++ b/src/python/recoll/pyrecoll.cpp
@ -403,7 +403,7 @@ PyDoc_STRVAR(doc_Doc_items,
 static PyObject *
 Doc_items(recoll_DocObject *self)
 {
-    LOGDEB(("Doc_getbinurl\n"));
+    LOGDEB(("Doc_items\n"));
    if (self->doc == 0 || 
 	the_docs.find(self->doc) == the_docs.end()) {
        PyErr_SetString(PyExc_AttributeError, "doc");
@ -448,12 +448,27 @@ Doc_get(recoll_DocObject *self, PyObject *args)
 	return 0;
    }
    string value;
-    if (self->doc->getmeta(key, 0)) {
-	value = self->doc->meta[key];
+    bool found = false;
+
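+    // "xdocid" is synthesized from the document's Xapian docid; any other key is looked up in the metadata map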
+    if (!key.compare("xdocid")) {
+        char cid[30];
+        sprintf(cid, "%lu", (unsigned long)self->doc->xdocid);
+        value = cid;
+        found = true;
+    } else {
+        if (self->doc->getmeta(key, 0)) {
+            value = self->doc->meta[key];
+            found = true;
+        }
+    }
+
+    if (found) {
 	return PyUnicode_Decode(value.c_str(), 
 				value.size(), 
 				"UTF-8", "replace");
    }
+
    Py_RETURN_NONE;
 }

@ -552,6 +567,14 @@ Doc_getattro(recoll_DocObject *self, PyObject *nameobj)
 	    value = self->doc->text; found = true;
 	}
 	break;
+    case 'x':
+        if (!key.compare("xdocid")) {
+            char cid[30];
+            sprintf(cid, "%lu", (unsigned long)self->doc->xdocid);
+            value = cid;
+            found = true;
+        }
+        break;
    }

    if (!found) {
--- a/src/python/samples/docdups.py
+++ b/src/python/samples/docdups.py
@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+import sys
+import xapian
+
+# True while the index is assumed to strip case/diacritics (bare prefixes);
+# init_stripchars() sets it to False when it finds a ":"-led term.
+o_index_stripchars = True
+# Prefix used to recognize MD5 terms; init_stripchars() recomputes it
+# through wrap_prefix() once the index type is known.
+md5wpref = "XM"
+
+# Return the prefix in the form matching the caps/diac-stripping option:
+# unchanged for a stripped index, enclosed in ":" when the db is raw.
+def wrap_prefix(prefix):
+    if not o_index_stripchars:
+        return ":".join(("", prefix, ""))
+    return prefix
+
+# Detect whether the index is raw (prefixes wrapped with ":") and set the
+# module-level o_index_stripchars and md5wpref accordingly.
+def init_stripchars(xdb):
+    global o_index_stripchars
+    global md5wpref
+    # allterms(prefix) only yields terms beginning with the prefix, so a
+    # single iteration proves that a ":"-led term exists. This is used
+    # rather than skip_to(): skip_to() returns the term it lands on and a
+    # following for loop resumes at the next one, and it raises
+    # StopIteration when no term sorts at or after the target.
+    for term in xdb.allterms(":"):
+        o_index_stripchars = False
+        break
+    md5wpref = wrap_prefix("XM")
+    
+
+# Retrieve named value from document data record.
+# The record format is a sequence of nm=value lines.
+# Returns "" when the record has no line for fld.
+def get_attribute(xdb, docid, fld):
+    data = xdb.get_document(docid).get_data()
+    key = fld + "="
+    # Only accept the name at the start of a line, so that "url" does not
+    # match inside another field name or inside a value. Splitting on
+    # newlines also keeps the last value whole when the record does not
+    # end with a newline.
+    for line in data.split("\n"):
+        if line.startswith(key):
+            return line[len(key):]
+    return ""
+
+# Convenience: collect the docids from a term's posting list into a
+# plain Python list
+def get_postlist(xdb, term):
+    return [entry.docid for entry in xdb.postlist(term)]
+    
+# Return list of docids having same md5 including self.
+# Returns None when the document has no MD5 term.
+def get_dups(xdb, docid):
+    doc = xdb.get_document(int(docid))
+
+    # It would be more efficient to retrieve the value, but it's
+    # binary so we'd have to decode it
+    try:
+        md5term = doc.termlist().skip_to(md5wpref).term
+    except StopIteration:
+        # skip_to() raises StopIteration when no term sorts at or after
+        # the prefix: treat it like any other document without an MD5 term
+        return None
+    if not md5term.startswith(md5wpref):
+        return None
+
+    return get_postlist(xdb, md5term)
+
+# Retrieve all sets of duplicates:
+#   walk the list of all MD5 terms, look up their posting lists, and
+#   store the docids where the list is longer than one.
+# Returns a list of docid lists, one per set of duplicates.
+def find_all_dups(xdb):
+    alldups = list()
+
+    # Walk the MD5 terms. allterms(prefix) yields exactly the terms that
+    # start with the prefix, first one included. A skip_to() followed by
+    # a for loop would resume after the term skip_to() landed on, and
+    # would raise StopIteration on an index without any such term.
+    for term in xdb.allterms(md5wpref):
+        # Check postlist for term, if it's longer than 1, we have a dup
+        dups = get_postlist(xdb, term.term)
+        if len(dups) > 1:
+            alldups.append(dups)
+    return alldups
+
+# Print one "docid url ipath" line for each docid in doclist, reading
+# url and ipath from the document data records
+def print_urlipath(xdb, doclist):
+    for docid in doclist:
+        print docid, get_attribute(xdb, docid, "url"), \
+              get_attribute(xdb, docid, "ipath")
+
+########## Main program
+
+# Command line: the index path, optionally followed by docids
+if len(sys.argv) < 2:
+    print >> sys.stderr, "Usage: %s /path/to/db [docid [docid ...]]" % \
+          sys.argv[0]
+    print >> sys.stderr, " will print all sets of dups if no docid is given"
+    print >> sys.stderr, " else only the duplicates for the given docids"
+
+    sys.exit(1)
+
+xdbpath = sys.argv[1]
+
+try:
+    # Opening the index and probing its terms can fail too: keep them
+    # inside the try so the failure is reported as a message, not a traceback
+    xdb = xapian.Database(xdbpath)
+
+    init_stripchars(xdb)
+
+    if len(sys.argv) == 2:
+        # No docid args: print every set of duplicates, separated by
+        # an empty line
+        alldups = find_all_dups(xdb)
+        for dups in alldups:
+            print_urlipath(xdb, dups)
+            print
+    else:
+        # Only print the duplicates of the documents given as arguments
+        for docid in sys.argv[2:]:
+            dups = get_dups(xdb, docid)
+            if dups is not None and len(dups) > 1:
+                print_urlipath(xdb, dups)
+
+except Exception, e:
+    print >> sys.stderr, "Xapian error: %s" % str(e)
+    sys.exit(1)
--- a/src/query/recollq.cpp
+++ b/src/query/recollq.cpp
@ -74,6 +74,10 @@ void output_fields(vector<string> fields, Rcl::Doc& doc,
 	    string abstract;
 	    query.makeDocAbstract(doc, abstract);
 	    base64_encode(abstract, out);
+        } else if (!it->compare("xdocid")) {
+            char cdocid[30];
+            sprintf(cdocid, "%lu", (unsigned long)doc.xdocid);
+            base64_encode(cdocid, out);
 	} else {
 	    base64_encode(doc.meta[*it], out);
 	}