From e802fce01ea10b9621fa05904d0b7d5ceae9001e Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Sun, 5 Jan 2020 17:49:06 +0100
Subject: [PATCH] added python sample for extracting doc texts

---
 src/python/samples/doctexts.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100755 src/python/samples/doctexts.py

diff --git a/src/python/samples/doctexts.py b/src/python/samples/doctexts.py
new file mode 100755
index 00000000..8ab4cfe7
--- /dev/null
+++ b/src/python/samples/doctexts.py
@@ -0,0 +1,34 @@
+#!/usr/bin/python3
+'''Show how to extract the document texts from an index which stores them.
+Storing the texts is the default for Recoll versions after 1.24 with Xapian
+1.4 support, so this would not work with 1.23 and earlier. It also depends on
+the indexStoreDocText configuration variable. The usual RECOLL_CONFDIR can be
+used to determine the index we operate on.
+Use pyloglevel/pylogfilename or redirect stderr to get rid of the log messages.
+'''
+
+import sys
+from recoll import recoll
+
+
+def deb(s):  # Emit one diagnostic line on stderr, leaving stdout to the results.
+    sys.stderr.write("%s\n" % s)
+
+def usage():  # Print the command synopsis through deb(), then exit with status 1.
+    deb("Usage doctexts.py")
+    raise SystemExit(1)
+    
+if len(sys.argv) != 1:  # The script takes no command line arguments.
+    usage()
+
+db = recoll.connect()  # Per the module docstring, RECOLL_CONFDIR picks the index.
+q = db.query()
+q.execute("mime:*", fetchtext=True)  # Presumably a match-all query; texts requested.
+
+ndocs = 0  # Stays 0 when the query yields no document at all.
+for ndocs, doc in enumerate(q, start=1):
+    print("TITLE: %s" % doc.title)
+    print("TEXT: %s" % doc.get('text'))
+# After the loop, ndocs holds the number of documents which were printed.
+
+print("Got %d documents" % ndocs)