#!/usr/bin/python3
# Provenance: src/python/samples/doctexts.py as added by commit
# e802fce01ea10b9621fa05904d0b7d5ceae9001e
# ("added python sample for extracting doc texts",
# Jean-Francois Dockes, Sun, 5 Jan 2020 17:49:06 +0100).
'''Show how to extract the document texts from an index which stores them,
which is the default for Recoll versions with Xapian 1.4 support, after 1.24.
Would not work with 1.23 and earlier. This also depends on the
indexStoreDocText configuration variable. The usual RECOLL_CONFDIR can be used
to determine the index we operate on.
Use pyloglevel/pylogfilename or redirect stderr to get rid of the log messages.
'''

import sys


def deb(s):
    '''Write s, followed by a newline, to standard error.

    Any object is accepted: print() converts it with str(), so tuples and
    other non-string values are shown as-is instead of being interpreted as
    %-format arguments.
    '''
    print(s, file=sys.stderr)


def usage():
    '''Print the usage line to standard error and exit with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    '''
    deb("Usage doctexts.py")
    sys.exit(1)


def main():
    '''Print the title and stored text of each document returned by the query.

    The script takes no command line arguments; if any are given, the usage
    message is printed and the process exits with status 1. One "TITLE:" line
    and one "TEXT:" line are written to standard output per document,
    followed by a final line giving the number of documents seen.
    '''
    if len(sys.argv) != 1:
        usage()

    # Imported here rather than at module level so that the argument check
    # runs first and the helpers above stay importable on a machine where
    # the Recoll Python extension is not installed.
    from recoll import recoll

    # No arguments: the index is chosen by the default configuration
    # (see RECOLL_CONFDIR in the module docstring).
    db = recoll.connect()
    q = db.query()
    # "mime:*" is presumably meant to match every document -- verify against
    # the Recoll query language documentation. fetchtext=True asks for the
    # stored document text, which is read back below with doc.get('text').
    q.execute("mime:*", fetchtext=True)

    # Stays 0 when the query yields nothing, so the summary line is still
    # correct for an empty result.
    ndocs = 0
    for ndocs, doc in enumerate(q, start=1):
        print("TITLE: %s" % doc.title)
        print("TEXT: %s" % doc.get('text'))

    print("Got %d documents" % ndocs)


if __name__ == "__main__":
    main()