python api tests: added rclextract test

2013-10-30 18:28:36 +01:00 · 2013-10-30 18:28:36 +01:00 · 338ec6eb42
commit 338ec6eb42
parent 4b12c0d06c
3 changed files with 53 additions and 2 deletions
--- a/src/doc/user/usermanual.sgml
+++ b/src/doc/user/usermanual.sgml
@@ -4188,13 +4188,15 @@ or
                <listitem>Extract document defined
                by <replaceable>ipath</replaceable> and return
                a <literal>Doc</literal> object. The doc.text field
-                has the document text as either text/plain or
+                has the document text converted to either text/plain or
                text/html according to doc.mimetype. The typical use
                would be as follows:
                  <programlisting>
 qdoc = query.fetchone()
 extractor = recoll.Extractor(qdoc)
-text = extractor.textextract(qdoc.ipath)</programlisting>
+doc = extractor.textextract(qdoc.ipath)
+# use doc.text, e.g. for previewing
+</programlisting>
                </listitem>
              </varlistentry>
              <varlistentry>
--- a/tests/pythonapi/extract.py
+++ b/tests/pythonapi/extract.py
@@ -0,0 +1,45 @@
+import sys
+import hashlib
+from recoll import recoll
+from recoll import rclextract
+
+if sys.version_info[0] >= 3:
+    ISP3 = True
+else:
+    ISP3 = False
+
+def utf8string(s):
+    if ISP3:
+        return s
+    else:
+        return s.encode('utf8')
+
+db = recoll.connect()
+query = db.query()
+
+# This normally has only one result, a well-known html file
+nres = query.execute("HtmlAttachment_uniqueTerm", stemming=0)
+print("Result count: %d %d" % (nres, query.rowcount))
+doc = query.fetchone()
+xtrac = rclextract.Extractor(doc)
+doc = xtrac.textextract(doc.ipath)
+print("Text length: %d"%len(doc.text))
+
+refdigest = 'bfbb63f7a245c31767585b45014dbd07'
+
+# This normally has 2 results, one of which is a pdf attachment.
+nres = query.execute("population_size_cultural_transmission", stemming=0)
+for doc in query:
+    if doc.mimetype == 'application/pdf':
+        xtrac = rclextract.Extractor(doc)
+        filename = xtrac.idoctofile(doc.ipath, doc.mimetype)
+        f = open(filename, 'rb')
+        data = f.read()
+        f.close()
+        m = hashlib.md5()
+        m.update(data)
+        digest = m.hexdigest()
+        print(digest)
+        if digest != refdigest:
+            print("extract.py: wrong digest for extracted file!")
+            
--- a/tests/pythonapi/pythonapi.txt
+++ b/tests/pythonapi/pythonapi.txt
@@ -53,6 +53,10 @@ User query [title:"été à noël"]
 Xapian query: [(10 * (Sete PHRASE 3 Sa PHRASE 3 Snoel))]
 nres 1
 doc.title: [HTML fields test file: été à noël]
+python extract.py
+Result count: 1 1
+Text length: 3457
+bfbb63f7a245c31767585b45014dbd07
 python simple.py
 Xapian query: [(huniique:(wqf=11))]
 Result count: 2 2