python api tests: added rclextract test

2013-10-30 18:28:36 +01:00 · 2013-10-30 18:28:36 +01:00 · 338ec6eb42
commit 338ec6eb42
parent 4b12c0d06c
3 changed files with 53 additions and 2 deletions
--- a/src/doc/user/usermanual.sgml
+++ b/src/doc/user/usermanual.sgml
@ -4188,13 +4188,15 @@ or
                <listitem>Extract document defined
                by <replaceable>ipath</replaceable> and return
                a <literal>Doc</literal> object. The doc.text field
-                has the document text as either text/plain or
+                has the document text converted to either text/plain or
                text/html according to doc.mimetype. The typical use
                would be as follows:
                  <programlisting>
 qdoc = query.fetchone()
 extractor = recoll.Extractor(qdoc)
-text = extractor.textextract(qdoc.ipath)</programlisting>
+doc = extractor.textextract(qdoc.ipath)
 # use doc.text, e.g. for previewing
 </programlisting>
                </listitem>
              </varlistentry>
              <varlistentry>
--- a/tests/pythonapi/extract.py
+++ b/tests/pythonapi/extract.py
@ -0,0 +1,45 @@
 import sys
 import hashlib
 from recoll import recoll
 from recoll import rclextract
 # True when running under Python 3 or later, False under Python 2.
 ISP3 = sys.version_info[0] >= 3
 def utf8string(s):
    """Return `s` unchanged under Python 3, else `s` encoded as UTF-8.

    The branch is selected by the module-level ISP3 flag.
    """
    return s if ISP3 else s.encode('utf8')
 # Part 1: text extraction. Connect through recoll.connect() and create a
 # query object on the returned connection; the same query object is reused
 # by the second search further down.
 db = recoll.connect()
 query = db.query()
 # This normally has only one result, a well-known html file
 nres = query.execute("HtmlAttachment_uniqueTerm", stemming=0)
 # Print both the value returned by execute() and query.rowcount so that the
 # two counts show up side by side in the test output.
 print("Result count: %d %d" % (nres, query.rowcount))
 doc = query.fetchone()
 # Build an extractor from the result document, then rebind doc to the
 # object returned by textextract(); only the length of its text is printed.
 xtrac = rclextract.Extractor(doc)
 doc = xtrac.textextract(doc.ipath)
 print("Text length: %d"%len(doc.text))
 # Part 2: extraction to a file. The MD5 hex digest of the extracted data is
 # printed and compared against this reference value.
 refdigest = 'bfbb63f7a245c31767585b45014dbd07'
 # This normally has 2 results, one of which is a pdf attachment.
 nres = query.execute("population_size_cultural_transmission", stemming=0)
 for doc in query:
    # Only the pdf result is checked; skip everything else.
    if doc.mimetype != 'application/pdf':
        continue
    xtrac = rclextract.Extractor(doc)
    # idoctofile() hands back the name of the file that is read below.
    filename = xtrac.idoctofile(doc.ipath, doc.mimetype)
    # The context manager closes the file even if read() raises.
    with open(filename, 'rb') as f:
        data = f.read()
    digest = hashlib.md5(data).hexdigest()
    print(digest)
    if digest != refdigest:
        print("extract.py: wrong digest for extracted file!")
--- a/tests/pythonapi/pythonapi.txt
+++ b/tests/pythonapi/pythonapi.txt
@ -53,6 +53,10 @@ User query [title:"été à noël"]
 Xapian query: [(10 * (Sete PHRASE 3 Sa PHRASE 3 Snoel))]
 nres 1
 doc.title: [HTML fields test file: été à noël]
 python extract.py
 Result count: 1 1
 Text length: 3457
 bfbb63f7a245c31767585b45014dbd07
 python simple.py
 Xapian query: [(huniique:(wqf=11))]
 Result count: 2 2