python api tests: added rclextract test

This commit is contained in:
Jean-Francois Dockes 2013-10-30 18:28:36 +01:00
parent 4b12c0d06c
commit 338ec6eb42
3 changed files with 53 additions and 2 deletions

View File

@ -4188,13 +4188,15 @@ or
<listitem>Extract document defined
by <replaceable>ipath</replaceable> and return
a <literal>Doc</literal> object. The doc.text field
has the document text as either text/plain or
has the document text converted to either text/plain or
text/html according to doc.mimetype. The typical use
would be as follows:
<programlisting>
qdoc = query.fetchone()
extractor = recoll.Extractor(qdoc)
text = extractor.textextract(qdoc.ipath)</programlisting>
doc = extractor.textextract(qdoc.ipath)
# use doc.text, e.g. for previewing
</programlisting>
</listitem>
</varlistentry>
<varlistentry>

View File

@ -0,0 +1,45 @@
import sys
import hashlib
from recoll import recoll
from recoll import rclextract
# True when the interpreter running this script is Python 3 or newer.
ISP3 = sys.version_info[0] >= 3
def utf8string(s):
    """Return *s* unchanged on Python 3, or UTF-8 encoded otherwise."""
    if not ISP3:
        return s.encode('utf8')
    return s
db = recoll.connect()
query = db.query()

# The search term below normally matches a single, well-known html file.
hitcount = query.execute("HtmlAttachment_uniqueTerm", stemming=0)
print("Result count: %d %d" % (hitcount, query.rowcount))

# Run the extractor on that hit and report the size of the resulting text.
hit = query.fetchone()
extracted = rclextract.Extractor(hit).textextract(hit.ipath)
print("Text length: %d" % len(extracted.text))
# Hex MD5 digest that the extracted pdf attachment is compared against.
refdigest = 'bfbb63f7a245c31767585b45014dbd07'

# This normally has 2 results, one of which is a pdf attachment.
nres = query.execute("population_size_cultural_transmission", stemming=0)
for doc in query:
    if doc.mimetype == 'application/pdf':
        # Write the attachment out to a file and checksum its raw bytes.
        xtrac = rclextract.Extractor(doc)
        filename = xtrac.idoctofile(doc.ipath, doc.mimetype)
        # The context manager closes the file even if read() raises.
        with open(filename, 'rb') as f:
            data = f.read()
        digest = hashlib.md5(data).hexdigest()
        print(digest)
        if digest != refdigest:
            print("extract.py: wrong digest for extracted file!")

View File

@ -53,6 +53,10 @@ User query [title:"été à noël"]
Xapian query: [(10 * (Sete PHRASE 3 Sa PHRASE 3 Snoel))]
nres 1
doc.title: [HTML fields test file: été à noël]
python extract.py
Result count: 1 1
Text length: 3457
bfbb63f7a245c31767585b45014dbd07
python simple.py
Xapian query: [(huniique:(wqf=11))]
Result count: 2 2