python api tests: added rclextract test
This commit is contained in:
parent
4b12c0d06c
commit
338ec6eb42
@ -4188,13 +4188,15 @@ or
|
||||
<listitem>Extract document defined
|
||||
by <replaceable>ipath</replaceable> and return
|
||||
a <literal>Doc</literal> object. The doc.text field
|
||||
has the document text as either text/plain or
|
||||
has the document text converted to either text/plain or
|
||||
text/html according to doc.mimetype. The typical use
|
||||
would be as follows:
|
||||
<programlisting>
|
||||
qdoc = query.fetchone()
|
||||
extractor = rclextract.Extractor(qdoc)
|
||||
text = extractor.textextract(qdoc.ipath)</programlisting>
|
||||
doc = extractor.textextract(qdoc.ipath)
|
||||
# use doc.text, e.g. for previewing
|
||||
</programlisting>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
|
||||
45
tests/pythonapi/extract.py
Normal file
45
tests/pythonapi/extract.py
Normal file
@ -0,0 +1,45 @@
|
||||
import sys
|
||||
import hashlib
|
||||
from recoll import recoll
|
||||
from recoll import rclextract
|
||||
|
||||
# True when running under Python 3 or later, False under Python 2.
ISP3 = sys.version_info[0] >= 3
|
||||
|
||||
def utf8string(s):
    """Return s unchanged when ISP3 is true, else s encoded as UTF-8."""
    return s if ISP3 else s.encode('utf8')
|
||||
|
||||
# Open the index and create a query object reused by both checks below.
db = recoll.connect()
query = db.query()

# Check 1: text extraction through Extractor.textextract().
# This normally has only one result, a well-known html file
nres = query.execute("HtmlAttachment_uniqueTerm", stemming=0)
print("Result count: %d %d" % (nres, query.rowcount))
doc = query.fetchone()
xtrac = rclextract.Extractor(doc)
doc = xtrac.textextract(doc.ipath)
print("Text length: %d" % len(doc.text))

# MD5 hex digest that the file extracted below is compared against.
refdigest = 'bfbb63f7a245c31767585b45014dbd07'

# Check 2: extraction to a file through Extractor.idoctofile().
# This normally has 2 results, one of which is a pdf attachment.
nres = query.execute("population_size_cultural_transmission", stemming=0)
for doc in query:
    if doc.mimetype == 'application/pdf':
        xtrac = rclextract.Extractor(doc)
        # NOTE(review): idoctofile() presumably creates a temporary file that
        # the caller is expected to delete -- confirm and add cleanup if so.
        filename = xtrac.idoctofile(doc.ipath, doc.mimetype)
        # The context manager closes the file even if read() raises.
        with open(filename, 'rb') as f:
            data = f.read()
        digest = hashlib.md5(data).hexdigest()
        print(digest)
        if digest != refdigest:
            print("extract.py: wrong digest for extracted file!")
|
||||
|
||||
@ -53,6 +53,10 @@ User query [title:"été à noël"]
|
||||
Xapian query: [(10 * (Sete PHRASE 3 Sa PHRASE 3 Snoel))]
|
||||
nres 1
|
||||
doc.title: [HTML fields test file: été à noël]
|
||||
python extract.py
|
||||
Result count: 1 1
|
||||
Text length: 3457
|
||||
bfbb63f7a245c31767585b45014dbd07
|
||||
python simple.py
|
||||
Xapian query: [(huniique:(wqf=11))]
|
||||
Result count: 2 2
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user