python api tests: added rclextract test
This commit is contained in:
parent
4b12c0d06c
commit
338ec6eb42
@ -4188,13 +4188,15 @@ or
|
|||||||
<listitem>Extract document defined
|
<listitem>Extract document defined
|
||||||
by <replaceable>ipath</replaceable> and return
|
by <replaceable>ipath</replaceable> and return
|
||||||
a <literal>Doc</literal> object. The doc.text field
|
a <literal>Doc</literal> object. The doc.text field
|
||||||
has the document text as either text/plain or
|
has the document text converted to either text/plain or
|
||||||
text/html according to doc.mimetype. The typical use
|
text/html according to doc.mimetype. The typical use
|
||||||
would be as follows:
|
would be as follows:
|
||||||
<programlisting>
|
<programlisting>
|
||||||
qdoc = query.fetchone()
|
qdoc = query.fetchone()
|
||||||
extractor = recoll.Extractor(qdoc)
|
extractor = rclextract.Extractor(qdoc)
|
||||||
text = extractor.textextract(qdoc.ipath)</programlisting>
|
doc = extractor.textextract(qdoc.ipath)
|
||||||
|
# use doc.text, e.g. for previewing
|
||||||
|
</programlisting>
|
||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
<varlistentry>
|
<varlistentry>
|
||||||
|
|||||||
45
tests/pythonapi/extract.py
Normal file
45
tests/pythonapi/extract.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
import sys
|
||||||
|
import hashlib
|
||||||
|
from recoll import recoll
|
||||||
|
from recoll import rclextract
|
||||||
|
|
||||||
|
# True when running under Python 3 or later, False under Python 2.
ISP3 = sys.version_info[0] >= 3


def utf8string(s):
    """Return *s* as a native string for the running interpreter.

    Under Python 3 the value is returned unchanged. Under Python 2 a
    unicode object is encoded to a UTF-8 byte string; a value that is
    already a byte string is returned unchanged, because calling
    ``encode`` on it would first decode it implicitly as ASCII and raise
    on any non-ASCII byte.
    """
    # ``bytes`` is an alias of ``str`` on Python 2.6+, so this guard also
    # covers Python 2 byte strings.
    if ISP3 or isinstance(s, bytes):
        return s
    return s.encode('utf8')
|
||||||
|
|
||||||
|
# Connect to the index and create a query object on it.
db = recoll.connect()
query = db.query()

# This normally has only one result, a well-known html file
nres = query.execute("HtmlAttachment_uniqueTerm", stemming=0)
print("Result count: %d %d" % (nres, query.rowcount))

# Run the text extractor on the first result and report the length of the
# text it returns; the reference output pins this value.
doc = query.fetchone()
xtrac = rclextract.Extractor(doc)
doc = xtrac.textextract(doc.ipath)
print("Text length: %d" % len(doc.text))
|
||||||
|
|
||||||
|
# Expected MD5 hex digest of the file produced for the pdf result below.
refdigest = 'bfbb63f7a245c31767585b45014dbd07'

# This normally has 2 results, one of which is a pdf attachment.
nres = query.execute("population_size_cultural_transmission", stemming=0)
for doc in query:
    if doc.mimetype == 'application/pdf':
        xtrac = rclextract.Extractor(doc)
        # idoctofile() returns a file name; read that file back in binary
        # mode and compare its checksum with the reference value.
        filename = xtrac.idoctofile(doc.ipath, doc.mimetype)
        # The context manager closes the file even if read() raises.
        with open(filename, 'rb') as f:
            data = f.read()
        digest = hashlib.md5(data).hexdigest()
        print(digest)
        if digest != refdigest:
            print("extract.py: wrong digest for extracted file!")
|
||||||
|
|
||||||
@ -53,6 +53,10 @@ User query [title:"été à noël"]
|
|||||||
Xapian query: [(10 * (Sete PHRASE 3 Sa PHRASE 3 Snoel))]
|
Xapian query: [(10 * (Sete PHRASE 3 Sa PHRASE 3 Snoel))]
|
||||||
nres 1
|
nres 1
|
||||||
doc.title: [HTML fields test file: été à noël]
|
doc.title: [HTML fields test file: été à noël]
|
||||||
|
python extract.py
|
||||||
|
Result count: 1 1
|
||||||
|
Text length: 3457
|
||||||
|
bfbb63f7a245c31767585b45014dbd07
|
||||||
python simple.py
|
python simple.py
|
||||||
Xapian query: [(huniique:(wqf=11))]
|
Xapian query: [(huniique:(wqf=11))]
|
||||||
Result count: 2 2
|
Result count: 2 2
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user