diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml index 48e4b12e..d23d809a 100644 --- a/src/doc/user/usermanual.sgml +++ b/src/doc/user/usermanual.sgml @@ -4188,13 +4188,15 @@ or Extract document defined by ipath and return a Doc object. The doc.text field - has the document text as either text/plain or + has the document text converted to either text/plain or text/html according to doc.mimetype. The typical use would be as follows: qdoc = query.fetchone() extractor = recoll.Extractor(qdoc) -text = extractor.textextract(qdoc.ipath) +doc = extractor.textextract(qdoc.ipath) +# use doc.text, e.g. for previewing +
diff --git a/tests/pythonapi/extract.py b/tests/pythonapi/extract.py
new file mode 100644
index 00000000..4861fde7
--- /dev/null
+++ b/tests/pythonapi/extract.py
@@ -0,0 +1,49 @@
+"""Recoll Python API test: text extraction and attachment extraction.
+
+The lines printed here are the ones recorded for "python extract.py" in
+pythonapi.txt, so the output format must stay unchanged.
+"""
+
+import sys
+import hashlib
+from recoll import recoll
+from recoll import rclextract
+
+# True when running under Python 3 or later.
+ISP3 = sys.version_info[0] >= 3
+
+def utf8string(s):
+    """Return s as is under Python 3, UTF-8 encoded under Python 2."""
+    if ISP3:
+        return s
+    return s.encode('utf8')
+
+db = recoll.connect()
+query = db.query()
+
+# This normally has only one result, a well-known html file
+nres = query.execute("HtmlAttachment_uniqueTerm", stemming=0)
+print("Result count: %d %d" % (nres, query.rowcount))
+doc = query.fetchone()
+xtrac = rclextract.Extractor(doc)
+# textextract() returns a Doc object; the converted text is in doc.text
+doc = xtrac.textextract(doc.ipath)
+print("Text length: %d" % len(doc.text))
+
+# Expected MD5 hex digest of the extracted pdf attachment
+refdigest = 'bfbb63f7a245c31767585b45014dbd07'
+
+# This normally has 2 results, one of which is a pdf attachment.
+nres = query.execute("population_size_cultural_transmission", stemming=0)
+for doc in query:
+    if doc.mimetype != 'application/pdf':
+        continue
+    xtrac = rclextract.Extractor(doc)
+    filename = xtrac.idoctofile(doc.ipath, doc.mimetype)
+    # Context manager: the file is closed even if read() raises
+    with open(filename, 'rb') as f:
+        data = f.read()
+    digest = hashlib.md5(data).hexdigest()
+    print(digest)
+    if digest != refdigest:
+        print("extract.py: wrong digest for extracted file!")
diff --git a/tests/pythonapi/pythonapi.txt b/tests/pythonapi/pythonapi.txt index b0beb11e..bacfbb95 100644 --- a/tests/pythonapi/pythonapi.txt +++ b/tests/pythonapi/pythonapi.txt @@ -53,6 +53,10 @@ User query [title:"été à noël"] Xapian query: [(10 * (Sete PHRASE 3 Sa PHRASE 3 Snoel))] nres 1 doc.title: [HTML fields test file: été à noël] +python extract.py +Result count: 1 1 +Text length: 3457 +bfbb63f7a245c31767585b45014dbd07 python simple.py Xapian query: [(huniique:(wqf=11))] Result count: 2 2