From fe2eb103ecbe98ec6f25da4694e1ca31ec239e63 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 10 Dec 2017 14:47:11 +0100 Subject: [PATCH] doc --- src/doc/user/usermanual.html | 75 ++++++++++++++++------------ src/doc/user/usermanual.xml | 94 ++++++++++++++++++++---------------- 2 files changed, 98 insertions(+), 71 deletions(-) diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index f678ab36..73f46560 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -6667,10 +6667,11 @@ alink="#0000FF"> show the snippets text). In order to access the actual document data, the data extraction part of the indexing process must be performed (subdocument access and - format translation). This is not trivial in general. - The rclextract module - currently provides a single class which can be used to - access the data content for result documents.

+ format translation). This is not trivial in the case of + embedded documents. The rclextract module provides a single + class which can be used to access the data content for + result documents.

@@ -6709,16 +6710,24 @@ alink="#0000FF">

Extract document defined by ipath and return a Doc - object. The doc.text field has the document - text converted to either text/plain or - text/html according to doc.mimetype. The - typical use would be as follows:

+ object. The doc.text field has the + document text converted to either text/plain + or text/html according to doc.mimetype. The typical + use would be as follows:

-                    qdoc = query.fetchone()
-                    extractor = recoll.Extractor(qdoc)
-                    doc = extractor.textextract(qdoc.ipath)
-                    # use doc.text, e.g. for previewing
-                  
+qdoc = query.fetchone() +extractor = recoll.Extractor(qdoc) +doc = extractor.textextract(qdoc.ipath) +# use doc.text, e.g. for previewing +

Passing qdoc.ipath to textextract() is redundant, + but reflects the fact that the Extractor object actually + has the capability to access the other + entries in a compound document.

Extractor.idoctofile(ipath, targetmtype, @@ -6729,9 +6738,17 @@ alink="#0000FF"> created as a temporary file to be deleted by the caller. Typical use:

-                    qdoc = query.fetchone()
-                    extractor = recoll.Extractor(qdoc)
-                  filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)
+qdoc = query.fetchone() +extractor = recoll.Extractor(qdoc) +filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype) +

In all cases the output is a copy, even if + the requested document is a regular system + file, which may be wasteful in some cases. If + you want to avoid this, you can test for a + simple file document as follows:

+
+not doc.ipath and (not "rclbes" in doc.keys() or doc["rclbes"] == "FS")
+
@@ -6758,9 +6775,9 @@ alink="#0000FF"> embryonic GUI which demonstrates the highlighting and data extraction functions.

-            #!/usr/bin/env python
-            
-                     from recoll import recoll
+#!/usr/bin/env python
+
+from recoll import recoll
 
 db = recoll.connect()
 db.setAbstractParams(maxchars=80, contextwords=4)
@@ -6769,18 +6786,16 @@ query = db.query()
 nres = query.execute("some user question")
 print "Result count: ", nres
 if nres > 5:
-nres = 5
+    nres = 5
 for i in range(nres):
-doc = query.fetchone()
-print "Result #%d" % (query.rownumber,)
-for k in ("title", "size"):
-print k, ":", getattr(doc, k).encode('utf-8')
-abs = db.makeDocAbstract(doc, query).encode('utf-8')
-print abs
-print
-
-            
-          
+ doc = query.fetchone() + print "Result #%d" % (query.rownumber,) + for k in ("title", "size"): + print k, ":", getattr(doc, k).encode('utf-8') + abs = db.makeDocAbstract(doc, query).encode('utf-8') + print abs + print +
diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index fb40f17c..34f04ce2 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -5196,13 +5196,13 @@ Index queries do not provide document content (only a partial and unprecise reconstruction is performed to show the - snippets text). In order to access the actual document data, - the data extraction part of the indexing process - must be performed (subdocument access and format - translation). This is not trivial in - general. The rclextract module currently - provides a single class which can be used to access the data - content for result documents. + snippets text). In order to access the actual document data, the + data extraction part of the indexing process must be performed + (subdocument access and format translation). This is not trivial + in the case of embedded documents. The + rclextract module provides a single class + which can be used to access the data content for result + documents. Classes @@ -5220,30 +5220,43 @@ Extractor.textextract(ipath) - Extract document defined - by ipath and return - a Doc object. The doc.text field - has the document text converted to either text/plain or - text/html according to doc.mimetype. The typical use - would be as follows: - - qdoc = query.fetchone() - extractor = recoll.Extractor(qdoc) - doc = extractor.textextract(qdoc.ipath) - # use doc.text, e.g. for previewing - - + Extract document defined by + ipath and return a + Doc object. The + doc.text field has the document text + converted to either text/plain or text/html according to + doc.mimetype. The typical use would be + as follows: + +qdoc = query.fetchone() +extractor = recoll.Extractor(qdoc) +doc = extractor.textextract(qdoc.ipath) +# use doc.text, e.g. for previewing + Passing qdoc.ipath to + textextract() is redundant, but + reflects the fact that the Extractor + object actually has the capability to access the other + entries in a compound document. + Extractor.idoctofile(ipath, targetmtype, outfile='') Extracts document into an output file, which can be given explicitly or will be created as a - temporary file to be deleted by the caller. Typical use: - - qdoc = query.fetchone() - extractor = recoll.Extractor(qdoc) - filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype) - + temporary file to be deleted by the caller. Typical + use: + +qdoc = query.fetchone() +extractor = recoll.Extractor(qdoc) +filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype) + + In all cases the output is a copy, even if the + requested document is a regular system file, which may be + wasteful in some cases. If you want to avoid this, you + can test for a simple file document as follows: + +not doc.ipath and (not "rclbes" in doc.keys() or doc["rclbes"] == "FS") + @@ -5253,6 +5266,7 @@ + Search API usage example @@ -5263,10 +5277,10 @@ has a very embryonic GUI which demonstrates the highlighting and data extraction functions. - - #!/usr/bin/env python - 5: -nres = 5 + nres = 5 for i in range(nres): -doc = query.fetchone() -print "Result #%d" % (query.rownumber,) -for k in ("title", "size"): -print k, ":", getattr(doc, k).encode('utf-8') -abs = db.makeDocAbstract(doc, query).encode('utf-8') -print abs -print - - ]]> - + doc = query.fetchone() + print "Result #%d" % (query.rownumber,) + for k in ("title", "size"): + print k, ":", getattr(doc, k).encode('utf-8') + abs = db.makeDocAbstract(doc, query).encode('utf-8') + print abs + print +]]>