doc
This commit is contained in:
parent
216c69ff2d
commit
fe2eb103ec
@ -6667,10 +6667,11 @@ alink="#0000FF">
|
|||||||
show the snippets text). In order to access the actual
|
show the snippets text). In order to access the actual
|
||||||
document data, the data extraction part of the indexing
|
document data, the data extraction part of the indexing
|
||||||
process must be performed (subdocument access and
|
process must be performed (subdocument access and
|
||||||
format translation). This is not trivial in general.
|
format translation). This is not trivial in the case of
|
||||||
The <code class="literal">rclextract</code> module
|
embedded documents. The <code class=
|
||||||
currently provides a single class which can be used to
|
"literal">rclextract</code> module provides a single
|
||||||
access the data content for result documents.</p>
|
class which can be used to access the data content for
|
||||||
|
result documents.</p>
|
||||||
<div class="sect4">
|
<div class="sect4">
|
||||||
<div class="titlepage">
|
<div class="titlepage">
|
||||||
<div>
|
<div>
|
||||||
@ -6709,16 +6710,24 @@ alink="#0000FF">
|
|||||||
<p>Extract document defined by <em class=
|
<p>Extract document defined by <em class=
|
||||||
"replaceable"><code>ipath</code></em> and
|
"replaceable"><code>ipath</code></em> and
|
||||||
return a <code class="literal">Doc</code>
|
return a <code class="literal">Doc</code>
|
||||||
object. The doc.text field has the document
|
object. The <code class=
|
||||||
text converted to either text/plain or
|
"literal">doc.text</code> field has the
|
||||||
text/html according to doc.mimetype. The
|
document text converted to either text/plain
|
||||||
typical use would be as follows:</p>
|
or text/html according to <code class=
|
||||||
|
"literal">doc.mimetype</code>. The typical
|
||||||
|
use would be as follows:</p>
|
||||||
<pre class="programlisting">
|
<pre class="programlisting">
|
||||||
qdoc = query.fetchone()
|
qdoc = query.fetchone()
|
||||||
extractor = recoll.Extractor(qdoc)
|
extractor = recoll.Extractor(qdoc)
|
||||||
doc = extractor.textextract(qdoc.ipath)
|
doc = extractor.textextract(qdoc.ipath)
|
||||||
# use doc.text, e.g. for previewing
|
# use doc.text, e.g. for previewing</pre>
|
||||||
</pre>
|
<p>Passing <code class=
|
||||||
|
"literal">qdoc.ipath</code> to <code class=
|
||||||
|
"literal">textextract()</code> is redundant,
|
||||||
|
but reflects the fact that the <code class=
|
||||||
|
"literal">Extractor</code> object actually
|
||||||
|
has the capability to access the other
|
||||||
|
entries in a compound document.</p>
|
||||||
</dd>
|
</dd>
|
||||||
<dt><span class=
|
<dt><span class=
|
||||||
"term">Extractor.idoctofile(ipath, targetmtype,
|
"term">Extractor.idoctofile(ipath, targetmtype,
|
||||||
@ -6729,9 +6738,17 @@ alink="#0000FF">
|
|||||||
created as a temporary file to be deleted by
|
created as a temporary file to be deleted by
|
||||||
the caller. Typical use:</p>
|
the caller. Typical use:</p>
|
||||||
<pre class="programlisting">
|
<pre class="programlisting">
|
||||||
qdoc = query.fetchone()
|
qdoc = query.fetchone()
|
||||||
extractor = recoll.Extractor(qdoc)
|
extractor = recoll.Extractor(qdoc)
|
||||||
filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)</pre>
|
filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)</pre>
|
||||||
|
<p>In all cases the output is a copy, even if
|
||||||
|
the requested document is a regular system
|
||||||
|
file, which may be wasteful in some cases. If
|
||||||
|
you want to avoid this, you can test for a
|
||||||
|
simple file document as follows:</p>
|
||||||
|
<pre class="programlisting">
|
||||||
|
not doc.ipath and (not "rclbes" in doc.keys() or doc["rclbes"] == "FS")
|
||||||
|
</pre>
|
||||||
</dd>
|
</dd>
|
||||||
</dl>
|
</dl>
|
||||||
</div>
|
</div>
|
||||||
@ -6758,9 +6775,9 @@ alink="#0000FF">
|
|||||||
embryonic GUI which demonstrates the highlighting and
|
embryonic GUI which demonstrates the highlighting and
|
||||||
data extraction functions.</p>
|
data extraction functions.</p>
|
||||||
<pre class="programlisting">
|
<pre class="programlisting">
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
from recoll import recoll
|
from recoll import recoll
|
||||||
|
|
||||||
db = recoll.connect()
|
db = recoll.connect()
|
||||||
db.setAbstractParams(maxchars=80, contextwords=4)
|
db.setAbstractParams(maxchars=80, contextwords=4)
|
||||||
@ -6769,18 +6786,16 @@ query = db.query()
|
|||||||
nres = query.execute("some user question")
|
nres = query.execute("some user question")
|
||||||
print "Result count: ", nres
|
print "Result count: ", nres
|
||||||
if nres > 5:
|
if nres > 5:
|
||||||
nres = 5
|
nres = 5
|
||||||
for i in range(nres):
|
for i in range(nres):
|
||||||
doc = query.fetchone()
|
doc = query.fetchone()
|
||||||
print "Result #%d" % (query.rownumber,)
|
print "Result #%d" % (query.rownumber,)
|
||||||
for k in ("title", "size"):
|
for k in ("title", "size"):
|
||||||
print k, ":", getattr(doc, k).encode('utf-8')
|
print k, ":", getattr(doc, k).encode('utf-8')
|
||||||
abs = db.makeDocAbstract(doc, query).encode('utf-8')
|
abs = db.makeDocAbstract(doc, query).encode('utf-8')
|
||||||
print abs
|
print abs
|
||||||
print
|
print
|
||||||
|
</pre>
|
||||||
|
|
||||||
</pre>
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="sect2">
|
<div class="sect2">
|
||||||
|
|||||||
@ -5196,13 +5196,13 @@
|
|||||||
|
|
||||||
<para>Index queries do not provide document content (only a
|
<para>Index queries do not provide document content (only a
|
||||||
partial and imprecise reconstruction is performed to show the
|
partial and imprecise reconstruction is performed to show the
|
||||||
snippets text). In order to access the actual document data,
|
snippets text). In order to access the actual document data, the
|
||||||
the data extraction part of the indexing process
|
data extraction part of the indexing process must be performed
|
||||||
must be performed (subdocument access and format
|
(subdocument access and format translation). This is not trivial
|
||||||
translation). This is not trivial in
|
in the case of embedded documents. The
|
||||||
general. The <literal>rclextract</literal> module currently
|
<literal>rclextract</literal> module provides a single class
|
||||||
provides a single class which can be used to access the data
|
which can be used to access the data content for result
|
||||||
content for result documents.</para>
|
documents.</para>
|
||||||
|
|
||||||
<sect4 id="RCL.PROGRAM.PYTHONAPI.RCLEXTRACT.CLASSES">
|
<sect4 id="RCL.PROGRAM.PYTHONAPI.RCLEXTRACT.CLASSES">
|
||||||
<title>Classes</title>
|
<title>Classes</title>
|
||||||
@ -5220,30 +5220,43 @@
|
|||||||
</varlistentry>
|
</varlistentry>
|
||||||
<varlistentry>
|
<varlistentry>
|
||||||
<term>Extractor.textextract(ipath)</term>
|
<term>Extractor.textextract(ipath)</term>
|
||||||
<listitem><para>Extract document defined
|
<listitem><para>Extract document defined by
|
||||||
by <replaceable>ipath</replaceable> and return
|
<replaceable>ipath</replaceable> and return a
|
||||||
a <literal>Doc</literal> object. The doc.text field
|
<literal>Doc</literal> object. The
|
||||||
has the document text converted to either text/plain or
|
<literal>doc.text</literal> field has the document text
|
||||||
text/html according to doc.mimetype. The typical use
|
converted to either text/plain or text/html according to
|
||||||
would be as follows:
|
<literal>doc.mimetype</literal>. The typical use would be
|
||||||
<programlisting>
|
as follows:</para>
|
||||||
qdoc = query.fetchone()
|
<programlisting>
|
||||||
extractor = recoll.Extractor(qdoc)
|
qdoc = query.fetchone()
|
||||||
doc = extractor.textextract(qdoc.ipath)
|
extractor = recoll.Extractor(qdoc)
|
||||||
# use doc.text, e.g. for previewing
|
doc = extractor.textextract(qdoc.ipath)
|
||||||
</programlisting>
|
# use doc.text, e.g. for previewing</programlisting>
|
||||||
</para></listitem>
|
<para>Passing <literal>qdoc.ipath</literal> to
|
||||||
|
<literal>textextract()</literal> is redundant, but
|
||||||
|
reflects the fact that the <literal>Extractor</literal>
|
||||||
|
object actually has the capability to access the other
|
||||||
|
entries in a compound document.</para>
|
||||||
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
<varlistentry>
|
<varlistentry>
|
||||||
<term>Extractor.idoctofile(ipath, targetmtype, outfile='')</term>
|
<term>Extractor.idoctofile(ipath, targetmtype, outfile='')</term>
|
||||||
<listitem><para>Extracts document into an output file,
|
<listitem><para>Extracts document into an output file,
|
||||||
which can be given explicitly or will be created as a
|
which can be given explicitly or will be created as a
|
||||||
temporary file to be deleted by the caller. Typical use:
|
temporary file to be deleted by the caller. Typical
|
||||||
<programlisting>
|
use:</para>
|
||||||
qdoc = query.fetchone()
|
<programlisting>
|
||||||
extractor = recoll.Extractor(qdoc)
|
qdoc = query.fetchone()
|
||||||
filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)</programlisting>
|
extractor = recoll.Extractor(qdoc)
|
||||||
|
filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)</programlisting>
|
||||||
|
|
||||||
|
<para>In all cases the output is a copy, even if the
|
||||||
|
requested document is a regular system file, which may be
|
||||||
|
wasteful in some cases. If you want to avoid this, you
|
||||||
|
can test for a simple file document as follows:
|
||||||
|
<programlisting>
|
||||||
|
not doc.ipath and (not "rclbes" in doc.keys() or doc["rclbes"] == "FS")
|
||||||
|
</programlisting>
|
||||||
</para></listitem>
|
</para></listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
@ -5253,6 +5266,7 @@
|
|||||||
</sect4> <!-- rclextract classes -->
|
</sect4> <!-- rclextract classes -->
|
||||||
</sect3> <!-- rclextract module -->
|
</sect3> <!-- rclextract module -->
|
||||||
|
|
||||||
|
|
||||||
<sect3 id="RCL.PROGRAM.PYTHONAPI.SEARCH.EXAMPLE">
|
<sect3 id="RCL.PROGRAM.PYTHONAPI.SEARCH.EXAMPLE">
|
||||||
<title>Search API usage example</title>
|
<title>Search API usage example</title>
|
||||||
|
|
||||||
@ -5263,10 +5277,10 @@
|
|||||||
has a very embryonic GUI which demonstrates the
|
has a very embryonic GUI which demonstrates the
|
||||||
highlighting and data extraction functions.</para>
|
highlighting and data extraction functions.</para>
|
||||||
|
|
||||||
<programlisting>
|
<programlisting><![CDATA[
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
<![CDATA[
|
|
||||||
from recoll import recoll
|
from recoll import recoll
|
||||||
|
|
||||||
db = recoll.connect()
|
db = recoll.connect()
|
||||||
db.setAbstractParams(maxchars=80, contextwords=4)
|
db.setAbstractParams(maxchars=80, contextwords=4)
|
||||||
@ -5275,18 +5289,16 @@ query = db.query()
|
|||||||
nres = query.execute("some user question")
|
nres = query.execute("some user question")
|
||||||
print "Result count: ", nres
|
print "Result count: ", nres
|
||||||
if nres > 5:
|
if nres > 5:
|
||||||
nres = 5
|
nres = 5
|
||||||
for i in range(nres):
|
for i in range(nres):
|
||||||
doc = query.fetchone()
|
doc = query.fetchone()
|
||||||
print "Result #%d" % (query.rownumber,)
|
print "Result #%d" % (query.rownumber,)
|
||||||
for k in ("title", "size"):
|
for k in ("title", "size"):
|
||||||
print k, ":", getattr(doc, k).encode('utf-8')
|
print k, ":", getattr(doc, k).encode('utf-8')
|
||||||
abs = db.makeDocAbstract(doc, query).encode('utf-8')
|
abs = db.makeDocAbstract(doc, query).encode('utf-8')
|
||||||
print abs
|
print abs
|
||||||
print
|
print
|
||||||
|
]]></programlisting>
|
||||||
]]>
|
|
||||||
</programlisting>
|
|
||||||
|
|
||||||
</sect3>
|
</sect3>
|
||||||
</sect2>
|
</sect2>
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user