From 7130edd5cbcd96f25f2fe7d971827c2ec073df0e Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 21 Feb 2013 19:13:31 +0100 Subject: [PATCH] doc --- src/doc/user/usermanual.sgml | 618 ++++++++++++++++++----------- src/python/recoll/pyrclextract.cpp | 2 +- 2 files changed, 397 insertions(+), 223 deletions(-) diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml index 0cbcf8aa..57a99f55 100644 --- a/src/doc/user/usermanual.sgml +++ b/src/doc/user/usermanual.sgml @@ -3557,15 +3557,15 @@ application/x-chm = execm rclchm - + Data for an external indexer, should be stored in a - separate index, not the one for the &RCL; internal file system - indexer, except if the latter is not used at all). The reason - is that the main document indexer purge pass would remove all - the other indexer's documents, as they were not seen during - indexing. The main indexer documents would also probably be a - problem for the external indexer purge operation. + separate index, not the one for the &RCL; internal file system + indexer, except if the latter is not used at all). The reason + is that the main document indexer purge pass would remove all + the other indexer's documents, as they were not seen during + indexing. The main indexer documents would also probably be a + problem for the external indexer purge operation. @@ -3578,262 +3578,436 @@ application/x-chm = execm rclchm &RCL; versions after 1.11 define a Python programming interface, both for searching and indexing. + The API is inspired by the Python database API + specification, version 1.0 for &RCL; versions up to 1.18, + version 2.0 for &RCL; versions 1.19 and later. The package + structure changed with &RCL; 1.19 too. We will mostly + describe the new API and package structure here. A paragraph + at the end of this section will explain a few differences + and ways to write code compatible with both versions. + The Python interface can be found in the source package, under python/recoll. 
- In order to build the module, you should first build - or re-build the Recoll library using position-independant - objects: - - cd recoll-xxx/ - configure --enable-pic - make - - There is no significant disadvantage in using PIC objects - for the main Recoll executables, so you can use the - option for the main build - too. The python/recoll/ directory - contains the usual setup.py - script which you can then use to build and install the - module: - - cd recoll-xxx/python/recoll - python setup.py build - python setup.py install - + contains the usual setup.py. After + configuring the main &RCL; code, you can use the script to + build and install the Python module: + + cd recoll-xxx/python/recoll + python setup.py build + python setup.py install + - - - Interface manual - - -NAME - recoll - This is an interface to the Recoll full text indexer. - -FILE - /usr/local/lib/python2.5/site-packages/recoll.so - -CLASSES - Db - Doc - Query - SearchData - - class Db(__builtin__.object) - | Db([confdir=None], [extra_dbs=None], [writable = False]) - | - | A Db object holds a connection to a Recoll index. Use the connect() - | function to create one. - | confdir specifies a Recoll configuration directory (default: - | $RECOLL_CONFDIR or ~/.recoll). - | extra_dbs is a list of external databases (xapian directories) - | writable decides if we can index new data through this connection - | - | Methods defined here: - | - | - | addOrUpdate(...) - | addOrUpdate(udi, doc, parent_udi=None) -> None - | Add or update index data for a given document - | The udi string must define a unique id for the document. It is not - | interpreted inside Recoll - | doc is a Doc object - | if parent_udi is set, this is a unique identifier for the - | top-level container (ie mbox file) - | - | delete(...) - | delete(udi) -> Bool. - | Purge index from all data for udi. If udi matches a container - | document, purge all subdocs (docs with a parent_udi matching udi). - | - | makeDocAbstract(...) 
- | makeDocAbstract(Doc, Query) -> string - | Build and return 'keyword-in-context' abstract for document - | and query. - | - | needUpdate(...) - | needUpdate(udi, sig) -> Bool. - | Check if the index is up to date for the document defined by udi, - | having the current signature sig. - | - | purge(...) - | purge() -> Bool. - | Delete all documents that were not touched during the just finished - | indexing pass (since open-for-write). These are the documents for - | the needUpdate() call was not performed, indicating that they no - | longer exist in the primary storage system. - | - | query(...) - | query() -> Query. Return a new, blank query object for this index. - | - | setAbstractParams(...) - | setAbstractParams(maxchars, contextwords). - | Set the parameters used to build 'keyword-in-context' abstracts - | - | ---------------------------------------------------------------------- - | Data and other attributes defined here: - | - - class Doc(__builtin__.object) - | Doc() - | - | A Doc object contains index data for a given document. - | The data is extracted from the index when searching, or set by the - | indexer program when updating. The Doc object has no useful methods but - | many attributes to be read or set by its user. It matches exactly the - | Rcl::Doc c++ object. Some of the attributes are predefined, but, - | especially when indexing, others can be set, the name of which will be - | processed as field names by the indexing configuration. - | Inputs can be specified as unicode or strings. - | Outputs are unicode objects. - | All dates are specified as unix timestamps, printed as strings - | Predefined attributes (index/query/both): - | text (index): document plain text - | url (both) - | fbytes (both) optional) file size in bytes - | filename (both) - | fmtime (both) optional file modification date. 
Unix time printed - | as string - | dbytes (both) document text bytes - | dmtime (both) document creation/modification date - | ipath (both) value private to the app.: internal access path - | inside file - | mtype (both) mime type for original document - | mtime (query) dmtime if set else fmtime - | origcharset (both) charset the text was converted from - | size (query) dbytes if set, else fbytes - | sig (both) app-defined file modification signature. - | For up to date checks - | relevancyrating (query) - | abstract (both) - | author (both) - | title (both) - | keywords (both) - | - | Methods defined here: - | - | - | ---------------------------------------------------------------------- - | Data and other attributes defined here: - | - - class Query(__builtin__.object) - | Recoll Query objects are used to execute index searches. - | They must be created by the Db.query() method. - | - | Methods defined here: - | - | - | execute(...) - | execute(query_string, stemming=1|0, stemlang="stemming language") - | - | Starts a search for query_string, a Recoll search language string - | (mostly Xesam-compatible). - | The query can be a simple list of terms (and'ed by default), or more - | complicated with field specs etc. See the Recoll manual. - | - | executesd(...) - | executesd(SearchData) - | - | Starts a search for the query defined by the SearchData object. - | - | fetchone(...) - | fetchone(None) -> Doc - | - | Fetches the next Doc object in the current search results. - | - | sortby(...) - | sortby(field=fieldname, ascending=true) - | Sort results by 'fieldname', in ascending or descending order. - | Only one field can be used, no subsorts for now. - | Must be called before executing the search - | - | ---------------------------------------------------------------------- - | Data descriptors defined here: - | - | next - | Next index to be fetched from results. 
Normally increments after - | each fetchone() call, but can be set/reset before the call effect - | seeking. Starts at 0 - | - | ---------------------------------------------------------------------- - | Data and other attributes defined here: - | - - class SearchData(__builtin__.object) - | SearchData() - | - | A SearchData object describes a query. It has a number of global - | parameters and a chain of search clauses. - | - | Methods defined here: - | - | - | addclause(...) - | addclause(type='and'|'or'|'excl'|'phrase'|'near'|'sub', - | qstring=string, slack=int, field=string, stemming=1|0, - | subSearch=SearchData) - | Adds a simple clause to the SearchData And/Or chain, or a subquery - | defined by another SearchData object - | - | ---------------------------------------------------------------------- - | Data and other attributes defined here: - | - -FUNCTIONS - connect(...) - connect([confdir=None], [extra_dbs=None], [writable = False]) - -> Db. + + Recoll package - Connects to a Recoll database and returns a Db object. - confdir specifies a Recoll configuration directory - (the default is built like for any Recoll program). - extra_dbs is a list of external databases (xapian directories) - writable decides if we can index new data through this connection + The recoll package contains two + modules: + + The recoll module contains + functions and classes used to query (or update) the + index. + The rclextract module contains + functions and classes used to access document + data. + + + + + + The recoll module + + + Functions + + + + connect(confdir=None, extra_dbs=None, + writable = False) + + The connect() function connects to + one or several &RCL; index(es) and returns + a Db object. + + confdir may specify + a configuration directory. The usual defaults + apply. + extra_dbs is a list of + additional indexes (Xapian directories). + writable decides if + we can index new data through this + connection. 
+ + + + + + + + + + Classes + + + The Db class + + A Db object is created by + the connect() function and holds a + connection to a Recoll index. + + Methods + + Db.close() + Closes the connection. You can't do anything + with the Db object after + this. + + + Db.query(), Db.cursor() These + aliases return a blank Query object + for this index. + + + + Db.setAbstractParams(maxchars, contextwords) + Set the parameters used to build snippets. + + + + + + + + + The Query class + + A Query object (equivalent to a + cursor in the Python DB API) is created by + a Db.query() call. It is used to + execute index searches. + + + Methods + + + Query.sortby(fieldname, ascending=True) + Sort results + by fieldname, in ascending + or descending order. Must be called before executing + the search. + + + + Query.execute(query_string, stemming=1, + stemlang="english") + Starts a search + for query_string, a &RCL; + search language string. + + + + Query.executesd(SearchData) + Starts a search for the query defined by the + SearchData object. + + + + Query.fetchmany(size=query.arraysize) + + Fetches + the next Doc objects in the current + search results, and returns them as an array of the + required size, which is by default the value of + the arraysize data member. + + + + Query.fetchone() + Fetches the next Doc object + from the current search results. + + + + Query.close() + Closes the connection. The object is unusable + after the call. + + + + Query.scroll(value, mode='relative') + Adjusts the position in the current result + set. mode can + be relative + or absolute. + + + + Query.getgroups() + Retrieves the expanded query terms as a list + of pairs. Meaningful only after executexx. + In each pair, the first entry is a list of user terms, + the second a list of query terms as derived from the + user terms and used in the Xapian Query. The size of + each list is one for simple terms, or more for group + and phrase clauses. 
+ + + + Query.getxquery() + Return the Xapian query description as a Unicode string. + Meaningful only after executexx. + + + + Query.highlight(text, ishtml = 0, methods = object) + Will insert <span class="rclmatch">, + </span> tags around the match areas in the input text + and return the modified text. ishtml + can be set to indicate that the input text is HTML and + that HTML special characters should not be escaped. + methods if set should be an object + with methods startMatch(i) and endMatch() which will be + called for each match and should return a begin and end + tag. + + + + Query.makedocabstract(doc, methods = object) + Create a snippets abstract + for doc (a Doc + object) by selecting text around the match terms. + If methods is set, will also perform highlighting. See + the highlight method. + + + + + Query.__iter__() and Query.next() + So that things like for doc in + query: will work. + + + + + Data descriptors + + Query.arraysize Default + number of records processed by fetchmany (r/w). + + Query.rowcount Number of + records returned by the last execute. + Query.rownumber Next index + to be fetched from results. Normally increments after + each fetchone() call, but can be set/reset before the + call to perform a seek. Starts at 0. + + + + + + + + + The Doc class + + A Doc object contains index data + for a given document. The data is extracted from the + index when searching, or set by the indexer program when + updating. The Doc object has many attributes to be read or + set by its user. It matches exactly the Rcl::Doc C++ + object. Some of the attributes are predefined, but, + especially when indexing, others can be set, the names of + which will be processed as field names by the indexing + configuration. Inputs can be specified as Unicode or + strings. Outputs are Unicode objects. All dates are + specified as Unix timestamps, printed as strings. Please + refer to the rcldb/rcldoc.h C++ file + for a description of the predefined attributes. 
+ + At query time, only the fields that are defined + as stored either by default or in + the fields configuration file will be + meaningful in the Doc + object. Especially this will not be the case for the + document text. See the rclextract + module for accessing document contents. + + + Methods + + + get(key), [] operator + Retrieve the named doc attribute + + getbinurl()Retrieve + the URL in byte array format (no transcoding), for use as + parameter to a system call. + + + items() + Return a dictionary of doc object + keys/values + + + keys() + list of doc object keys (attribute + names). + + + + + + + The SearchData class + + A SearchData object allows building + a query by combining clauses, for execution + by Query.executesd(). It can be used + in replacement of the query language approach. The + interface is going to change a little, so no detailed doc + for now... + + + Methods + + + addclause(type='and'|'or'|'excl'|'phrase'|'near'|'sub', + qstring=string, slack=0, field='', stemming=1, + subSearch=SearchData) + + + + + + + + + + + The rclextract module + + Document content is not provided by an index query. To + access it, the data extraction part of the indexing process + must be performed (subdocument access and format + translation). This is not trivial in + general. The rclextract module currently + provides a single class which can be used to access the data + content for result documents. + + + Classes + + + The Extractor class + + + Methods + + + Extractor(doc) + An Extractor object is + built from a Doc object, output + from a query. + + + Extractor.textextract(ipath) + Extract document defined + by ipath and return + a Doc object. The doc.text field + has the document text as either text/plain or + text/html according to doc.mimetype. + + + Extractor.idoctofile() + Extracts document into an output file, + which can be given explicitly or will be created as a + temporary file to be deleted by the caller. 
+ + + + + + + - - Example code The following sample would query the index with a user language string. See the python/samples - directory inside the &RCL; source for other examples. + directory inside the &RCL; source for other + examples. The recollgui subdirectory + has a very embryonic GUI which demonstrates the + highlighting and data extraction functions. #!/usr/bin/env python 5: nres = 5 -while query.next >= 0 and query.next < nres: +for i in range(nres): doc = query.fetchone() - print query.next + print "Result #%d" % (query.rownumber,) for k in ("title", "size"): print k, ":", getattr(doc, k).encode('utf-8') abs = db.makeDocAbstract(doc, query).encode('utf-8') print abs print - ]]> + + Compatibility with the previous version + + The following code fragments can be used to ensure that + code can run with both the old and the new API (as long as it + does not use the new abilities of the new API of + course). + + Adapting to the new package structure: + + + + + Adapting to the change of nature of + the next Query + member. The same test can be used to choose to use + the scroll() method (new) or set + the next value (old). + + + + + + diff --git a/src/python/recoll/pyrclextract.cpp b/src/python/recoll/pyrclextract.cpp index d38f4543..e5b4986a 100644 --- a/src/python/recoll/pyrclextract.cpp +++ b/src/python/recoll/pyrclextract.cpp @@ -150,7 +150,7 @@ Extractor_textextract(rclx_ExtractorObject* self, PyObject *args, } PyDoc_STRVAR(doc_Extractor_idoctofile, -"idoctofile(ipath)\n" +"idoctofile(ipath='', mimetype='', ofilename='')\n" "Extract document defined by ipath into a file, in its native format.\n" ); static PyObject *