diff --git a/src/doc/user/Makefile b/src/doc/user/Makefile index 7ff97ea7..2c768bc3 100644 --- a/src/doc/user/Makefile +++ b/src/doc/user/Makefile @@ -1,5 +1,37 @@ +# Wherever docbook.xsl and chunk.xsl live +# Fbsd +#XSLDIR="/usr/local/share/xsl/docbook/" +# Mac +#XSLDIR="/opt/local/share/xsl/docbook-xsl/" +#Linux +XSLDIR="/usr/share/xml/docbook/stylesheet/docbook-xsl/" + + +# Options common to the single-file and chunked versions +commonoptions=--stringparam section.autolabel 1 \ + --stringparam section.autolabel.max.depth 3 \ + --stringparam section.label.includes.component.label 1 \ + --stringparam autotoc.label.in.hyperlink 0 \ + --stringparam abstract.notitle.enabled 1 \ + --stringparam html.stylesheet docbook-xsl.css \ + --stringparam generate.toc "book toc,title,figure,table,example,equation" + + +all: usermanual.html index.html usermanual.pdf + usermanual.html: usermanual.xml - sh xmlmake.sh + xsltproc ${commonoptions} \ + -o tmpfile.html "${XSLDIR}/html/docbook.xsl" usermanual.xml + -tidy -indent tmpfile.html > usermanual.html + +index.html: usermanual.xml + xsltproc ${commonoptions} \ + --stringparam use.id.as.filename 1 \ + --stringparam root.filename index \ + "${XSLDIR}/html/chunk.xsl" usermanual.xml + +usermanual.pdf: usermanual.xml + dblatex usermanual.xml clean: - rm -f RCL.*.html usermanual.pdf usermanual.html index.html + rm -f RCL.*.html usermanual.pdf usermanual.html index.html tmpfile.html diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index f71bba8a..380c07c2 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -39,9 +39,6 @@ This document introduces full text search notions and describes the installation and use of the &RCL; application. It currently describes &RCL; &RCLVERSION;. - @@ -141,7 +138,7 @@ &RCL; stores all internal data in Unicode UTF-8 format, and it can index files with different character sets, encodings, and languages into the same - index. 
It has input filters for many document types. + index. It can process many document types. Stemming is the process by which &RCL; reduces words to their radicals so that searching does not depend, for example, on a @@ -381,9 +378,9 @@ patterns to the skippedNames list, which can be done from the GUI Index configuration menu. It is also possible to exclude a mime type independantly of the - file name by associating it with - the rclnull filter. This can be done by - editing the + file name by associating it with the + rclnull input handler. This can be done + by editing the mimeconf configuration file. @@ -2463,7 +2460,7 @@ fs.inotify.max_user_watches=32768 and filename), so this feature will need some custom local configuration to be useful. An example candidate would be the recipient field - which is generated by the message filters. + which is generated by the message input handlers. The default value for the paragraph format string is: More about wildcards. - The document filters used while indexing have the + The document input handlers used while indexing have the possibility to create other fields with arbitrary names, and aliases may be defined in the configuration, so that the exact field search possibilities may be different for you if someone @@ -3293,7 +3290,7 @@ dir:recoll dir:src -dir:utils -dir:common Python language. Another less radical way to extend the application is to - write filters for new types of documents. + write input handlers for new types of documents. The processing of metadata attributes for documents (fields) is highly configurable. @@ -3301,69 +3298,77 @@ dir:recoll dir:src -dir:utils -dir:common - Writing a document filter + Writing a document input handler + + TerminologyThe small programs or pieces + of code which handle the processing of the different document + types for &RCL; used to be called filters, + which is still reflected in the name of the directory which + holds them and many configuration variables. 
They were named + this way because one of their primary functions is to filter + out the formatting directives and keep the text + content. However these modules may have other behaviours, and + the term input handler is now progressively + substituted in the documentation. filter is + still used in many places though. - &RCL; filters cooperate to translate from the multitude + &RCL; input handlers cooperate to translate from the multitude of input document formats, simple ones as opendocument, acrobat), or compound ones such as Zip or Email, into the final &RCL; - indexing input format, which may - be text/plain - or text/html. Most filters are executable - programs or scripts. A few filters are coded in C++ and live + indexing input format, which is plain text. + Most input handlers are executable + programs or scripts. A few handlers are coded in C++ and live inside recollindex. This latter kind will not be described here. There are currently (1.18 and since 1.13) two kinds of - external executable filters: + external executable input handlers: - Simple filters (exec - filters) run once and - exit. They can be bare programs - like antiword, or scripts - using other programs. They are very simple to write, - because they just need to print the converted document - to the standard output. Their output can - be text/plain - or text/html. + Simple exec handlers + run once and exit. They can be bare programs like + antiword, or scripts using other + programs. They are very simple to write, because they just + need to print the converted document to the standard + output. Their output can be plain text or HTML. HTML is + usually preferred because it can store metadata fields and + it allows preserving some of the formatting for the GUI + preview. - Multiple filters (execm - filters), run as long as - their master process (recollindex) is - active. 
They can process multiple files (sparing the - process startup time which can be very significant), - or multiple documents per file (e.g.: for zip or chm - files). They communicate with the indexer through a - simple protocol, but are nevertheless a bit more - complicated than the older kind. Most of new - filters are written - in Python, using a common - module to handle the protocol. There is an - exception, rclimg which is written - in Perl. The subdocuments output by these filters can - be directly indexable (text or HTML), or they can be - other simple or compound documents that will need to - be processed by another filter. + Multiple execm handlers + can process multiple files (sparing the process startup + time which can be very significant), or multiple documents + per file (e.g.: for zip or + chm files). They communicate + with the indexer through a simple protocol, but are + nevertheless a bit more complicated than the older + kind. Most of the new handlers are written in + Python, using a common module + to handle the protocol. There is an exception, + rclimg which is written in Perl. The + subdocuments output by these handlers can be directly + indexable (text or HTML), or they can be other simple or + compound documents that will need to be processed by + another handler. - In both cases, filters deal with regular file system + In both cases, handlers deal with regular file system files, and can process either a single document, or a linear list of documents in each file. &RCL; is responsible for performing up to date checks, deal with more complex embedding and other upper level issues. - In the extreme case of a simple filter returning a - document in text/plain format, no - metadata can be transferred from the filter to the - indexer. Generic metadata, like document size or - modification date, will be gathered and stored by the - indexer. + A simple handler returning a + document in text/plain format can transfer + no metadata to the indexer. 
Generic metadata, like document + size or modification date, will be gathered and stored by + the indexer. - Filters that produce text/html + Handlers that produce text/html format can return an arbitrary amount of metadata inside HTML meta tags. These will be processed according to the directives found in @@ -3371,7 +3376,7 @@ dir:recoll dir:src -dir:utils -dir:common fields configuration file. - The filters that can handle multiple documents per file + The handlers that can handle multiple documents per file return a single piece of data to identify each document inside the file. This piece of data, called an ipath element will be sent back by @@ -3380,27 +3385,27 @@ dir:recoll dir:src -dir:utils -dir:common viewer. The following section describes the simple - filters, and the next one gives a few explanations about + handlers, and the next one gives a few explanations about the execm ones. You could conceivably - write a simple filter with only the elements in the + write a simple handler with only the elements in the manual. This will not be the case for the other ones, for which you will have to look at the code. - Simple filters + Simple input handlers - &RCL; simple filters are usually shell-scripts, but this is in + &RCL; simple handlers are usually shell-scripts, but this is in no way necessary. Extracting the text from the native format is the difficult part. Outputting the format expected by &RCL; is trivial. Happily enough, most document formats have translators or - text extractors which can be called from the filter. In some cases + text extractors which can be called from the handler. In some cases the output of the translating program is completely appropriate, and no intermediate shell-script is needed. - Filters are called with a single argument which is the + Input handlers are called with a single argument which is the source file name. They should output the result to stdout. 
- When writing a filter, you should decide if it will output + When writing a handler, you should decide if it will output plain text or HTML. Plain text is simpler, but you will not be able to add metadata or vary the output character encoding (this will be defined in a configuration file). Additionally, some formatting may @@ -3411,25 +3416,25 @@ dir:recoll dir:src -dir:utils -dir:common The RECOLL_FILTER_FORPREVIEW environment variable (values yes, no) - tells the filter if the operation is for indexing or - previewing. Some filters use this to output a slightly different + tells the handler if the operation is for indexing or + previewing. Some handlers use this to output a slightly different format, for example stripping uninteresting repeated keywords (ie: Subject: for email) when indexing. This is not essential. - You should look at one of the simple filters, for example + You should look at one of the simple handlers, for example rclps for a starting point. - Don't forget to make your filter executable before + Don't forget to make your handler executable before testing ! - "Multiple" filters + "Multiple" handlers If you can program and want to write - an execm filter, it should not be too + an execm handler, it should not be too difficult to make sense of one of the existing modules. For example, look at rclzip which uses Zip file paths as identifiers (ipath), @@ -3438,7 +3443,7 @@ dir:recoll dir:src -dir:utils -dir:common the internfile/mh_execm.h file and possibly at the corresponding module. - execm filters sometimes need to make + execm handlers sometimes need to make a choice for the nature of the ipath elements that they use in communication with the indexer. Here are a few guidelines: @@ -3453,16 +3458,16 @@ dir:recoll dir:src -dir:utils -dir:common separator to store a complex path internally (for deeper embedding). 
Colons inside the ipath elements output by a - filter will be escaped, but would be a bad choice as a - filter-specific separator (mostly, again, for + handler will be escaped, but would be a bad choice as a + handler-specific separator (mostly, again, for debugging issues). In any case, the main goal is that it should - be easy for the filter to extract the target document, given + be easy for the handler to extract the target document, given the file name and the ipath element. - execm filters will also produce + execm handlers will also produce a document with a null ipath element. Depending on the type of document, this may have some associated data (e.g. the body of an email message), or @@ -3472,11 +3477,11 @@ dir:recoll dir:src -dir:utils -dir:common - Telling &RCL; about the filter + Telling &RCL; about the handler - There are two elements that link a file to the filter which + There are two elements that link a file to the handler which should process it: the association of file to mime type and the - association of a mime type with a filter. + association of a mime type with a handler. The association of files to mime types is mostly based on name suffixes. The types are defined inside the @@ -3490,7 +3495,7 @@ dir:recoll dir:src -dir:utils -dir:common to execute the file -i command to determine a mime type. - The association of file types to filters is performed in + The association of file types to handlers is performed in the mimeconf file. A sample will probably be of better help than a long explanation: @@ -3532,7 +3537,7 @@ application/x-chm = execm rclchm unrtf in the HTML header section. application/x-chm is processed - by a persistant filter. This is determined by the + by a persistent handler. This is determined by the execm keyword. 
@@ -3541,7 +3546,7 @@ application/x-chm = execm rclchm - Filter HTML output + Input handler HTML output The output HTML could be very minimal like the following example: @@ -3607,7 +3612,7 @@ or - Filters also have the possibility to "invent" field + Input handlers also have the possibility to "invent" field names. This should also be output as meta tags: @@ -3634,10 +3639,10 @@ or Page numbers The indexer will interpret ^L characters - in the filter output as indicating page breaks, and will record + in the handler output as indicating page breaks, and will record them. At query time, this allows starting a viewer on the right page for a hit or a snippet. Currently, only the PDF, Postscript - and DVI filters generate page breaks. + and DVI handlers generate page breaks. @@ -3651,7 +3656,7 @@ or author, abstract. The field values for documents can appear in several ways - during indexing: either output by filters + during indexing: either output by input handlers as meta fields in the HTML header section, or extracted from file extended attributes, or added as attributes of the Doc object when using the API, or @@ -3661,7 +3666,7 @@ or specific field. &RCL; defines a number of default fields. Additional - ones can be output by filters, and described in the + ones can be output by handlers, and described in the fields configuration file. Fields can be: @@ -3903,7 +3908,7 @@ or The Db class A Db object is created by - a connect() function and holds a + a connect() call and holds a connection to a Recoll index. Methods @@ -4381,7 +4386,7 @@ except: directory. A list of common file types which need external - commands follows. Many of the filters need the + commands follows. Many of the handlers need the iconv command, which is not always listed as a dependancy. @@ -4398,7 +4403,7 @@ except: type is important to you. 
As of &RCL; release 1.14, a number of XML-based formats that - were handled by ad hoc filter code now use the + were handled by ad hoc handler code now use the xsltproc command, which usually comes with libxslt. These are: abiword, fb2 (ebooks), kword, openoffice, svg. @@ -4425,8 +4430,8 @@ except: be used as a fallback for some files which antiword does not handle. - MS Excel and PowerPoint need - catdoc. + MS Excel and PowerPoint are processed by + internal Python handlers. MS Open XML (docx) needs xsltproc. @@ -4451,15 +4456,10 @@ except: djvused from the DjVuLibre package. - Audio files: &RCL; releases before 1.13 - used the id3info command from the - id3lib package to extract mp3 tag information, - metaflac (standard flac tools) for flac files, - and ogginfo (vorbis tools) for ogg - files. Releases 1.14 and later use a single - Python filter based - on mutagen for all audio file - types. + Audio files: &RCL; releases 1.14 and later use + a single Python handler based + on mutagen for all audio file + types. Pictures: &RCL; uses the @@ -4471,7 +4471,7 @@ except: store personal tags or textual descriptions inside the image files. - chm: files in microsoft help format need Python and + chm: files in Microsoft help format need Python and the pychm module (which needs chmlib). @@ -4498,15 +4498,15 @@ except: Konqueror webarchive format with Python (uses the Tarfile module). - mimehtml web archive format (support based on the email - filter, which introduces some mild weirdness, but still - usable). + Mimehtml web archive format (support based on + the email handler, which introduces some mild weirdness, but + still usable). Text, HTML, email folders, and Scribus files are processed internally. Lyx is used to - index Lyx files. Many filters need iconv and the + index Lyx files. Many handlers need iconv and the standard sed and awk. 
@@ -4994,10 +4994,10 @@ skippedPaths = ~/somedir/*.txt A space-separated list of patterns for names of files or directories that should be ignored inside zip archives. This is used directly by the zip - filter, and has a function similar to skippedNames, but + handler, and has a function similar to skippedNames, but works independantly. Can be redefined for filesystem subdirectories. For versions up to 1.19, you will need - to update the Zip filter and install a supplementary + to update the Zip handler and install a supplementary Python module. The details are described on the &RCL; wiki. @@ -5552,13 +5552,13 @@ mondelaypatterns = *.log:20 "this one has spaces*:10" filtermaxseconds - Maximum filter execution time, after which it + Maximum handler execution time, after which it is aborted. Some postscript programs just loop... filtersdir A directory to search for the external - filter scripts used to index some types of files. The + input handler scripts used to index some types of files. The value should not be changed, except if you want to modify one of the default scripts. The value can be redefined for any sub-directory. @@ -5678,9 +5678,9 @@ mondelaypatterns = *.log:20 "this one has spaces*:10" - filter-specific sections - Some filters may need specific - configuration for handling fields. Only the email message filter + handler-specific sections + Some input handlers may need specific + configuration for handling fields. Only the email message handler currently has such a section (named [mail]). It allows indexing arbitrary email headers in addition to the ones indexed by default. Other such @@ -5694,7 +5694,7 @@ mondelaypatterns = *.log:20 "this one has spaces*:10" fields file. This would extract a specific email header and use it as a searchable field, with data displayable inside result - lists. (Side note: as the email filter does no decoding on the values, + lists. 
(Side note: as the email handler does no decoding on the values, only plain ascii headers can be indexed, and only the first occurrence will be used for headers that occur several times). @@ -6007,7 +6007,7 @@ application/x-blobapp = exec rclblob - The rclblob filter should + The rclblob handler should be an executable program or script which exists inside /usr/[local/]share/recoll/filters. It will be given a file name as argument and should output the @@ -6015,7 +6015,7 @@ application/x-blobapp = exec rclblob The filter programming section describes in more detail how - to write a filter. + to write an input handler. diff --git a/src/doc/user/xmlmake.sh b/src/doc/user/xmlmake.sh deleted file mode 100644 index 2b6e66b3..00000000 --- a/src/doc/user/xmlmake.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/sh - -# A script to produce the Recoll manual with an xml toolchain. -# Tools used: -# - xsltproc -# - The docbook-xsl styleets -# - dblatex for producing the PDF. -# -# Limitations: -# - Does not produce the links to the whole/chunked versions at the top -# of the document -# - The anchor names from the source text are converted to uppercase -# by the sgml toolchain. This does not happen with the xml -# toolchain, which means that external links like -# usermanual.html#RCL.CONFIG.INDEXING won't work because fragments -# are case-sensitive. This has been solved by converting all ids -# inside the source file to upper-case. 
DON'T REINTRODUCE -# lower-case IDS - -# Wherever docbook.xsl and chunk.xsl live -# Fbsd -#XSLDIR="/usr/local/share/xsl/docbook/" -# Mac -#XSLDIR="/opt/local/share/xsl/docbook-xsl/" -#Linux -XSLDIR="/usr/share/xml/docbook/stylesheet/docbook-xsl/" - -dochunky=1 -test $# -eq 1 && dochunky=0 - -# Options common to the single-file and chunked versions -commonoptions="--stringparam section.autolabel 1 \ - --stringparam section.autolabel.max.depth 3 \ - --stringparam section.label.includes.component.label 1 \ - --stringparam autotoc.label.in.hyperlink 0 \ - --stringparam abstract.notitle.enabled 1 \ - --stringparam html.stylesheet docbook-xsl.css \ - --stringparam generate.toc \"book toc,title,figure,table,example,equation\" \ -" - -# Do the chunky thing -if test $dochunky -ne 0 ; then - eval xsltproc $commonoptions \ - --stringparam use.id.as.filename 1 \ - --stringparam root.filename index \ - "$XSLDIR/html/chunk.xsl" \ - usermanual.xml -fi - -# Produce the single file version -eval xsltproc $commonoptions \ - -o usermanual.html \ - "$XSLDIR/html/docbook.xsl" \ - usermanual.xml - -tidy -indent usermanual.html > tmpfile -mv -f tmpfile usermanual.html - -# And the pdf with dblatex -dblatex usermanual.xml