From ce3088e635ea32c7bcb1db9c925c0d6d5eb84273 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 13 Jan 2021 09:22:25 +0100 Subject: [PATCH] recoll.conf man page --- src/doc/man/recoll.conf.5 | 116 +++++++++++++++++++++++++++++-------- src/sampleconf/recoll.conf | 2 +- 2 files changed, 92 insertions(+), 26 deletions(-) diff --git a/src/doc/man/recoll.conf.5 b/src/doc/man/recoll.conf.5 index 9b5a7714..58e3adb4 100644 --- a/src/doc/man/recoll.conf.5 +++ b/src/doc/man/recoll.conf.5 @@ -92,6 +92,11 @@ list. List of name endings to add to the default skippedNames list. .TP +.BI "onlyNames = "string +Regular file name filter patterns If this is set, only the file names not in skippedNames and +matching one of the patterns will be considered for indexing. Can be +redefined per subtree. Does not apply to directories. +.TP .BI "noContentSuffixes = "string List of name endings (not necessarily dot-separated suffixes) for which we don't try MIME type identification, and don't uncompress or @@ -292,14 +297,19 @@ will reduce the index size. This can only be set for a whole index, not for a subtree. .TP .BI "dehyphenate = "bool -Determines if we index 'coworker' also when the input is 'co-worker'. This is new +Determines if we index 'coworker' +also when the input is 'co-worker'. This is new in version 1.22, and on by default. Setting the variable to off allows restoring the previous behaviour. .TP .BI "backslashasletter = "bool -Process backslash as normal letter This may make sense for people wanting to index TeX commands as +Process backslash as normal letter. This may make sense for people wanting to index TeX commands as such but is not of much general use. .TP +.BI "underscoreasletter = "bool +Process underscore as normal letter. This makes sense in so many cases that one wonders if it should +not be the default. +.TP .BI "maxtermlength = "int Maximum term length. Words longer than this will be discarded. 
The default is 40 and used to be hard-coded, but it can now be @@ -323,7 +333,8 @@ as large. .BI "indexstemminglanguages = "string Languages for which to create stemming expansion data. Stemmer names can be found by executing 'recollindex --l', or this can also be set from a list in the GUI. +-l', or this can also be set from a list in the GUI. The values are full +language names, e.g. english, french... .TP .BI "defaultcharset = "string Default character @@ -348,16 +359,24 @@ lowercase and upper-case versions of a character should be specified, as appartenance to the list will turn-off both standard accent and case processing. The value is global and affects both indexing and querying. Examples: +.br Swedish: +.br unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl åå Åå -. German: +.br +German: +.br unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl -In French, you probably want to decompose oe and ae and nobody would type +.br +French: you probably want to decompose oe and ae and nobody would type a German ß +.br unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl -. The default for all until someone protests follows. These decompositions +.br +The default for all until someone protests follows. These decompositions are not performed by unac, but it is unlikely that someone would type the composed forms in a search. +.br unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl .TP .BI "maildefcharset = "string @@ -452,6 +471,10 @@ a directory between different configurations. Minimum mbox file size over which we cache the offsets. There is really no sense in caching offsets for small files. The default is 5 MB. .TP +.BI "mboxmaxmsgmbs = "int +Maximum mbox member message size in megabytes. Size over which we assume that the mbox format is bad or we +misinterpreted it, at which point we just stop processing the file. +.TP .BI "webcachedir = "dfn Directory where we store the archived web pages. 
This is only used by the web history indexing code Default: cachedir/webcache if cachedir is set, else @@ -515,10 +538,11 @@ is mainly to avoid infinite loops in postscript files .TP .BI "filtermaxmbytes = "int Maximum virtual memory space for filter processes -(setrlimit(RLIMIT_AS)), in megabytes. Note that this -includes any mapped libs (there is no reliable Linux way to limit the -data space only), so we need to be a bit generous here. Anything over -2000 will be ignored on 32 bits machines. +(setrlimit(RLIMIT_AS)), in megabytes. Note that this includes any mapped libs (there is no reliable +Linux way to limit the data space only), so we need to be a bit generous +here. Anything over 2000 will be ignored on 32 bits machines. The +previous default value of 2000 would prevent java pdftk from working when +executed from Python rclpdf.py. .TP .BI "thrQSizes = "string Stage input queues configuration. There are three @@ -567,6 +591,12 @@ Override logfilename for the indexer in real time mode. The default is to use the idx... values if set, else the log... values. .TP +.BI "pyloglevel = "int +Override loglevel for the python module. +.TP +.BI "pylogfilename = "fn +Override logfilename for the python module. +.TP .BI "orgidxconfdir = "dfn Original location of the configuration directory. This is used exclusively for movable datasets. Locating the configuration directory inside the directory tree makes it possible to @@ -633,9 +663,9 @@ space issues. .BI "aspellLanguage = "string Language definitions to use when creating the aspell dictionary. The value must match a set of aspell language -definition files. You can type "aspell dicts" to see a list The default -if this is not set is to use the NLS environment to guess the -value. +definition files. You can type "aspell dicts" to see a list. The default +if this is not set is to use the NLS environment to guess the value. The +values are the 2-letter language codes (e.g. 'en', 'fr'...).
.TP .BI "aspellAddCreateParam = "string Additional option and parameter to aspell dictionary creation @@ -676,13 +706,19 @@ containing white space with double quotes (quote the whole entry, not the pattern). The default is empty. Example: mondelaypatterns = *.log:20 "*with spaces.*:30" .TP +.BI "idxniceprio = "int +"nice" process priority for the indexing processes. Default: 19 +(lowest). Appeared with 1.26.5. Prior versions were fixed at 19. +.TP .BI "monioniceclass = "int -ionice class for the real time indexing process On platforms where this is supported. The default value is -3. +ionice class for the indexing process. Despite the misleading name, and on platforms where this is +supported, this affects all indexing processes, +not only the real time/monitoring ones. The default value is 3 (use +lowest "Idle" priority). .TP .BI "monioniceclassdata = "string -ionice class parameter for the real time indexing process. On platforms where this is supported. The default is -empty. +ionice class level parameter if the class supports it. The default is empty, as the default "Idle" class has no +levels. .TP .BI "autodiacsens = "bool auto-trigger diacritics sensitivity (raw index only). IF the index is not stripped, decide if we automatically trigger @@ -719,14 +755,8 @@ insufficient for very big documents, the consequence would be snippets with possibly meaning-altering missing words. .TP .BI "pdfocr = "bool -Attempt OCR of PDF files with no text content if both tesseract and -pdftoppm are installed. The default is off because OCR is so -very slow. -.TP -.BI "pdfocrlang = "string -Language to assume for PDF OCR. This is very important for having a reasonable rate of errors -with tesseract. This can also be set through a configuration variable -or directory-local parameters. See the rclpdf.py script. +Attempt OCR of PDF files with no text content. This can be defined in subdirectories. The default is off because +OCR is so very slow.
.TP .BI "pdfattach = "bool Enable PDF attachment extraction by executing pdftk (if @@ -750,6 +780,42 @@ method which will be called with the qualified tag name and value of each selected field, for editing or erasing. A new instance is created for each document, so that the object can keep state for, e.g. eliminating duplicate values. +.TP +.BI "ocrprogs = "string +OCR modules to try. The top OCR script will try to load the corresponding modules in +order and use the first which reports being capable of performing OCR on +the input file. Modules for tesseract (tesseract) and ABBYY FineReader +(abbyy) are present in the standard distribution. For compatibility with +the previous version, if this is not defined at all, the default value is +"tesseract". Use an explicit empty value if needed. A value of "abbyy +tesseract" will try everything. +.TP +.BI "ocrcachedir = "dfn +Location for caching OCR data. The default if this is empty or undefined is to store the cached +OCR data under $RECOLL_CONFDIR/ocrcache. +.TP +.BI "tesseractlang = "string +Language to assume for tesseract OCR. Important for improving the OCR accuracy. This can also be set +through the contents of a file in +the currently processed directory. See the rclocrtesseract.py +script. Example values: eng, fra... See the tesseract documentation. +.TP +.BI "tesseractcmd = "fn +Path for the tesseract command. Do not quote. This is mostly useful on Windows, or for specifying a non-default +tesseract command. E.g. on Windows: +tesseractcmd = C:/Program Files (x86)/Tesseract-OCR/tesseract.exe + +.TP +.BI "abbyylang = "string +Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set +through the contents of a file in +the currently processed directory. See the rclocrabbyy.py +script. Typical values: English, French... See the ABBYY documentation. + +.TP +.BI "abbyycmd = "fn +Path for the abbyy command. The ABBYY directory is usually not in the path, so you should set this.
+ .TP .BI "mhmboxquirks = "string Enable thunderbird/mozilla-seamonkey mbox format quirks Set this for the directory where the email mbox files are diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index c1d28da6..8d161247 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -387,7 +387,7 @@ indexstemminglanguages = english # unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl åå Åå # . German: # unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ffff fifi flfl -# In French, you probably want to decompose oe and ae and nobody would type +# . French: you probably want to decompose oe and ae and nobody would type # a German ß # unac_except_trans = ßss œoe Œoe æae Æae ffff fifi flfl # . The default for all until someone protests follows. These decompositions