# Recoll default main configuration file # The XML tags in the comments are used to help produce the documentation # from the sample/reference file, and not at all at run time, where # comments are just comments. Edit at will. # This typically lives in $prefix/share/recoll/examples and provides # default values. You can override selected parameters by adding assignments # to ~/.recoll/recoll.conf (or $RECOLL_CONFDIR/recoll.conf) # # Most of the important values in this file can be set from the GUI # configuration menus, which may be an easier approach than direct editing. # Parameters affecting what documents we index # Space-separated list of files or # directories to recursively index. Defaults to ~ (indexes # $HOME). You can use symbolic links in the list; they will be followed, # independently of the value of the followLinks variable. topdirs = ~ # Wildcard expressions for # names of files and directories that we should ignore. # White space separated list of wildcard patterns (simple # ones, not paths, must contain no / ), which will be tested against file # and directory names. The list in the default configuration does not # exclude hidden directories (names beginning with a dot), which means that # it may index quite a few things that you do not want. On the other hand, # email user agents like Thunderbird usually store messages in hidden # directories, and you probably want this indexed. One possible solution is # to have '.*' in 'skippedNames', and add things like '~/.thunderbird' # '~/.evolution' to 'topdirs'. Not even the file names are indexed for # patterns in this list, see the 'noContentSuffixes' variable for an # alternative approach which indexes the file names. Can be redefined for # any subtree.
skippedNames = #* bin CVS Cache cache* .cache caughtspam tmp \ .thumbnails .svn \ *~ .beagle .git .hg .bzr loop.ps .xsession-errors \ .recoll* xapiandb recollrc recoll.conf # List of name endings (not # necessarily dot-separated suffixes) for which we don't try MIME type # identification, and don't uncompress or index content. Only # the names will be indexed. This complements the now obsoleted mimemap # recoll_noindex list, which will go away in a future release (the move # from mimemap to recoll.conf allows editing the list through the # GUI). This is different from skippedNames because these are name ending # matches only (not wildcard patterns), and the file name itself gets # indexed normally. This can be redefined for subdirectories. noContentSuffixes = .md5 .map \ .o .lib .dll .a .sys .exe .com \ .mpp .mpt .vsd \ .img .img.gz .img.bz2 .img.xz .image .image.gz .image.bz2 .image.xz \ .dat .bak .rdf .log.gz .log .db .msf .pid \ ,v ~ # # Space-separated list of # wildcard expressions for paths we shouldn't go into. Can # contain files and directories. The database and configuration directories # will automatically be added. The expressions are matched using 'fnmatch(3)' # with the FNM_PATHNAME flag set by default. This means that '/' characters # must be matched explicitly. You can set 'skippedPathsFnmPathname' to 0 # to disable the use of FNM_PATHNAME (meaning that '/*/dir3' will match # '/dir1/dir2/dir3'). The default contains the usual mount point for # removable media to remind people that it is a bad idea to # naively have recoll work on these (esp. with the monitor: media gets # indexed on mount, all data gets erased on unmount). Typically the # presence of '/media' is mostly a reminder, it would only have effect for # someone who is indexing '/'. Explicitly adding '/media/xxx' to the # topdirs will override this. skippedPaths = /media # Set to 0 to # override use of FNM_PATHNAME for matching skipped # paths.
#skippedPathsFnmPathname = 1 # skippedPaths equivalent specific to # real time indexing. This enables having parts of the tree # which are initially indexed but not monitored. If daemSkippedPaths is # not set, the daemon uses skippedPaths. #daemSkippedPaths = # Space-separated list of # wildcard expressions for names that should be ignored # inside zip archives. This is used directly by the zip # handler, and has a function similar to skippedNames, but # works independently. Can be redefined for subdirectories. Supported by # recoll 1.20 and newer. See # https://bitbucket.org/medoc/recoll/wiki/Filtering%20out%20Zip%20archive%20members # #zipSkippedNames = # Follow symbolic links during # indexing. The default is to ignore symbolic links to avoid # multiple indexing of linked files. No effort is made to avoid duplication # when this option is set to true. This option can be set individually for # each of the 'topdirs' members by using sections. It cannot be changed # below the 'topdirs' level. Links in the 'topdirs' list itself are always # followed. #followLinks = 0 # Restrictive list of # indexed MIME types. Normally not set (in which case all # supported types are indexed). If it is set, # only the types from the list will have their contents indexed. The names # will be indexed anyway if indexallfilenames is set (default). MIME # type names should be taken from the mimemap file. Can be redefined for # subtrees. #indexedmimetypes = # List of excluded MIME # types. Lets you exclude some types from indexing. Can be # redefined for subtrees. #excludedmimetypes = # Size limit for compressed # files. We need to decompress these in a # temporary directory for identification, which can be wasteful in some # cases. Limit the waste. Negative means no limit. 0 results in no # processing of any compressed file. compressedfilemaxkbs = 50000 # Size limit for text # files. Mostly for skipping monster # logs.
textfilemaxmbs = 20 # Index the file names of # unprocessed files. Index the names of files the contents of # which we don't index because of an excluded or unsupported MIME # type. indexallfilenames = 1 # Use a system command # for file MIME type guessing as a final step in file type # identification. This is generally useful, but will usually # cause the indexing of many bogus 'text' files. See 'systemfilecommand' # for the command used. usesystemfilecommand = 1 # Command used to guess # MIME types if the internal methods fail. This should be a # "file -i" workalike. The file path will be added as a last parameter to # the command line. 'xdg-mime' works better than the traditional 'file' # command, and is now the configured default (with a hard-coded fallback to # 'file'). systemfilecommand = xdg-mime query filetype # Decide if we process the # Web queue. The queue is a directory where the Recoll Web # browser plugins create the copies of visited pages. processwebqueue = 0 # Page size for text # files. If this is set, text/plain files will be divided # into documents of approximately this size. Will reduce memory usage at # index time and help with loading data in the preview window at query # time. Particularly useful with very big files, such as application or # system logs. textfilepagekbs = 1000 # Size limit for archive # members. This is passed to the filters in the environment # as RECOLL_FILTER_MAXMEMBERKB. membermaxkbs = 50000 # Parameters affecting how we generate terms # Changing some of these parameters will imply a full # reindex. Also, when using multiple indexes, it may not make sense # to search indexes that don't share the values for these parameters, # because they usually affect both search and index operations. # Decide if we store # character case and diacritics in the index. If we do, # searches sensitive to case and diacritics can be performed, but the index # will be bigger, and some marginal weirdness may sometimes occur.
The # default is a stripped index. When using multiple indexes for a search, # this parameter must be defined identically for all. Changing the value # implies an index reset. indexStripChars = 1 # Decides if terms will be # generated for numbers. For example "123", "1.5e6", # 192.168.1.4, would not be indexed if nonumbers is set ("value123" would # still be). Numbers are often quite interesting to search for, and this # should probably not be set except for special situations, i.e., scientific # documents with huge amounts of numbers in them, where setting nonumbers # will reduce the index size. This can only be set for a whole index, not # for a subtree. #nonumbers = 0 # Determines if we index # 'coworker' also when the input is 'co-worker'. This is new # in version 1.22, and on by default. Setting the variable to off allows # restoring the previous behaviour. #dehyphenate = 1 # Decides if specific East Asian # (Chinese Korean Japanese) characters/word splitting is turned # off. This will save a small amount of CPU if you have no CJK # documents. If your document base does include such text but you are not # interested in searching it, setting nocjk may be a # significant time and space saver. #nocjk = 0 # This lets you adjust the size of # n-grams used for indexing CJK text. The default value of 2 is # probably appropriate in most cases. A value of 3 would allow more precision # and efficiency on longer words, but the index will be approximately twice # as large. #cjkngramlen = 2 # Languages for # which to create stemming expansion data. Stemmer names can # be found on http://www.xapian.org, or by executing 'recollindex -l', or # this can also be set from a list in the GUI. indexstemminglanguages = english # Default character # set. This is used for files which do not contain a # character set definition (e.g.: text/plain). Values found inside files, # e.g. a 'charset' tag in HTML documents, will override it.
If this is not # set, the default character set is the one defined by the NLS environment # ($LC_ALL, $LC_CTYPE, $LANG), or ultimately iso-8859-1 (cp-1252 in fact). # If for some reason you want a general default which does not match your # LANG and is not 8859-1, use this variable. This can be redefined for any # sub-directory. #defaultcharset = iso-8859-1 # A list of characters, # encoded in UTF-8, which should be handled specially # when converting text to unaccented lowercase. For # example, in Swedish, the letter a with diaeresis has full alphabet # citizenship and should not be turned into an a. # Each element in the space-separated list has the special character as # first element and the translation following. The handling of both the # lowercase and upper-case versions of a character should be specified, as # membership in the list will turn off both standard accent and case # processing. The value is global and affects both indexing and querying. # Examples: # Swedish: # unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl åå Åå # German: # unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl # In French, you probably want to decompose oe and ae and nobody would type # a German ß # unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl # Reasonable default for all until someone protests. These decompositions # are not performed by unac, but I can't imagine someone typing the composed # forms in a search. # unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl # Overrides the default # character set for email messages which don't specify # one. This is mainly useful for readpst (libpst) dumps, # which are utf-8 but do not say so. #maildefcharset= # Set fields on all files # (usually of a specific fs area). Syntax is the usual: # name = value ; attr1 = val1 ; [...] # value is empty so this needs an initial semi-colon.
This is useful, e.g., # for setting the rclaptg field for application selection inside # mimeview. #[/some/app/directory] #localfields = ; rclaptg = someapp; otherfield = somevalue # Use mtime instead of # ctime to test if a file has been modified. The time is used # in addition to the size, which is always used. # Setting this can reduce re-indexing on systems where extended attributes # are used (by some other application), but not indexed, because changing # extended attributes only affects ctime. # Notes: # - This may prevent detection of change in some marginal file rename cases # (the target would need to have the same size and mtime). # - You should probably also set noxattrfields to 1 in this case, except if # you still prefer to perform xattr indexing, for example if the local # file update pattern makes it of value (as in general, there is a risk # for pure extended attributes updates without file modification to go # undetected). Perform a full index reset after changing this. # testmodifusemtime = 0 # Disable extended attributes # conversion to metadata fields. This probably needs to be # set if testmodifusemtime is set. noxattrfields = 0 # Define commands to # gather external metadata, e.g. tmsu tags. # There can be several entries, separated by semi-colons, each defining # which field name the data goes into and the command to use. Don't forget the # initial semi-colon. All the field names must be different. You can use # aliases in the "field" file if necessary. # As a not too pretty hack conceded to convenience, any field name # beginning with "rclmulti" will be taken as an indication that the command # returns multiple field values inside a text blob formatted as a recoll # configuration file ("fieldname = fieldvalue" lines). The rclmultixx name # will be ignored, and field names and values will be parsed from the data.
# #[/some/area/of/the/fs] #metadatacmds = ; tags = tmsu tags %f; rclmulti1 = cmdOutputsConf %f # Parameters affecting where and how we store things # Top directory for Recoll # data. Recoll data directories are normally located relative # to the configuration directory (e.g. ~/.recoll/xapiandb, # ~/.recoll/mboxcache). If 'cachedir' is set, the directories are stored under # the specified value instead (e.g. if cachedir is ~/.cache/recoll, the # default dbdir would be ~/.cache/recoll/xapiandb). This affects dbdir, # webcachedir, mboxcachedir, aspellDicDir, which can still be individually # specified to override cachedir. Note that if you have multiple # configurations, each must have a different cachedir; there is no # automatic computation of a subpath under cachedir. #cachedir = ~/.cache/recoll # Maximum file system occupation # over which we stop indexing. The value is a percentage, # corresponding to what the "Capacity" df output column shows. The default # value is 0, meaning no checking. maxfsoccuppc = 0 # Xapian database directory # location. This will be created on first indexing. If the # value is not an absolute path, it will be interpreted as relative to # cachedir if set, or the configuration directory (-c argument or # $RECOLL_CONFDIR). If nothing is specified, the default is then # ~/.recoll/xapiandb/ dbdir = xapiandb # Name of the scratch file where # the indexer process updates its status. Default: # idxstatus.txt inside the configuration directory #idxstatusfile = idxstatus.txt # # # Directory location for storing mbox message offsets cache # files. This is normally 'mboxcache' under cachedir if set, # or else under the configuration directory, but it may be useful to share # a directory between different configurations. #mboxcachedir = mboxcache # # # Minimum mbox file size over which we cache the offsets. # There is really no sense in caching offsets for small files. The # default is 5 MB.
#mboxcacheminmbs = 5 # # # Directory where we store the archived web pages. # This is only used by the web history indexing code. # Default: cachedir/webcache if cachedir is set, else # $RECOLL_CONFDIR/webcache webcachedir = webcache # # Maximum size in MB of the Web archive. # This is only used by the web history indexing code. # Default: 100 MB. # Reducing the size will not physically truncate the file. webcachemaxmbs = 100 # # # The path to the Web indexing queue. This is # hard-coded in the plugin as ~/.recollweb/ToIndex so there should be no # need or possibility to change it. #webqueuedir = ~/.recollweb/ToIndex # # # Aspell dictionary storage directory location. The # aspell dictionary (aspdict.(lang).rws) is normally stored in the # directory specified by cachedir if set, or under the configuration # directory. #aspellDicDir = # # # Directory location for executable input handlers. If # RECOLL_FILTERSDIR is set in the environment, we use it instead. Defaults # to $prefix/share/recoll/filters. Can be redefined for # subdirectories. #filtersdir = /path/to/my/filters # # # Directory location for icons. The only reason to # change this would be if you want to change the icons displayed in the # result list. Defaults to $prefix/share/recoll/images. #iconsdir = /path/to/my/icons # Parameters affecting indexing performance and resource # usage # # # Threshold (megabytes of new data) where we flush from memory to disk # index. # Setting this allows some control over memory usage by the indexer # process. A value of 0 means no explicit flushing, which lets Xapian # perform its own thing, meaning flushing every XAPIAN_FLUSH_THRESHOLD # documents created, modified or deleted. XAPIAN_FLUSH_THRESHOLD is an # environment variable. As memory usage depends on average document size, # not only document count, this is not very useful. # The default value of 10 MB may be a bit low. If you are looking for # maximum speed, you may want to experiment with values between 20 and # 80.
In my experience, values beyond 100 are always counterproductive. If # you find otherwise, please drop me a note. idxflushmb = 10 # # # Maximum external filter execution time in # seconds. Default 1200 (20 min). Set to 0 for no limit. This # is mainly to avoid infinite loops in PostScript files # (loop.ps). filtermaxseconds = 1200 # # # Maximum virtual memory space for filter processes # (setrlimit(RLIMIT_AS)), in megabytes. Note that this # includes any mapped libs (there is no reliable Linux way to limit the # data space only), so we need to be a bit generous here. Anything over # 2000 will be ignored on 32-bit machines. filtermaxmbytes = 2000 # # # Stage input queues configuration. There are three # internal queues in the indexing pipeline stages (file data extraction, # terms generation, index update). This parameter defines the queue depths # for each stage (three integer values). If a value of -1 is given for a # given stage, no queue is used, and the thread will go on performing the # next stage. In practice, deep queues have not been shown to increase # performance. Default: a value of 0 for the first queue tells Recoll to # perform autoconfiguration based on the detected number of CPUs (no need # for the two other values in this case). Use thrQSizes = -1 -1 -1 to # disable multithreading entirely. thrQSizes = 0 # # # Number of threads used for each indexing stage. The # three stages are: file data extraction, terms generation, index # update. The use of the counts is also controlled by some special values # in thrQSizes: if the first queue depth is 0, all counts are ignored # (autoconfigured); if a value of -1 is used for a queue depth, the # corresponding thread count is ignored. It makes no sense to use a value # other than 1 for the last stage because updating the Xapian index is # necessarily single-threaded (and protected by a mutex). #thrTCounts = 4 2 1 # Miscellaneous parameters # # # Debug log verbosity 1-6. 2 is errors/warnings # only.
3 is information like document updates, 4 is quite verbose and 6 very # verbose. loglevel = 3 # # # Debug log destination. Use 'stderr' (default) to write to the # console. logfilename = stderr # # # Override loglevel for the indexer. #idxloglevel = 3 # # # Override logfilename for the indexer. #idxlogfilename = stderr # # # Override loglevel for the indexer in real time # mode. The default is to use the idx... values if set, else # the log... values. #daemloglevel = 3 # # # Override logfilename for the indexer in real time # mode. The default is to use the idx... values if set, else # the log... values. #daemlogfilename = /dev/null # # # Indexing process current directory. The input # handlers sometimes leave temporary files in the current directory, so it # makes sense to have recollindex chdir to some temporary directory. Three # possible types of values: # - (literal) tmp : go to temp dir as set by environment (RECOLL_TMPDIR else # TMPDIR else /tmp) # - Empty: stay where started # - Absolute path value: go there. idxrundir = tmp # # # Script used to heuristically check if we need to retry indexing # files which previously failed. The default script checks # the modified dates on /usr/bin and /usr/local/bin. A relative path will # be looked up in the filters dirs, then in the path. Use an absolute path # to do otherwise. checkneedretryindexscript = rclcheckneedretry.sh # # # Additional places to search for helper executables. # This is only used on Windows for now. #recollhelperpath = c:/someprog/bin;c:/someotherprog/bin # # # Length of abstracts we store while indexing. # Recoll stores an abstract for each indexed file. # The text can come from an actual 'abstract' section in the # document or will just be the beginning of the document. It is stored in # the index so that it can be displayed inside the result lists without # decoding the original file. The idxabsmlen parameter # defines the size of the stored abstract. The default value is 250 # bytes.
The search interface gives you the choice to display this stored # text or a synthetic abstract built by extracting text around the search # terms. If you always prefer the synthetic abstract, you can reduce this # value and save a little space. #idxabsmlen = 250 # # # Truncation length of stored metadata fields. This # does not affect indexing (the whole field is processed anyway), just the # amount of data stored in the index for the purpose of displaying fields # inside result lists or previews. The default value is 150 bytes which # may be too low if you have custom fields. #idxmetastoredlen = 150 # # # Language definitions to use when creating the aspell # dictionary. The value must match a set of aspell language # definition files. You can type "aspell dicts" to see a list. The default # if this is not set is to use the NLS environment to guess the # value. #aspellLanguage = en # # # Additional parameter to aspell dictionary creation # command. Some aspell packages may need an additional option # (e.g. on Debian Jessie). See Debian bug 772415. #aspellAddCreateParam = --local-data-dir=/usr/lib/aspell # # # Set this to have a look at aspell dictionary creation # errors. There are always many, so this is mostly for # debugging. #aspellKeepStderr = 1 # # # Disable aspell use. The aspell dictionary generation # takes time, and some combinations of aspell version, language, and local # terms, result in aspell crashing, so it sometimes makes sense to just # disable the thing. #noaspell = 1 # # # Seconds between auxiliary databases updates (stemdb, # aspell). The default is one hour. #monauxinterval = 3600 # # # Minimum interval (seconds) between processings of the indexing # queue. The real time monitor does not process each event # when it comes in, but lets the queue accumulate, to diminish overhead and # to aggregate multiple events to the same file. Default: 30 seconds. #monixinterval = 30 # # # Timing parameters for the real time indexing.
# Definitions for files which get a longer delay before reindexing # is allowed. This is for fast-changing files, that should only be # reindexed once in a while. A list of wildcardPattern:seconds pairs. The # patterns are matched with fnmatch(pattern, path, 0). You can quote entries # containing white space with double quotes (quote the whole entry, not the # pattern). The default is empty. Example: mondelaypatterns = *.log:20 # "*with spaces.*:30" #mondelaypatterns = *.log:20 "*with spaces.*:30" # # # ionice class for the real time indexing process. # On platforms where this is supported, the default value is # 3. # monioniceclass = 3 # # # ionice class parameter for the real time indexing process. # On platforms where this is supported. The default is # empty. #monioniceclassdata = # Query-time parameters (no impact on the index) # # # Auto-trigger diacritics sensitivity (raw index only). # If the index is not stripped, decide if we automatically trigger # diacritics sensitivity if the search term has accented characters (not in # unac_except_trans). Else you need to use the query language and the "D" # modifier to specify diacritics sensitivity. Default is no. autodiacsens = 0 # # # Auto-trigger case sensitivity (raw index only). If # the index is not stripped (see indexStripChars), decide if we # automatically trigger character case sensitivity if the search term has # upper-case characters in any but the first position. Else you need to use # the query language and the "C" modifier to specify character-case # sensitivity. Default is yes. autocasesens = 1 # Maximum query expansion count # for a single term (e.g.: when using wildcards). This only # affects queries, not indexing. We used to not limit this at all (except # for filenames where the limit was too low at 1000), but it is # unreasonable with a big index. Default 10000. maxTermExpand = 10000 # Maximum number of clauses # we add to a single Xapian query. This only affects queries, # not indexing.
In some cases, the result of term expansion can be # multiplicative, and we want to avoid eating all the memory. Default # 50000. maxXapianClauses = 50000 # # # Maximum number of positions we walk while populating a snippet for the # result list. The default of 1,000,000 may be insufficient # for big documents; the consequence would be snippets with possibly # meaning-altering missing words. snippetMaxPosWalk = 1000000 # Parameters for the PDF input script # # # Attempt OCR of PDF files with no text content if both tesseract and # pdftoppm are installed. The default is off because OCR is so # very slow. #pdfocr = 0 # # # Enable PDF attachment extraction by executing pdftk (if # available). This is # normally disabled, because it does slow down PDF indexing a bit even if # not one attachment is ever found. #pdfattach = 0 # Parameters set for specific locations # You could specify different parameters for a subdirectory like this: #[~/hungariandocs/plain] #defaultcharset = iso-8859-2 [/usr/share/man] followLinks = 1 # # # Enable thunderbird/mozilla-seamonkey mbox format quirks. # Set this for the directory where the email mbox files are # stored. [~/.thunderbird] mhmboxquirks = tbird [~/.mozilla] mhmboxquirks = tbird # pidgin / purple directories for IRC chats have names beginning with # [~/.purple] skippedNames =