diff --git a/src/doc/man/recoll.1 b/src/doc/man/recoll.1 index 88b80492..36174625 100644 --- a/src/doc/man/recoll.1 +++ b/src/doc/man/recoll.1 @@ -24,8 +24,15 @@ recoll \- user interface for the Recoll full text search system .B \-q ] + +.B recoll +[ +.B \-c + +] + .SH DESCRIPTION -The +In the first form, the .B recoll command will start the graphical user interface for querying the .B Recoll @@ -73,6 +80,20 @@ command may be specified in this case. These can control the output format and the maximum number of results to be printed. .PP Please refer to online help for a full description. +.PP +In the second form, the +.B recoll +command can be used to start a native viewer for a document indexed by +Recoll. It will understand a final URL fragment (separated by a '#' +character) to indicate an +.I ipath +, the specifier for the part of the Recoll document access path which is +internal to a container such as a mbox folder or a zip archive, and will, +if needed, create a temporary file to let a normal system utility display +the document. +.PP +The second form is mostly used for opening embedded documents from the +Ubuntu Unity Recoll lens. .SH SEE ALSO .PP recollindex(1) recollq(1) recoll.conf(5) diff --git a/src/doc/man/recoll.conf.5 b/src/doc/man/recoll.conf.5 index f0be5743..e0c8ac89 100644 --- a/src/doc/man/recoll.conf.5 +++ b/src/doc/man/recoll.conf.5 @@ -3,7 +3,7 @@ .SH NAME recoll.conf \- main personal configuration file for Recoll .SH DESCRIPTION -This file defines the indexation configuration for the Recoll full-text search +This file defines the index configuration for the Recoll full-text search system. .LP The system-wide configuration file is normally located inside @@ -13,8 +13,8 @@ may be overridden by setting it in the personal configuration file, by default: .LP Please note while we try to keep this manual page reasonably up to date, it will frequently lag the current state of the software. 
The best source of -information about the configuration are the comments in the configuration -file. +information about the configuration are the comments in the system-wide +configuration file. .LP A short extract of the file might look as follows: @@ -44,7 +44,7 @@ Empty lines or lines beginning with # are ignored. Affectation lines are in the form 'name = value'. .LP Section lines allow redefining a parameter for a directory subtree. Some of -the parameters used for indexaction are looked up hierarchically from the +the parameters used for indexing are looked up hierarchically from the more to the less specific. Not all parameters can be meaningfully redefined, this is specified for each in the next section. .LP @@ -58,11 +58,6 @@ embedded spaces can be quoted with double-quotes. .BI "topdirs = " directories Specifies the list of directories to index (recursively). .TP -.BI "dbdir = " directory -The name of the Xapian database directory. It will be created if needed -when the database is initialized. If this is not an absolute pathname, it -will be taken relative to the configuration directory. -.TP .BI "skippedNames = " patterns A space-separated list of patterns for names of files or directories that should be completely ignored. The list defined in the default file is: @@ -78,8 +73,16 @@ for the top level ones in .BI "skippedPaths = " patterns A space-separated list of patterns for paths the indexer should not descend into. Together with topdirs, this allows pruning the indexed tree to one's -content. daemSkippedPaths can be used to define a specific value for the -real time indexing monitor. +content. +.B daemSkippedPaths +can be used to define a specific value for the real time indexing monitor. +.TP +.BI "skippedPathsFnmPathname = " 0/1 +The values in the *skippedPaths variables are matched by default with +fnmatch(3), with the FNM_PATHNAME and FNM_LEADING_DIR flags. This means +that '/' characters must be matched explicitly. 
You can set +skippedPathsFnmPathname to 0 to disable the use of FNM_PATHNAME (meaning +that /*/dir3 will match /dir1/dir2/dir3). .TP .BI "followLinks = " boolean Specifies if the indexer should follow @@ -93,66 +96,6 @@ members by using sections. It can not be changed below the .I topdirs level. .TP -.BI "loglevel = " value -Verbosity level for recoll and recollindex. A value of 4 lists quite a lot of -debug/information messages. 3 lists only errors. -.B daemloglevel -can be used to specify a different value for the real-time indexing daemon. -.TP -.BI "logfilename = " file -Where should the messages go. 'stderr' can be used as a special value. -.B daemlogfilename -can be used to specify a different value for the real-time indexing daemon. -.TP -.BI "indexstemminglanguages = " languages -A list of languages for which the stem expansion databases will be -built. See recollindex(1) for possible values. -.TP -.BI "defaultcharset = " charset -The name of the character set used for files that do not contain a -character set definition (ie: plain text files). This can be redefined for -any subdirectory. -.TP -.BI "maxfsoccuppc = " percentnumber -Maximum file system occupation before we -stop indexing. The value is a percentage, corresponding to -what the "Capacity" df output column shows. The default -value is 0, meaning no checking. -.TP -.BI "idxflushmb = " megabytes -Threshold (megabytes of new text data) -where we flush from memory to disk index. Setting this can -help control memory usage. A value of 0 means no explicit -flushing, letting Xapian use its own default, which is -flushing every 10000 documents (or XAPIAN_FLUSH_THRESHOLD), meaning that -memory usage depends on average document size. The default value is 10. -.TP -.BI "filtersdir = " directory -A directory to search for the external filter scripts used to index some -types of files. The value should not be changed, except if you want to -modify one of the default scripts. 
The value can be redefined for any -subdirectory. -.TP -.BI "iconsdir = " directory -The name of the directory where -.B recoll -result list icons are stored. You can change this if you want different -images. -.TP -.BI "guesscharset = " boolean -Try to guess the character set of files if no internal value is available -(ie: for plain text files). This does not work well in general, and should -probably not be used. -.TP -.BI "usesystemfilecommand = " boolean -Decide if we use the -.B "file \-i" -system command as a final step for determining the mime type for a file -(the main procedure uses suffix associations as defined in the -.B mimemap -file). This can be useful for files with suffixless names, but it will -also cause the indexation of many bogus "text" files. -.TP .BI "indexedmimetypes = " list Recoll normally indexes any file which it knows how to read. This list lets you restrict the indexed mime types to what you specify. If the variable is @@ -166,13 +109,232 @@ wasteful if 'uninteresting' big compressed files are present. Negative means no limit, 0 means no processing of any compressed file. Defaults to \-1. .TP +.BI "textfilemaxmbs = " value +Maximum size for text files. Very big text files are often uninteresting +logs. Set to -1 to disable (default 20MB). +.TP +.BI "textfilepagekbs = " value +If this is set to other than -1, text files will be indexed as multiple +documents of the given page size. This may be useful if you do want to +index very big text files as it will both reduce memory usage at index time +and help with loading data to the preview window. A size of a few megabytes +would seem reasonable (default: 1000 : 1MB). +.TP +.BI "membermaxkbs = " "value in kilobytes" +This defines the maximum size for an archive member (zip, tar or rar at +the moment). Bigger entries will be skipped. Current default: 50000 (50 MB). 
+.TP .BI "indexallfilenames = " boolean Recoll indexes file names into a special section of the database to allow specific file names searches using wild cards. This parameter decides if file name indexing is performed only for files with mime types that would -qualify them for full text indexation, or for all files inside +qualify them for full text indexing, or for all files inside the selected subtrees, independent of mime type. .TP +.BI "usesystemfilecommand = " boolean +Decide if we use the +.B "file \-i" +system command as a final step for determining the mime type for a file +(the main procedure uses suffix associations as defined in the +.B mimemap +file). This can be useful for files with suffixless names, but it will +also cause the indexing of many bogus "text" files. +.TP +.BI "processbeaglequeue = " 0/1 +If this is set, process the directory where Beagle Web browser plugins copy +visited pages for indexing. Of course, Beagle MUST NOT be running, else +things will behave strangely. +.TP +.BI "beaglequeuedir = " "directory path" +The path to the Beagle indexing queue. This is hard-coded in the Beagle +plugin as ~/.beagle/ToIndex so there should be no need to change it. +.TP +.BI "indexStripChars = " 0/1 +Decide if we strip characters of diacritics and convert them to lower-case +before terms are indexed. If we don't, searches sensitive to case and +diacritics can be performed, but the index will be bigger, and some +marginal weirdness may sometimes occur. The default is a stripped index +(indexStripChars = 1) for now. When using multiple indexes for a search, +this parameter must be defined identically for all. Changing the value +implies an index reset. +.TP +.BI "maxTermExpand = " value +Maximum expansion count for a single term (e.g.: when using wildcards). The +default of 10000 is reasonable and will avoid queries that appear frozen +while the engine is walking the term list. 
+.TP +.BI "maxXapianClauses = " value +Maximum number of elementary clauses we can add to a single Xapian +query. In some cases, the result of term expansion can be multiplicative, +and we want to avoid using excessive memory. The default of 100 000 should +be both high enough in most cases and compatible with current typical +hardware configurations. +.TP +.BI "nonumbers = " 0/1 +If this is set to true, no terms will be generated for numbers. For example +"123", "1.5e6", 192.168.1.4, would not be indexed ("value123" would still +be). Numbers are often quite interesting to search for, and this should +probably not be set except for special situations, ie, scientific documents +with huge amounts of numbers in them. This can only be set for a whole +index, not for a subtree. +.TP +.BI "nocjk = " boolean +If this is set to true, specific east asian (Chinese Korean Japanese) +characters/word splitting is turned off. This will save a small amount of +cpu if you have no CJK documents. If your document base does include such +text but you are not interested in searching it, setting +.I nocjk +may be a significant time and space saver. +.TP +.BI "cjkngramlen = " value +This lets you adjust the size of n-grams used for indexing CJK text. The +default value of 2 is probably appropriate in most cases. A value of 3 +would allow more precision and efficiency on longer words, but the index +will be approximately twice as large. +.TP +.BI "indexstemminglanguages = " languages +A list of languages for which the stem expansion databases will be +built. See recollindex(1) for possible values. +.TP +.BI "defaultcharset = " charset +The name of the character set used for files that do not contain a +character set definition (ie: plain text files). This can be redefined for +any subdirectory. +.TP +.BI "unac_except_trans = " "list of utf-8 groups" +This is a list of characters, encoded in UTF-8, which should be handled +specially when converting text to unaccented lowercase. 
For example, in +Swedish, the letter "a with diaeresis" has full alphabet citizenship and +should not be turned into an a. +.br +Each element in the space-separated list has the special character as first +element and the translation following. The handling of both the lowercase +and upper-case versions of a character should be specified, as membership +in the list will turn off both standard accent and case processing. +.br +Note that the translation is not limited to a single character. +.br +This parameter cannot be redefined for subdirectories, it is global, +because there is no way to do otherwise when querying. If you have document +sets which would need different values, you will have to index and query +them separately. +.TP +.BI "maildefcharset = " "character set name" +This can be used to define the default character set specifically for email +messages which don't specify it. This is mainly useful for readpst (libpst) +dumps, which are utf-8 but do not say so. +.TP +.BI "localfields = " "fieldname = value:..." +This allows setting fields for all documents under a given +directory. Typical usage would be to set an "rclaptg" field, to be used in +mimeview to select a specific viewer. If several fields are to be set, they +should be separated with a colon (':') character (which there is currently +no way to escape). Ie: localfields= rclaptg=gnus:other = val, then select +specific viewer with mimetype|tag=... in mimeview. +.TP +.BI "dbdir = " directory +The name of the Xapian database directory. It will be created if needed +when the database is initialized. If this is not an absolute pathname, it +will be taken relative to the configuration directory. +.TP +.BI "idxstatusfile = " "file path" +The name of the scratch file where the indexer process updates its +status. Default: idxstatus.txt inside the configuration directory. +.TP +.BI "maxfsoccuppc = " percentnumber +Maximum file system occupation before we +stop indexing. 
The value is a percentage, corresponding to +what the "Capacity" df output column shows. The default +value is 0, meaning no checking. +.TP +.BI "mboxcachedir = " "directory path" +The directory where mbox message offsets cache files are held. This is +normally $RECOLL_CONFDIR/mboxcache, but it may be useful to share a +directory between different configurations. +.TP +.BI "mboxcacheminmbs = " "value in megabytes" +The minimum mbox file size over which we cache the offsets. There is really no sense in caching offsets for small files. The default is 5 MB. +.TP +.BI "webcachedir = " "directory path" +This is only used by the Beagle web browser plugin indexing code, and +defines where the cache for visited pages will live. Default: +$RECOLL_CONFDIR/webcache +.TP +.BI "webcachemaxmbs = " "value in megabytes" +This is only used by the Beagle web browser plugin indexing code, and +defines the maximum size for the web page cache. Default: 40 MB. +.TP +.BI "idxflushmb = " megabytes +Threshold (megabytes of new text data) +where we flush from memory to disk index. Setting this can +help control memory usage. A value of 0 means no explicit +flushing, letting Xapian use its own default, which is +flushing every 10000 documents (or XAPIAN_FLUSH_THRESHOLD), meaning that +memory usage depends on average document size. The default value is 10. +.TP +.BI "autodiacsens = " 0/1 +IF the index is not stripped, decide if we automatically trigger diacritics +sensitivity if the search term has accented characters (not in +unac_except_trans). Else you need to use the query language and the D +modifier to specify diacritics sensitivity. Default is no. +.TP +.BI "autocasesens = " 0/1 +IF the index is not stripped, decide if we automatically trigger character +case sensitivity if the search term has upper-case characters in any but +the first position. Else you need to use the query language and the C +modifier to specify character-case sensitivity. Default is yes. 
+.TP +.BI "loglevel = " value +Verbosity level for recoll and recollindex. A value of 4 lists quite a lot of +debug/information messages. 3 lists only errors. +.B daemloglevel +can be used to specify a different value for the real-time indexing daemon. +.TP +.BI "logfilename = " file +Where should the messages go. 'stderr' can be used as a special value. +.B daemlogfilename +can be used to specify a different value for the real-time indexing daemon. +.TP +.BI "mondelaypatterns = " "list of patterns" +This allows specifying wildcard path patterns (processed with fnmatch(3) with +0 flag), to match files which change too often and for which a delay should +be observed before re-indexing. This is a space-separated list, each entry +being a pattern and a time in seconds, separated by a colon. You can use +double quotes if a path entry contains white space. Example: +.sp +mondelaypatterns = *.log:20 "this one has spaces*:10" +.TP +.BI "monixinterval = " "value in seconds" +Minimum interval (seconds) for processing the indexing queue. The real time +monitor does not process each event when it comes in, but will wait this +time for the queue to accumulate to diminish overhead and in order to +aggregate multiple events to the same file. Default 30 S. +.TP +.BI "monauxinterval = " "value in seconds" +Period (in seconds) at which the real time monitor will regenerate the +auxiliary databases (spelling, stemming) if needed. The default is one +hour. +.TP +.BI "monioniceclass, monioniceclassdata" +These allow defining the ionice class and data used by the indexer (default +class 3, no data). +.TP +.BI "filtermaxseconds = " "value in seconds" +Maximum filter execution time, after which it is aborted. Some postscript +programs just loop... +.TP +.BI "filtersdir = " directory +A directory to search for the external filter scripts used to index some +types of files. The value should not be changed, except if you want to +modify one of the default scripts. 
The value can be redefined for any +subdirectory. +.TP +.BI "iconsdir = " directory +The name of the directory where +.B recoll +result list icons are stored. You can change this if you want different +images. +.TP .BI "idxabsmlen = " value Recoll stores an abstract for each indexed file inside the database. The text can come from an actual 'abstract' section in the document or will @@ -198,19 +360,11 @@ If this is set, the aspell dictionary generation is turned off. Useful for cases where you don't need the functionality or when it is unusable because aspell crashes during dictionary generation. .TP -.BI "nocjk = " boolean -If this set to true, specific east asian (Chinese Korean Japanese) -characters/word splitting is turned off. This will save a small amount of -cpu if you have no CJK documents. If your document base does include such -text but you are not interested in searching it, setting -.I nocjk -may be a significant time and space saver. -.TP -.BI "cjkngramlen = " value -This lets you adjust the size of n-grams used for indexing CJK text. The -default value of 2 is probably appropriate in most cases. A value of 3 -would allow more precision and efficiency on longer words, but the index -will be approximately twice as large. +.BI "mhmboxquirks = " flags +This allows defining location-related quirks for the mailbox +handler. Currently only the tbird flag is defined, and it should be set for +directories which hold Thunderbird data, as their folder format is weird. + .SH SEE ALSO .PP recollindex(1) recoll(1) diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml index 258f8111..88fffbde 100644 --- a/src/doc/user/usermanual.sgml +++ b/src/doc/user/usermanual.sgml @@ -4201,6 +4201,13 @@ skippedPaths = ~/somedir/∗.txt + membermaxkbs + This defines the maximum size in kilobytes for + an archive member (zip, tar or rar at the moment). Bigger + entries will be skipped. 
+ + + indexallfilenames &RCL; indexes file names in a special section of the database to allow specific file names @@ -4258,8 +4265,10 @@ skippedPaths = ~/somedir/∗.txt don't, searches sensitive to case and diacritics can be performed, but the index will be bigger, and some marginal weirdness may sometimes occur. The default is a stripped - index (indexStripChars = 1 for - now. + index (indexStripChars = 1) for + now. When using multiple indexes for a search, + this parameter must be defined identically for + all. Changing the value implies an index reset. @@ -4537,15 +4546,18 @@ mondelaypatterns = *.log:20 "this one has spaces*:10" + monioniceclass, monioniceclassdata + These allow defining the + ionice class and data used by the + indexer (default class 3, no data). + + - - - filtermaxseconds - Maximum filter execution time, after which it + filtermaxseconds + Maximum filter execution time, after which it is aborted. Some postscript programs just loop... - - - + + filtersdir A directory to search for the external filter scripts used to index some types of files. The @@ -4600,6 +4612,17 @@ mondelaypatterns = *.log:20 "this one has spaces*:10" + mhmboxquirks + This allows defining location-related quirks + for the mailbox handler. Currently only the + tbird flag is defined, and it should be set + for directories which hold + Thunderbird data, as their folder + format is weird. 
+ + + + diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index 2b9f3846..10109eae 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -51,7 +51,7 @@ bool MimeHandlerExecMultiple::startCmd() string cmd = params.front(); m_maxmemberkb = 50000; - m_config->getConfParam("maxmemberkb", &m_maxmemberkb); + m_config->getConfParam("membermaxkbs", &m_maxmemberkb); ostringstream oss; oss << "RECOLL_FILTER_MAXMEMBERKB=" << m_maxmemberkb; m_cmd.putenv(oss.str()); diff --git a/src/query/docseq.h b/src/query/docseq.h index de10369b..5a1ebe83 100644 --- a/src/query/docseq.h +++ b/src/query/docseq.h @@ -213,7 +213,7 @@ public: virtual std::string getReason() { if (m_seq.isNull()) - return false; + return string(); return m_seq->getReason(); } virtual std::string title() {return m_seq->title();} diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index 231a9249..ba0c4bb1 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -163,7 +163,7 @@ indexallfilenames = 1 # Size limit for archive members. This is passed to the filters in the # environment as RECOLL_FILTER_MAXMEMBERKB # -maxmemberkb = 50000 +membermaxkbs = 50000 # Size limit for compressed files. We need to decompress these in a # temporary directory for identification, which can be wasteful in some