diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml index 939d07d4..499006b4 100644 --- a/src/doc/user/usermanual.sgml +++ b/src/doc/user/usermanual.sgml @@ -3593,6 +3593,28 @@ while query.next >= 0 and query.next < nres: List elements with embedded spaces can be quoted using double-quotes. + Encoding issues + Most of the configuration parameters are plain ASCII. Two + particular sets of values may cause encoding issues: + + + File path parameters may contain non-ASCII + characters and should use the exact same byte values as found in + the file system directory. Usually, this means that the + configuration file should use the system default locale + encoding. + + The unac_except_trans parameter + should be encoded in UTF-8. If your system locale is not UTF-8, and + you need to also specify non-ASCII file paths, this poses a + difficulty because common text editors cannot handle multiple + encodings in a single file. In this relatively unlikely case, you + can edit the configuration file as two separate text files with + appropriate encodings, and concatenate them to create the complete + configuration. + + + Main configuration file @@ -3853,16 +3875,17 @@ skippedPaths = ~/somedir/∗.txt unac_except_trans - This is a list of characters which should be - handled specially when converting text to unaccented lowercase. - For example, in Swedish, the letter a with diaeresis - has full alphabet citizenship and should not be - turned into an a. Each element in the - space-separated list has the special character as first element - and the translation following. The handling of both the - lowercase and upper-case versions of a character should be - specified, as appartenance to the list will turn-off both - standard accent and case processing. Example for Swedish: + This is a list of characters, encoded in UTF-8, + which should be handled specially when converting text to + unaccented lowercase. 
For example, in Swedish, the letter + a with diaeresis has full alphabet + citizenship and should not be turned into an + a. Each element in the space-separated list + has the special character as first element and the translation + following. The handling of both the lowercase and uppercase + versions of a character should be specified, as membership in + the list will turn off both standard accent and case + processing. Example for Swedish: unac_except_trans = åå Åå ää Ää öö Öö diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index d36fefd7..bab728b3 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -65,6 +65,17 @@ indexstemminglanguages = english # match your LANG and is not 8859-1, set it here. # defaultcharset = iso-8859-1 +# A list of characters, encoded in UTF-8, which should be handled specially +# when converting text to unaccented lowercase. For example, in Swedish, +# the letter a with diaeresis has full alphabet citizenship and should not +# be turned into an a. +# Each element in the space-separated list has the special character as +# first element and the translation following. The handling of both the +# lowercase and uppercase versions of a character should be specified, as +# membership in the list will turn off both standard accent and case +# processing. Example for Swedish: +# unac_except_trans = åå Åå ää Ää öö Öö + # Where to store the database (directory). This may be an absolute path, # else it is taken as relative to the configuration directory (-c argument # or $RECOLL_CONFDIR).