From df09d65a4e69b3b6932f80ba06ec2426d70aa818 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 13 Sep 2020 15:40:28 +0200 Subject: [PATCH] add underscoreasletter config variable to process _ as a letter --- src/common/textsplit.cpp | 9 ++++++++- src/doc/user/recoll.conf.xml | 8 ++++++-- src/doc/user/usermanual.html | 13 +++++++++++-- src/sampleconf/recoll.conf | 9 ++++++++- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index fd26f894..04af5396 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -118,6 +118,13 @@ void TextSplit::staticConfInit(RclConfig *config) } } + bvalue = false; + if (config->getConfParam("underscoreasletter", &bvalue)) { + if (bvalue) { + charclasses[int('_')] = A_LLETTER; + } + } + string kotagger; config->getConfParam("hangultagger", kotagger); if (!kotagger.empty()) { @@ -855,7 +862,7 @@ bool TextSplit::text_to_words(const string &in) goto SPACE; case '@': - case '_': + case '_': // If underscoreasletter is set, we'll never get this case '\'': // If in word, potential span: o'brien, jf@dockes.org, // else just ignore diff --git a/src/doc/user/recoll.conf.xml b/src/doc/user/recoll.conf.xml index 5ecce81f..0505ace9 100644 --- a/src/doc/user/recoll.conf.xml +++ b/src/doc/user/recoll.conf.xml @@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows restoring the previous behaviour. backslashasletter -Process backslash as normal letter This may make sense for people wanting to index TeX commands as +Process backslash as normal letter. This may make sense for people wanting to index TeX commands as such but is not of much general use. + +underscoreasletter +Process underscore as normal letter. This makes sense in so many cases that one wonders if it should +not be the default. maxtermlength Maximum term length. Words longer than this will be discarded. @@ -758,7 +762,7 @@ script. Example values: eng, fra... 
See the tesseract documentation.tesseractcmd Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default tesseract command. e.g. on Windows: -C:/Program Files (x86)/Tesseract-OCR/tesseract.exe +C:/Program Files (x86)/Tesseract-OCR/tesseract.exe abbyylang Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index 72060c70..d4dc0532 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -8933,12 +8933,21 @@ for i in range(nres): id= "RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">backslashasletter
-

Process backslash as normal letter This may +

Process backslash as normal letter. This may make sense for people wanting to index TeX commands as such but is not of much general use.

underscoreasletter
+
+

Process underscore as normal letter. This + makes sense in so many cases that one wonders if + it should not be the default.

+
+
maxtermlength
@@ -9865,7 +9874,7 @@ for i in range(nres):

Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default tesseract command. e.g. on Windows: - C:/Program Files (x86)/Tesseract-OCR/tesseract.exe

+ C:/Program Files (x86)/Tesseract-OCR/tesseract.exe

-# Process backslash as normal letter +# Process backslash as normal letter. # This may make sense for people wanting to index TeX commands as # such but is not of much general use. # #backslashasletter = 0 +# +# Process underscore as normal letter. +# This makes sense in so many cases that one wonders if it should +# not be the default. +# +#underscoreasletter = 0 + # # Maximum term length. # Words longer than this will be discarded.