From df09d65a4e69b3b6932f80ba06ec2426d70aa818 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Sun, 13 Sep 2020 15:40:28 +0200
Subject: [PATCH] add underscoreasletter config variable to process _ as a
letter
---
src/common/textsplit.cpp | 9 ++++++++-
src/doc/user/recoll.conf.xml | 8 ++++++--
src/doc/user/usermanual.html | 13 +++++++++++--
src/sampleconf/recoll.conf | 9 ++++++++-
4 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index fd26f894..04af5396 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -118,6 +118,13 @@ void TextSplit::staticConfInit(RclConfig *config)
}
}
+ bvalue = false;
+ if (config->getConfParam("underscoreasletter", &bvalue)) {
+ if (bvalue) {
+ charclasses[int('_')] = A_LLETTER;
+ }
+ }
+
string kotagger;
config->getConfParam("hangultagger", kotagger);
if (!kotagger.empty()) {
@@ -855,7 +862,7 @@ bool TextSplit::text_to_words(const string &in)
goto SPACE;
case '@':
- case '_':
+ case '_': // If underscoreasletter is set, '_' is classified as a letter and never reaches this case
case '\'':
// If in word, potential span: o'brien, jf@dockes.org,
// else just ignore
diff --git a/src/doc/user/recoll.conf.xml b/src/doc/user/recoll.conf.xml
index 5ecce81f..0505ace9 100644
--- a/src/doc/user/recoll.conf.xml
+++ b/src/doc/user/recoll.conf.xml
@@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows
restoring the previous behaviour.
backslashasletter
-Process backslash as normal letter This may make sense for people wanting to index TeX commands as
+Process backslash as normal letter. This may make sense for people wanting to index TeX commands as
such but is not of much general use.
+
+underscoreasletter
+Process underscore as normal letter. This makes sense in so many cases that one wonders if it should
+not be the default.
maxtermlength
Maximum term length. Words longer than this will be discarded.
@@ -758,7 +762,7 @@ script. Example values: eng, fra... See the tesseract documentation.tesseractcmd
Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default
tesseract command. e.g. on Windows:
-C:/Program Files (x86)/Tesseract-OCR/tesseract.exe
+C:/Program Files (x86)/Tesseract-OCR/tesseract.exe
abbyylang
Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set
diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html
index 72060c70..d4dc0532 100644
--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
@@ -8933,12 +8933,21 @@ for i in range(nres):
id=
"RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">backslashasletter
- Process backslash as normal letter This may
+
Process backslash as normal letter. This may
make sense for people wanting to index TeX
commands as such but is not of much general
use.
underscoreasletter
+
+ Process underscore as normal letter. This
+ makes sense in so many cases that one wonders if
+ it should not be the default.
+
+ maxtermlength
@@ -9865,7 +9874,7 @@ for i in range(nres):
Path for the tesseract command. This is mostly
useful on Windows, or for specifying a
non-default tesseract command. e.g. on Windows:
- C:/Program Files (x86)/Tesseract-OCR/tesseract.exe
+ C:/Program Files (x86)/Tesseract-OCR/tesseract.exe
-# Process backslash as normal letter
+# Process backslash as normal letter.
# This may make sense for people wanting to index TeX commands as
# such but is not of much general use.
#
#backslashasletter = 0
+#
+# Process underscore as normal letter.
+# This makes sense in so many cases that one wonders if it should
+# not be the default.
+#
+#underscoreasletter = 0
+
#
# Maximum term length.
# Words longer than this will be discarded.