From efaa4796b14d9f7f3a9f09ad7319bc72b5e31913 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Sun, 13 Sep 2020 15:32:31 +0200
Subject: [PATCH 1/3] version bump
---
packaging/debian/buildppa.sh | 10 +++++-----
packaging/debian/debian/changelog | 12 ++++++++++++
packaging/debian/debiankio/changelog | 12 ++++++++++++
src/VERSION | 2 +-
4 files changed, 30 insertions(+), 6 deletions(-)
diff --git a/packaging/debian/buildppa.sh b/packaging/debian/buildppa.sh
index d741ca4a..d4b9f820 100644
--- a/packaging/debian/buildppa.sh
+++ b/packaging/debian/buildppa.sh
@@ -6,7 +6,7 @@
PPA_KEYID=7808CE96D38B9201
-RCLVERS=1.27.5
+RCLVERS=1.27.6
SCOPEVERS=1.20.2.4
GSSPVERS=1.0.0
PPAVERS=1
@@ -49,7 +49,7 @@ debdir=debian
# 16.04LTS xenial 2021-04
# 18.04LTS bionic 2023-04
# 20.04LTS focal 2025-04
-series="xenial bionic focal"
+series="xenial bionic focal groovy"
series=
if test "X$series" != X ; then
@@ -89,8 +89,8 @@ done
# 16.04LTS xenial 2021-04
# 18.04LTS bionic 2023-04
# 20.04LTS focal 2025-04
-series="xenial bionic focal"
-#series=
+series="xenial bionic focal groovy"
+# series=
debdir=debiankio
topdir=kio-recoll-${RCLVERS}
@@ -133,7 +133,7 @@ done
# 16.04LTS xenial 2021-04
# 18.04LTS bionic 2023-04
# 20.04LTS focal 2025-04
-series="xenial bionic focal"
+series="xenial bionic focal groovy"
series=
debdir=debiangssp
diff --git a/packaging/debian/debian/changelog b/packaging/debian/debian/changelog
index 3306e08e..6f3dfb02 100644
--- a/packaging/debian/debian/changelog
+++ b/packaging/debian/debian/changelog
@@ -1,3 +1,15 @@
+recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+ * Process PDF annotations if the poppler-glib Python bindings are available
+ * Fix build error with some compiler versions
+ * Fix inconsistent lock file name issue which could result in background
+ indexer exit.
+ * recollq: new -p option to show snippets lists instead of abstracts with
+ -A
+ * Fix nonumbers option.
+
+ -- Jean-Francois Dockes Sun, 13 Sep 2020 13:29:00 +0200
+
recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Misc small fixes.
diff --git a/packaging/debian/debiankio/changelog b/packaging/debian/debiankio/changelog
index f4171d0a..67d94bac 100644
--- a/packaging/debian/debiankio/changelog
+++ b/packaging/debian/debiankio/changelog
@@ -1,3 +1,15 @@
+kio-recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+ * Process PDF annotations if the poppler-glib Python bindings are available
+ * Fix build error with some compiler versions
+ * Fix inconsistent lock file name issue which could result in background
+ indexer exit.
+ * recollq: new -p option to show snippets lists instead of abstracts with
+ -A
+ * Fix nonumbers option.
+
+ -- Jean-Francois Dockes Sun, 13 Sep 2020 13:29:00 +0200
+
kio-recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Misc small fixes.
diff --git a/src/VERSION b/src/VERSION
index 2a5aed46..127aeda7 100644
--- a/src/VERSION
+++ b/src/VERSION
@@ -1 +1 @@
-1.27.6
+1.27.7
From df09d65a4e69b3b6932f80ba06ec2426d70aa818 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Sun, 13 Sep 2020 15:40:28 +0200
Subject: [PATCH 2/3] add underscoreasletter config variable to process _ as a
letter
---
src/common/textsplit.cpp | 9 ++++++++-
src/doc/user/recoll.conf.xml | 8 ++++++--
src/doc/user/usermanual.html | 13 +++++++++++--
src/sampleconf/recoll.conf | 9 ++++++++-
4 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index fd26f894..04af5396 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -118,6 +118,13 @@ void TextSplit::staticConfInit(RclConfig *config)
}
}
+ bvalue = false;
+ if (config->getConfParam("underscoreasletter", &bvalue)) {
+ if (bvalue) {
+ charclasses[int('_')] = A_LLETTER;
+ }
+ }
+
string kotagger;
config->getConfParam("hangultagger", kotagger);
if (!kotagger.empty()) {
@@ -855,7 +862,7 @@ bool TextSplit::text_to_words(const string &in)
goto SPACE;
case '@':
- case '_':
+ case '_': // If underscoreasletter is set, we'll never get this
case '\'':
// If in word, potential span: o'brien, jf@dockes.org,
// else just ignore
diff --git a/src/doc/user/recoll.conf.xml b/src/doc/user/recoll.conf.xml
index 5ecce81f..0505ace9 100644
--- a/src/doc/user/recoll.conf.xml
+++ b/src/doc/user/recoll.conf.xml
@@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows
restoring the previous behaviour.
backslashasletter
-Process backslash as normal letter This may make sense for people wanting to index TeX commands as
+Process backslash as normal letter. This may make sense for people wanting to index TeX commands as
such but is not of much general use.
+
+underscoreasletter
+Process underscore as normal letter. This makes sense in so many cases that one wonders if it should
+not be the default.
maxtermlength
Maximum term length. Words longer than this will be discarded.
@@ -758,7 +762,7 @@ script. Example values: eng, fra... See the tesseract documentation.tesseractcmd
Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default
tesseract command. e.g. on Windows:
-C:/Program Files (x86)/Tesseract-OCR/tesseract.exe
+C:/Program Files (x86)/Tesseract-OCR/tesseract.exe
abbyylang
Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set
diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html
index 72060c70..d4dc0532 100644
--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
@@ -8933,12 +8933,21 @@ for i in range(nres):
id=
"RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">backslashasletter
- Process backslash as normal letter This may
+ Process backslash as normal letter. This may
make sense for people wanting to index TeX
commands as such but is not of much general
use.
underscoreasletter
+
+ Process underscore as normal letter. This
+ makes sense in so many cases that one wonders if
+ it should not be the default.
+
+ maxtermlength
@@ -9865,7 +9874,7 @@ for i in range(nres):
Path for the tesseract command. This is mostly
useful on Windows, or for specifying a
non-default tesseract command. e.g. on Windows:
- C:/Program Files (x86)/Tesseract-OCR/tesseract.exe
+ C:/Program Files (x86)/Tesseract-OCR/tesseract.exe
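diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
-# Process backslash as normal letter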
+# Process backslash as normal letter.
# This may make sense for people wanting to index TeX commands as
# such but is not of much general use.
#
#backslashasletter = 0
+#
+# Process underscore as normal letter.
+# This makes sense in so many cases that one wonders if it should
+# not be the default.
+#
+#underscoreasletter = 0
+
#
# Maximum term length.
# Words longer than this will be discarded.
From 16a9d8eba81905dfe4f542bd708cc9046135ddb5 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Sun, 13 Sep 2020 17:53:59 +0200
Subject: [PATCH 3/3] fix span trimming loop when underscoreasletter is set
---
src/common/textsplit.cpp | 35 ++++++++++++++++++++---------------
src/testmains/trtextsplit.cpp | 1 +
2 files changed, 21 insertions(+), 15 deletions(-)
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 04af5396..aa725910 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40};
static const int o_CJKMaxNgramLen{5};
bool o_exthangultagger{false};
+// This is changed to 0 if _ is processed as a letter
+static char underscoreatend = '_';
+
void TextSplit::staticConfInit(RclConfig *config)
{
config->getConfParam("maxtermlength", &o_maxWordLength);
@@ -122,6 +125,7 @@ void TextSplit::staticConfInit(RclConfig *config)
if (config->getConfParam("underscoreasletter", &bvalue)) {
if (bvalue) {
charclasses[int('_')] = A_LLETTER;
+ underscoreatend = 0;
}
}
@@ -557,26 +561,27 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
// Maybe trim at end. These are chars that we might keep
// inside a span, but not at the end.
- while (m_span.length() > 0) {
- switch (*(m_span.rbegin())) {
- case '.':
- case '-':
- case ',':
- case '@':
- case '_':
- case '\'':
- m_span.resize(m_span.length()-1);
+ string::size_type trimsz{0};
+ while (trimsz < m_span.length()) {
+ auto c = m_span[m_span.length() - 1 - trimsz];
+ if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' ||
+ c == underscoreatend) {
+ trimsz++;
+ // m_span is only resized after the loop: clip against the trimmed length
if (m_words_in_span.size() &&
- m_words_in_span.back().second > int(m_span.size()))
- m_words_in_span.back().second = int(m_span.size());
- if (--bp < 0)
+ m_words_in_span.back().second > int(m_span.size() - trimsz)) {
+ m_words_in_span.back().second = int(m_span.size() - trimsz);
+ }
+ if (--bp < 0) {
bp = 0;
+ }
+ } else {
break;
- default:
- goto breaktrimloop;
}
}
-breaktrimloop:
+ if (trimsz > 0) {
+ m_span.resize(m_span.length() - trimsz);
+ }
if (!words_from_span(bp)) {
return false;
diff --git a/src/testmains/trtextsplit.cpp b/src/testmains/trtextsplit.cpp
index 730d2011..ab8056e3 100644
--- a/src/testmains/trtextsplit.cpp
+++ b/src/testmains/trtextsplit.cpp
@@ -265,6 +265,7 @@ int main(int argc, char **argv)
if (!kotagger.empty()) {
fprintf(fp, "hangultagger = %s\n", kotagger.c_str());
}
+ fprintf(fp, "underscoreasletter = 0\n");
fclose(fp);
Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));