From efaa4796b14d9f7f3a9f09ad7319bc72b5e31913 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 13 Sep 2020 15:32:31 +0200 Subject: [PATCH 1/3] version bump --- packaging/debian/buildppa.sh | 10 +++++----- packaging/debian/debian/changelog | 12 ++++++++++++ packaging/debian/debiankio/changelog | 12 ++++++++++++ src/VERSION | 2 +- 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/packaging/debian/buildppa.sh b/packaging/debian/buildppa.sh index d741ca4a..d4b9f820 100644 --- a/packaging/debian/buildppa.sh +++ b/packaging/debian/buildppa.sh @@ -6,7 +6,7 @@ PPA_KEYID=7808CE96D38B9201 -RCLVERS=1.27.5 +RCLVERS=1.27.6 SCOPEVERS=1.20.2.4 GSSPVERS=1.0.0 PPAVERS=1 @@ -49,7 +49,7 @@ debdir=debian # 16.04LTS xenial 2021-04 # 18.04LTS bionic 2023-04 # 20.04LTS focal 2025-04 -series="xenial bionic focal" +series="xenial bionic focal groovy" series= if test "X$series" != X ; then @@ -89,8 +89,8 @@ done # 16.04LTS xenial 2021-04 # 18.04LTS bionic 2023-04 # 20.04LTS focal 2025-04 -series="xenial bionic focal" -#series= +series="xenial bionic focal groovy" +# series= debdir=debiankio topdir=kio-recoll-${RCLVERS} @@ -133,7 +133,7 @@ done # 16.04LTS xenial 2021-04 # 18.04LTS bionic 2023-04 # 20.04LTS focal 2025-04 -series="xenial bionic focal" +series="xenial bionic focal groovy" series= debdir=debiangssp diff --git a/packaging/debian/debian/changelog b/packaging/debian/debian/changelog index 3306e08e..6f3dfb02 100644 --- a/packaging/debian/debian/changelog +++ b/packaging/debian/debian/changelog @@ -1,3 +1,15 @@ +recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low + + * Process PDF annotations if the poppler-glib Python bindings are available + * Fix build error with some compiler versions + * Fix inconsistent lock file name issue which could result in background + indexer exit. + * recollq: new -p option to show snippets lists instead of abstracts with + -A + * Fix nonumbers option. + + -- Jean-Francois Dockes Sun, 13 Sep 2020 13:29:00 +0200 + recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low * Misc small fixes. diff --git a/packaging/debian/debiankio/changelog b/packaging/debian/debiankio/changelog index f4171d0a..67d94bac 100644 --- a/packaging/debian/debiankio/changelog +++ b/packaging/debian/debiankio/changelog @@ -1,3 +1,15 @@ +kio-recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low + + * Process PDF annotations if the poppler-glib Python bindings are available + * Fix build error with some compiler versions + * Fix inconsistent lock file name issue which could result in background + indexer exit. + * recollq: new -p option to show snippets lists instead of abstracts with + -A + * Fix nonumbers option. + + -- Jean-Francois Dockes Sun, 13 Sep 2020 13:29:00 +0200 + kio-recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low * Misc small fixes. diff --git a/src/VERSION b/src/VERSION index 2a5aed46..127aeda7 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.27.6 +1.27.7 From df09d65a4e69b3b6932f80ba06ec2426d70aa818 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 13 Sep 2020 15:40:28 +0200 Subject: [PATCH 2/3] add underscoreasletter config variable to process _ as a letter --- src/common/textsplit.cpp | 9 ++++++++- src/doc/user/recoll.conf.xml | 8 ++++++-- src/doc/user/usermanual.html | 13 +++++++++++-- src/sampleconf/recoll.conf | 9 ++++++++- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index fd26f894..04af5396 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -118,6 +118,13 @@ void TextSplit::staticConfInit(RclConfig *config) } } + bvalue = false; + if (config->getConfParam("underscoreasletter", &bvalue)) { + if (bvalue) { + charclasses[int('_')] = A_LLETTER; + } + } + string kotagger; config->getConfParam("hangultagger", kotagger); if (!kotagger.empty()) { @@ -855,7 +862,7 @@ bool TextSplit::text_to_words(const string &in) goto SPACE; case '@': - case '_': + case '_': // If underscoreasletter is set, we'll never get this case '\'': // If in word, potential span: o'brien, jf@dockes.org, // else just ignore diff --git a/src/doc/user/recoll.conf.xml b/src/doc/user/recoll.conf.xml index 5ecce81f..0505ace9 100644 --- a/src/doc/user/recoll.conf.xml +++ b/src/doc/user/recoll.conf.xml @@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows restoring the previous behaviour. backslashasletter -Process backslash as normal letter This may make sense for people wanting to index TeX commands as +Process backslash as normal letter. This may make sense for people wanting to index TeX commands as such but is not of much general use. + +underscoreasletter +Process underscore as normal letter. This makes sense in so many cases that one wonders if it should +not be the default. maxtermlength Maximum term length. Words longer than this will be discarded. @@ -758,7 +762,7 @@ script. Example values: eng, fra... See the tesseract documentation.tesseractcmd Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default tesseract command. e.g. on Windows: -C:/Program Files (x86)/Tesseract-OCR/tesseract.exe +C:/ProgramFiles(x86)/Tesseract-OCR/tesseract.exe abbyylang Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index 72060c70..d4dc0532 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -8933,12 +8933,21 @@ for i in range(nres): id= "RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">backslashasletter
-

Process backslash as normal letter This may +

Process backslash as normal letter. This may make sense for people wanting to index TeX commands as such but is not of much general use.

underscoreasletter
+
+

Process underscore as normal letter. This + makes sense in so many cases that one wonders if + it should not be the default.

+
+
maxtermlength
@@ -9865,7 +9874,7 @@ for i in range(nres):

Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default tesseract command. e.g. on Windows: - C:/Program Files (x86)/Tesseract-OCR/tesseract.exe

+ C:/ProgramFiles(x86)/Tesseract-OCR/tesseract.exe

-# Process backslash as normal letter +# Process backslash as normal letter. # This may make sense for people wanting to index TeX commands as # such but is not of much general use. # #backslashasletter = 0 +# +# Process underscore as normal letter. +# This makes sense in so many cases that one wonders if it should +# not be the default. +# +#underscoreasletter = 0 + # # Maximum term length. # Words longer than this will be discarded. From 16a9d8eba81905dfe4f542bd708cc9046135ddb5 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 13 Sep 2020 17:53:59 +0200 Subject: [PATCH 3/3] fix span trimming loop when underscoreasletter is set --- src/common/textsplit.cpp | 32 ++++++++++++++++++-------------- src/testmains/trtextsplit.cpp | 1 + 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 04af5396..aa725910 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40}; static const int o_CJKMaxNgramLen{5}; bool o_exthangultagger{false}; +// This is changed to 0 if _ is processed as a letter +static char underscoreatend = '_'; + void TextSplit::staticConfInit(RclConfig *config) { config->getConfParam("maxtermlength", &o_maxWordLength); @@ -122,6 +125,7 @@ void TextSplit::staticConfInit(RclConfig *config) if (config->getConfParam("underscoreasletter", &bvalue)) { if (bvalue) { charclasses[int('_')] = A_LLETTER; + underscoreatend = 0; } } @@ -557,26 +561,26 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp) // Maybe trim at end. These are chars that we might keep // inside a span, but not at the end. - while (m_span.length() > 0) { - switch (*(m_span.rbegin())) { - case '.': - case '-': - case ',': - case '@': - case '_': - case '\'': - m_span.resize(m_span.length()-1); + string::size_type trimsz{0}; + while (trimsz < m_span.length()) { + auto c = m_span[m_span.length() - 1 - trimsz]; + if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' || + c == underscoreatend) { + trimsz++; if (m_words_in_span.size() && - m_words_in_span.back().second > int(m_span.size())) + m_words_in_span.back().second > int(m_span.size())) { m_words_in_span.back().second = int(m_span.size()); - if (--bp < 0) + } + if (--bp < 0) { bp = 0; + } + } else { break; - default: - goto breaktrimloop; } } -breaktrimloop: + if (trimsz > 0) { + m_span.resize(m_span.length() - trimsz); + } if (!words_from_span(bp)) { return false; diff --git a/src/testmains/trtextsplit.cpp b/src/testmains/trtextsplit.cpp index 730d2011..ab8056e3 100644 --- a/src/testmains/trtextsplit.cpp +++ b/src/testmains/trtextsplit.cpp @@ -265,6 +265,7 @@ int main(int argc, char **argv) if (!kotagger.empty()) { fprintf(fp, "hangultagger = %s\n", kotagger.c_str()); } + fprintf(fp, "underscoreasletter = 0\n"); fclose(fp); Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));