diff --git a/packaging/debian/buildppa.sh b/packaging/debian/buildppa.sh index d741ca4a..d4b9f820 100644 --- a/packaging/debian/buildppa.sh +++ b/packaging/debian/buildppa.sh @@ -6,7 +6,7 @@ PPA_KEYID=7808CE96D38B9201 -RCLVERS=1.27.5 +RCLVERS=1.27.6 SCOPEVERS=1.20.2.4 GSSPVERS=1.0.0 PPAVERS=1 @@ -49,7 +49,7 @@ debdir=debian # 16.04LTS xenial 2021-04 # 18.04LTS bionic 2023-04 # 20.04LTS focal 2025-04 -series="xenial bionic focal" +series="xenial bionic focal groovy" series= if test "X$series" != X ; then @@ -89,8 +89,8 @@ done # 16.04LTS xenial 2021-04 # 18.04LTS bionic 2023-04 # 20.04LTS focal 2025-04 -series="xenial bionic focal" -#series= +series="xenial bionic focal groovy" +# series= debdir=debiankio topdir=kio-recoll-${RCLVERS} @@ -133,7 +133,7 @@ done # 16.04LTS xenial 2021-04 # 18.04LTS bionic 2023-04 # 20.04LTS focal 2025-04 -series="xenial bionic focal" +series="xenial bionic focal groovy" series= debdir=debiangssp diff --git a/packaging/debian/debian/changelog b/packaging/debian/debian/changelog index 3306e08e..6f3dfb02 100644 --- a/packaging/debian/debian/changelog +++ b/packaging/debian/debian/changelog @@ -1,3 +1,15 @@ +recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low + + * Process PDF annotations if the poppler-glib Python bindings are available + * Fix build error with some compiler versions + * Fix inconsistent lock file name issue which could result in background + indexer exit. + * recollq: new -p option to show snippets lists instead of abstracts with + -A + * Fix nonumbers option. + + -- Jean-Francois Dockes Sun, 13 Sep 2020 13:29:00 +0200 + recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low * Misc small fixes. diff --git a/packaging/debian/debiankio/changelog b/packaging/debian/debiankio/changelog index f4171d0a..67d94bac 100644 --- a/packaging/debian/debiankio/changelog +++ b/packaging/debian/debiankio/changelog @@ -1,3 +1,15 @@ +kio-recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low + + * Process PDF annotations if the poppler-glib Python bindings are available + * Fix build error with some compiler versions + * Fix inconsistent lock file name issue which could result in background + indexer exit. + * recollq: new -p option to show snippets lists instead of abstracts with + -A + * Fix nonumbers option. + + -- Jean-Francois Dockes Sun, 13 Sep 2020 13:29:00 +0200 + kio-recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low * Misc small fixes. diff --git a/src/VERSION b/src/VERSION index 2a5aed46..127aeda7 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.27.6 +1.27.7 diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index fd26f894..aa725910 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40}; static const int o_CJKMaxNgramLen{5}; bool o_exthangultagger{false}; +// This is changed to 0 if _ is processed as a letter +static char underscoreatend = '_'; + void TextSplit::staticConfInit(RclConfig *config) { config->getConfParam("maxtermlength", &o_maxWordLength); @@ -118,6 +121,14 @@ void TextSplit::staticConfInit(RclConfig *config) } } + bvalue = false; + if (config->getConfParam("underscoreasletter", &bvalue)) { + if (bvalue) { + charclasses[int('_')] = A_LLETTER; + underscoreatend = 0; + } + } + string kotagger; config->getConfParam("hangultagger", kotagger); if (!kotagger.empty()) { @@ -550,26 +561,26 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp) // Maybe trim at end. These are chars that we might keep // inside a span, but not at the end. - while (m_span.length() > 0) { - switch (*(m_span.rbegin())) { - case '.': - case '-': - case ',': - case '@': - case '_': - case '\'': - m_span.resize(m_span.length()-1); + string::size_type trimsz{0}; + while (trimsz < m_span.length()) { + auto c = m_span[m_span.length() - 1 - trimsz]; + if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' || + c == underscoreatend) { + trimsz++; if (m_words_in_span.size() && - m_words_in_span.back().second > int(m_span.size())) + m_words_in_span.back().second > int(m_span.size())) { m_words_in_span.back().second = int(m_span.size()); - if (--bp < 0) + } + if (--bp < 0) { bp = 0; + } + } else { break; - default: - goto breaktrimloop; } } -breaktrimloop: + if (trimsz > 0) { + m_span.resize(m_span.length() - trimsz); + } if (!words_from_span(bp)) { return false; @@ -855,7 +866,7 @@ bool TextSplit::text_to_words(const string &in) goto SPACE; case '@': - case '_': + case '_': // If underscoreasletter is set, we'll never get this case '\'': // If in word, potential span: o'brien, jf@dockes.org, // else just ignore diff --git a/src/doc/user/recoll.conf.xml b/src/doc/user/recoll.conf.xml index 5ecce81f..0505ace9 100644 --- a/src/doc/user/recoll.conf.xml +++ b/src/doc/user/recoll.conf.xml @@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows restoring the previous behaviour. backslashasletter -Process backslash as normal letter This may make sense for people wanting to index TeX commands as +Process backslash as normal letter. This may make sense for people wanting to index TeX commands as such but is not of much general use. + +underscoreasletter +Process underscore as normal letter. This makes sense in so many cases that one wonders if it should +not be the default. maxtermlength Maximum term length. Words longer than this will be discarded. @@ -758,7 +762,7 @@ script. Example values: eng, fra... See the tesseract documentation.tesseractcmd Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default tesseract command. e.g. on Windows: -C:/Program Files (x86)/Tesseract-OCR/tesseract.exe +C:/ProgramFiles(x86)/Tesseract-OCR/tesseract.exe abbyylang Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index 72060c70..d4dc0532 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -8933,12 +8933,21 @@ for i in range(nres): id= "RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">backslashasletter
-

Process backslash as normal letter This may +

Process backslash as normal letter. This may make sense for people wanting to index TeX commands as such but is not of much general use.

underscoreasletter
+
+

Process underscore as normal letter. This + makes sense in so many cases that one wonders if + it should not be the default.

+
+
maxtermlength
@@ -9865,7 +9874,7 @@ for i in range(nres):

Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default tesseract command. e.g. on Windows: - C:/Program Files (x86)/Tesseract-OCR/tesseract.exe

+ C:/ProgramFiles(x86)/Tesseract-OCR/tesseract.exe

-# Process backslash as normal letter +# Process backslash as normal letter. # This may make sense for people wanting to index TeX commands as # such but is not of much general use. # #backslashasletter = 0 +# +# Process underscore as normal letter. +# This makes sense in so many cases that one wonders if it should +# not be the default. +# +#underscoreasletter = 0 + # # Maximum term length. # Words longer than this will be discarded. diff --git a/src/testmains/trtextsplit.cpp b/src/testmains/trtextsplit.cpp index 730d2011..ab8056e3 100644 --- a/src/testmains/trtextsplit.cpp +++ b/src/testmains/trtextsplit.cpp @@ -265,6 +265,7 @@ int main(int argc, char **argv) if (!kotagger.empty()) { fprintf(fp, "hangultagger = %s\n", kotagger.c_str()); } + fprintf(fp, "underscoreasletter = 0\n"); fclose(fp); Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));