This commit is contained in:
Jean-Francois Dockes 2020-09-14 16:11:22 +01:00
commit db1d2f48f6
9 changed files with 82 additions and 26 deletions

View File

@ -6,7 +6,7 @@
PPA_KEYID=7808CE96D38B9201
RCLVERS=1.27.5
RCLVERS=1.27.6
SCOPEVERS=1.20.2.4
GSSPVERS=1.0.0
PPAVERS=1
@ -49,7 +49,7 @@ debdir=debian
# 16.04LTS xenial 2021-04
# 18.04LTS bionic 2023-04
# 20.04LTS focal 2025-04
series="xenial bionic focal"
series="xenial bionic focal groovy"
series=
if test "X$series" != X ; then
@ -89,8 +89,8 @@ done
# 16.04LTS xenial 2021-04
# 18.04LTS bionic 2023-04
# 20.04LTS focal 2025-04
series="xenial bionic focal"
#series=
series="xenial bionic focal groovy"
# series=
debdir=debiankio
topdir=kio-recoll-${RCLVERS}
@ -133,7 +133,7 @@ done
# 16.04LTS xenial 2021-04
# 18.04LTS bionic 2023-04
# 20.04LTS focal 2025-04
series="xenial bionic focal"
series="xenial bionic focal groovy"
series=
debdir=debiangssp

View File

@ -1,3 +1,15 @@
recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Process PDF annotations if the poppler-glib Python bindings are available
* Fix build error with some compiler versions
* Fix inconsistent lock file name issue which could result in background
indexer exit.
* recollq: new -p option to show snippets lists instead of abstracts with
-A
* Fix nonumbers option.
-- Jean-Francois Dockes <jf@dockes.org> Sun, 13 Sep 2020 13:29:00 +0200
recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Misc small fixes.

View File

@ -1,3 +1,15 @@
kio-recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Process PDF annotations if the poppler-glib Python bindings are available
* Fix build error with some compiler versions
* Fix inconsistent lock file name issue which could result in background
indexer exit.
* recollq: new -p option to show snippets lists instead of abstracts with
-A
* Fix nonumbers option.
-- Jean-Francois Dockes <jf@dockes.org> Sun, 13 Sep 2020 13:29:00 +0200
kio-recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Misc small fixes.

View File

@ -1 +1 @@
1.27.6
1.27.7

View File

@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40};
static const int o_CJKMaxNgramLen{5};
bool o_exthangultagger{false};
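// Character trimmed from the end of a span along with the other trailing
// punctuation. Set to 0 when underscoreasletter is enabled, so that a
// trailing '_' is kept as part of the word instead of being trimmed.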
static char underscoreatend = '_';
void TextSplit::staticConfInit(RclConfig *config)
{
config->getConfParam("maxtermlength", &o_maxWordLength);
@ -118,6 +121,14 @@ void TextSplit::staticConfInit(RclConfig *config)
}
}
bvalue = false;
if (config->getConfParam("underscoreasletter", &bvalue)) {
if (bvalue) {
charclasses[int('_')] = A_LLETTER;
underscoreatend = 0;
}
}
string kotagger;
config->getConfParam("hangultagger", kotagger);
if (!kotagger.empty()) {
@ -550,26 +561,26 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
// Maybe trim at end. These are chars that we might keep
// inside a span, but not at the end.
while (m_span.length() > 0) {
switch (*(m_span.rbegin())) {
case '.':
case '-':
case ',':
case '@':
case '_':
case '\'':
m_span.resize(m_span.length()-1);
string::size_type trimsz{0};
while (trimsz < m_span.length()) {
auto c = m_span[m_span.length() - 1 - trimsz];
if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' ||
c == underscoreatend) {
trimsz++;
if (m_words_in_span.size() &&
m_words_in_span.back().second > int(m_span.size()))
m_words_in_span.back().second > int(m_span.size())) {
m_words_in_span.back().second = int(m_span.size());
if (--bp < 0)
}
if (--bp < 0) {
bp = 0;
}
} else {
break;
default:
goto breaktrimloop;
}
}
breaktrimloop:
if (trimsz > 0) {
m_span.resize(m_span.length() - trimsz);
}
if (!words_from_span(bp)) {
return false;
@ -855,7 +866,7 @@ bool TextSplit::text_to_words(const string &in)
goto SPACE;
case '@':
case '_':
case '_': // If underscoreasletter is set, we'll never get this
case '\'':
// If in word, potential span: o'brien, jf@dockes.org,
// else just ignore

View File

@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows
restoring the previous behaviour.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">
<term><varname>backslashasletter</varname></term>
<listitem><para>Process backslash as normal letter This may make sense for people wanting to index TeX commands as
<listitem><para>Process backslash as normal letter. This may make sense for people wanting to index TeX commands as
such but is not of much general use.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER">
<term><varname>underscoreasletter</varname></term>
<listitem><para>Process underscore as normal letter. This makes sense in so many cases that one wonders if it should
not be the default.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH">
<term><varname>maxtermlength</varname></term>
<listitem><para>Maximum term length. Words longer than this will be discarded.
@ -758,7 +762,7 @@ script. Example values: eng, fra... See the tesseract documentation.</para></lis
<term><varname>tesseractcmd</varname></term>
<listitem><para>Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default
tesseract command. e.g. on Windows:
C:/Program&nbsp;Files&nbsp;(x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
C:/Program&nbsp;Files&nbsp;(x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG">
<term><varname>abbyylang</varname></term>
<listitem><para>Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set

View File

@ -8933,12 +8933,21 @@ for i in range(nres):
id=
"RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER"></a><span class="term"><code class="varname">backslashasletter</code></span></dt>
<dd>
<p>Process backslash as normal letter This may
<p>Process backslash as normal letter. This may
make sense for people wanting to index TeX
commands as such but is not of much general
use.</p>
</dd>
<dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"
id=
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"></a><span class="term"><code class="varname">underscoreasletter</code></span></dt>
<dd>
<p>Process underscore as normal letter. This
makes sense in so many cases that one wonders if
it should not be the default.</p>
</dd>
<dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH" id=
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH"></a><span class="term"><code class="varname">maxtermlength</code></span></dt>
<dd>
@ -9865,7 +9874,7 @@ for i in range(nres):
<p>Path for the tesseract command. This is mostly
useful on Windows, or for specifying a
non-default tesseract command. e.g. on Windows:
C:/Program&nbsp;Files&nbsp;(x86)/Tesseract-OCR/tesseract.exe</p>
C:/Program&nbsp;Files&nbsp;(x86)/Tesseract-OCR/tesseract.exe</p>
</dd>
<dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG" id=

View File

@ -317,12 +317,19 @@ indexStoreDocText = 1
#dehyphenate = 1
# <var name="backslashasletter" type="bool">
# <brief>Process backslash as normal letter</brief>
# <brief>Process backslash as normal letter.</brief>
# <descr>This may make sense for people wanting to index TeX commands as
# such but is not of much general use.</descr>
# </var>
#backslashasletter = 0
# <var name="underscoreasletter" type="bool">
# <brief>Process underscore as normal letter.</brief>
# <descr>This makes sense in so many cases that one wonders if it should
# not be the default.</descr>
# </var>
#underscoreasletter = 0
# <var name="maxtermlength" type="int" values="10 200 40">
# <brief>Maximum term length.</brief>
# <descr>Words longer than this will be discarded.

View File

@ -265,6 +265,7 @@ int main(int argc, char **argv)
if (!kotagger.empty()) {
fprintf(fp, "hangultagger = %s\n", kotagger.c_str());
}
fprintf(fp, "underscoreasletter = 0\n");
fclose(fp);
Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));