This commit is contained in:
Jean-Francois Dockes 2020-09-14 16:11:22 +01:00
commit db1d2f48f6
9 changed files with 82 additions and 26 deletions

View File

@ -6,7 +6,7 @@
PPA_KEYID=7808CE96D38B9201 PPA_KEYID=7808CE96D38B9201
RCLVERS=1.27.5 RCLVERS=1.27.6
SCOPEVERS=1.20.2.4 SCOPEVERS=1.20.2.4
GSSPVERS=1.0.0 GSSPVERS=1.0.0
PPAVERS=1 PPAVERS=1
@ -49,7 +49,7 @@ debdir=debian
# 16.04LTS xenial 2021-04 # 16.04LTS xenial 2021-04
# 18.04LTS bionic 2023-04 # 18.04LTS bionic 2023-04
# 20.04LTS focal 2025-04 # 20.04LTS focal 2025-04
series="xenial bionic focal" series="xenial bionic focal groovy"
series= series=
if test "X$series" != X ; then if test "X$series" != X ; then
@ -89,8 +89,8 @@ done
# 16.04LTS xenial 2021-04 # 16.04LTS xenial 2021-04
# 18.04LTS bionic 2023-04 # 18.04LTS bionic 2023-04
# 20.04LTS focal 2025-04 # 20.04LTS focal 2025-04
series="xenial bionic focal" series="xenial bionic focal groovy"
#series= # series=
debdir=debiankio debdir=debiankio
topdir=kio-recoll-${RCLVERS} topdir=kio-recoll-${RCLVERS}
@ -133,7 +133,7 @@ done
# 16.04LTS xenial 2021-04 # 16.04LTS xenial 2021-04
# 18.04LTS bionic 2023-04 # 18.04LTS bionic 2023-04
# 20.04LTS focal 2025-04 # 20.04LTS focal 2025-04
series="xenial bionic focal" series="xenial bionic focal groovy"
series= series=
debdir=debiangssp debdir=debiangssp

View File

@ -1,3 +1,15 @@
recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Process PDF annotations if the poppler-glib Python bindings are available
* Fix build error with some compiler versions
* Fix inconsistent lock file name issue which could result in background
indexer exit.
* recollq: new -p option to show snippets lists instead of abstracts with
-A
* Fix nonumbers option.
-- Jean-Francois Dockes <jf@dockes.org> Sun, 13 Sep 2020 13:29:00 +0200
recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Misc small fixes. * Misc small fixes.

View File

@ -1,3 +1,15 @@
kio-recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Process PDF annotations if the poppler-glib Python bindings are available
* Fix build error with some compiler versions
* Fix inconsistent lock file name issue which could result in background
indexer exit.
* recollq: new -p option to show snippets lists instead of abstracts with
-A
* Fix nonumbers option.
-- Jean-Francois Dockes <jf@dockes.org> Sun, 13 Sep 2020 13:29:00 +0200
kio-recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low kio-recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Misc small fixes. * Misc small fixes.

View File

@ -1 +1 @@
1.27.6 1.27.7

View File

@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40};
static const int o_CJKMaxNgramLen{5}; static const int o_CJKMaxNgramLen{5};
bool o_exthangultagger{false}; bool o_exthangultagger{false};
// This is changed to 0 if _ is processed as a letter
static char underscoreatend = '_';
void TextSplit::staticConfInit(RclConfig *config) void TextSplit::staticConfInit(RclConfig *config)
{ {
config->getConfParam("maxtermlength", &o_maxWordLength); config->getConfParam("maxtermlength", &o_maxWordLength);
@ -118,6 +121,14 @@ void TextSplit::staticConfInit(RclConfig *config)
} }
} }
bvalue = false;
if (config->getConfParam("underscoreasletter", &bvalue)) {
if (bvalue) {
charclasses[int('_')] = A_LLETTER;
underscoreatend = 0;
}
}
string kotagger; string kotagger;
config->getConfParam("hangultagger", kotagger); config->getConfParam("hangultagger", kotagger);
if (!kotagger.empty()) { if (!kotagger.empty()) {
@ -550,26 +561,26 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
// Maybe trim at end. These are chars that we might keep // Maybe trim at end. These are chars that we might keep
// inside a span, but not at the end. // inside a span, but not at the end.
while (m_span.length() > 0) { string::size_type trimsz{0};
switch (*(m_span.rbegin())) { while (trimsz < m_span.length()) {
case '.': auto c = m_span[m_span.length() - 1 - trimsz];
case '-': if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' ||
case ',': c == underscoreatend) {
case '@': trimsz++;
case '_':
case '\'':
m_span.resize(m_span.length()-1);
if (m_words_in_span.size() && if (m_words_in_span.size() &&
m_words_in_span.back().second > int(m_span.size())) m_words_in_span.back().second > int(m_span.size())) {
m_words_in_span.back().second = int(m_span.size()); m_words_in_span.back().second = int(m_span.size());
if (--bp < 0) }
if (--bp < 0) {
bp = 0; bp = 0;
}
} else {
break; break;
default:
goto breaktrimloop;
} }
} }
breaktrimloop: if (trimsz > 0) {
m_span.resize(m_span.length() - trimsz);
}
if (!words_from_span(bp)) { if (!words_from_span(bp)) {
return false; return false;
@ -855,7 +866,7 @@ bool TextSplit::text_to_words(const string &in)
goto SPACE; goto SPACE;
case '@': case '@':
case '_': case '_': // If underscoreasletter is set, we'll never get this
case '\'': case '\'':
// If in word, potential span: o'brien, jf@dockes.org, // If in word, potential span: o'brien, jf@dockes.org,
// else just ignore // else just ignore

View File

@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows
restoring the previous behaviour.</para></listitem></varlistentry> restoring the previous behaviour.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER"> <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">
<term><varname>backslashasletter</varname></term> <term><varname>backslashasletter</varname></term>
<listitem><para>Process backslash as normal letter This may make sense for people wanting to index TeX commands as <listitem><para>Process backslash as normal letter. This may make sense for people wanting to index TeX commands as
such but is not of much general use.</para></listitem></varlistentry> such but is not of much general use.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER">
<term><varname>underscoreasletter</varname></term>
<listitem><para>Process underscore as normal letter. This makes sense in so many cases that one wonders if it should
not be the default.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH"> <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH">
<term><varname>maxtermlength</varname></term> <term><varname>maxtermlength</varname></term>
<listitem><para>Maximum term length. Words longer than this will be discarded. <listitem><para>Maximum term length. Words longer than this will be discarded.
@ -758,7 +762,7 @@ script. Example values: eng, fra... See the tesseract documentation.</para></lis
<term><varname>tesseractcmd</varname></term> <term><varname>tesseractcmd</varname></term>
<listitem><para>Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default <listitem><para>Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default
tesseract command. e.g. on Windows: tesseract command. e.g. on Windows:
C:/Program&nbsp;Files&nbsp;(x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry> C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG"> <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG">
<term><varname>abbyylang</varname></term> <term><varname>abbyylang</varname></term>
<listitem><para>Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set <listitem><para>Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set

View File

@ -8933,12 +8933,21 @@ for i in range(nres):
id= id=
"RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER"></a><span class="term"><code class="varname">backslashasletter</code></span></dt> "RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER"></a><span class="term"><code class="varname">backslashasletter</code></span></dt>
<dd> <dd>
<p>Process backslash as normal letter This may <p>Process backslash as normal letter. This may
make sense for people wanting to index TeX make sense for people wanting to index TeX
commands as such but is not of much general commands as such but is not of much general
use.</p> use.</p>
</dd> </dd>
<dt><a name= <dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"
id=
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"></a><span class="term"><code class="varname">underscoreasletter</code></span></dt>
<dd>
<p>Process underscore as normal letter. This
makes sense in so many cases that one wonders if
it should not be the default.</p>
</dd>
<dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH" id= "RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH" id=
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH"></a><span class="term"><code class="varname">maxtermlength</code></span></dt> "RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH"></a><span class="term"><code class="varname">maxtermlength</code></span></dt>
<dd> <dd>
@ -9865,7 +9874,7 @@ for i in range(nres):
<p>Path for the tesseract command. This is mostly <p>Path for the tesseract command. This is mostly
useful on Windows, or for specifying a useful on Windows, or for specifying a
non-default tesseract command. e.g. on Windows: non-default tesseract command. e.g. on Windows:
C:/Program&nbsp;Files&nbsp;(x86)/Tesseract-OCR/tesseract.exe</p> C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</p>
</dd> </dd>
<dt><a name= <dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG" id= "RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG" id=

View File

@ -317,12 +317,19 @@ indexStoreDocText = 1
#dehyphenate = 1 #dehyphenate = 1
# <var name="backslashasletter" type="bool"> # <var name="backslashasletter" type="bool">
# <brief>Process backslash as normal letter</brief> # <brief>Process backslash as normal letter.</brief>
# <descr>This may make sense for people wanting to index TeX commands as # <descr>This may make sense for people wanting to index TeX commands as
# such but is not of much general use.</descr> # such but is not of much general use.</descr>
# </var> # </var>
#backslashasletter = 0 #backslashasletter = 0
# <var name="underscoreasletter" type="bool">
# <brief>Process underscore as normal letter.</brief>
# <descr>This makes sense in so many cases that one wonders if it should
# not be the default.</descr>
# </var>
#underscoreasletter = 0
# <var name="maxtermlength" type="int" values="10 200 40"> # <var name="maxtermlength" type="int" values="10 200 40">
# <brief>Maximum term length.</brief> # <brief>Maximum term length.</brief>
# <descr>Words longer than this will be discarded. # <descr>Words longer than this will be discarded.

View File

@ -265,6 +265,7 @@ int main(int argc, char **argv)
if (!kotagger.empty()) { if (!kotagger.empty()) {
fprintf(fp, "hangultagger = %s\n", kotagger.c_str()); fprintf(fp, "hangultagger = %s\n", kotagger.c_str());
} }
fprintf(fp, "underscoreasletter = 0\n");
fclose(fp); fclose(fp);
Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel)); Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));