Merge branch 'master' of https://framagit.org/medoc91/recoll
This commit is contained in:
commit
db1d2f48f6
@ -6,7 +6,7 @@
|
||||
|
||||
PPA_KEYID=7808CE96D38B9201
|
||||
|
||||
RCLVERS=1.27.5
|
||||
RCLVERS=1.27.6
|
||||
SCOPEVERS=1.20.2.4
|
||||
GSSPVERS=1.0.0
|
||||
PPAVERS=1
|
||||
@ -49,7 +49,7 @@ debdir=debian
|
||||
# 16.04LTS xenial 2021-04
|
||||
# 18.04LTS bionic 2023-04
|
||||
# 20.04LTS focal 2025-04
|
||||
series="xenial bionic focal"
|
||||
series="xenial bionic focal groovy"
|
||||
series=
|
||||
|
||||
if test "X$series" != X ; then
|
||||
@ -89,8 +89,8 @@ done
|
||||
# 16.04LTS xenial 2021-04
|
||||
# 18.04LTS bionic 2023-04
|
||||
# 20.04LTS focal 2025-04
|
||||
series="xenial bionic focal"
|
||||
#series=
|
||||
series="xenial bionic focal groovy"
|
||||
# series=
|
||||
|
||||
debdir=debiankio
|
||||
topdir=kio-recoll-${RCLVERS}
|
||||
@ -133,7 +133,7 @@ done
|
||||
# 16.04LTS xenial 2021-04
|
||||
# 18.04LTS bionic 2023-04
|
||||
# 20.04LTS focal 2025-04
|
||||
series="xenial bionic focal"
|
||||
series="xenial bionic focal groovy"
|
||||
series=
|
||||
|
||||
debdir=debiangssp
|
||||
|
||||
@ -1,3 +1,15 @@
|
||||
recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||
|
||||
* Process PDF annotations if the poppler-glib Python bindings are available
|
||||
* Fix build error with some compiler versions
|
||||
* Fix inconsistent lock file name issue which could result in background
|
||||
indexer exit.
|
||||
* recollq: new -p option to show snippets lists instead of abstracts with
|
||||
-A
|
||||
* Fix nonumbers option.
|
||||
|
||||
-- Jean-Francois Dockes <jf@dockes.org> Sun, 13 Sep 2020 13:29:00 +0200
|
||||
|
||||
recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||
|
||||
* Misc small fixes.
|
||||
|
||||
@ -1,3 +1,15 @@
|
||||
kio-recoll (1.27.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||
|
||||
* Process PDF annotations if the poppler-glib Python bindings are available
|
||||
* Fix build error with some compiler versions
|
||||
* Fix inconsistent lock file name issue which could result in background
|
||||
indexer exit.
|
||||
* recollq: new -p option to show snippets lists instead of abstracts with
|
||||
-A
|
||||
* Fix nonumbers option.
|
||||
|
||||
-- Jean-Francois Dockes <jf@dockes.org> Sun, 13 Sep 2020 13:29:00 +0200
|
||||
|
||||
kio-recoll (1.27.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||
|
||||
* Misc small fixes.
|
||||
|
||||
@ -1 +1 @@
|
||||
1.27.6
|
||||
1.27.7
|
||||
|
||||
@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40};
|
||||
static const int o_CJKMaxNgramLen{5};
|
||||
bool o_exthangultagger{false};
|
||||
|
||||
// This is changed to 0 if _ is processed as a letter
|
||||
static char underscoreatend = '_';
|
||||
|
||||
void TextSplit::staticConfInit(RclConfig *config)
|
||||
{
|
||||
config->getConfParam("maxtermlength", &o_maxWordLength);
|
||||
@ -118,6 +121,14 @@ void TextSplit::staticConfInit(RclConfig *config)
|
||||
}
|
||||
}
|
||||
|
||||
bvalue = false;
|
||||
if (config->getConfParam("underscoreasletter", &bvalue)) {
|
||||
if (bvalue) {
|
||||
charclasses[int('_')] = A_LLETTER;
|
||||
underscoreatend = 0;
|
||||
}
|
||||
}
|
||||
|
||||
string kotagger;
|
||||
config->getConfParam("hangultagger", kotagger);
|
||||
if (!kotagger.empty()) {
|
||||
@ -550,26 +561,26 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
|
||||
|
||||
// Maybe trim at end. These are chars that we might keep
|
||||
// inside a span, but not at the end.
|
||||
while (m_span.length() > 0) {
|
||||
switch (*(m_span.rbegin())) {
|
||||
case '.':
|
||||
case '-':
|
||||
case ',':
|
||||
case '@':
|
||||
case '_':
|
||||
case '\'':
|
||||
m_span.resize(m_span.length()-1);
|
||||
string::size_type trimsz{0};
|
||||
while (trimsz < m_span.length()) {
|
||||
auto c = m_span[m_span.length() - 1 - trimsz];
|
||||
if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' ||
|
||||
c == underscoreatend) {
|
||||
trimsz++;
|
||||
if (m_words_in_span.size() &&
|
||||
m_words_in_span.back().second > int(m_span.size()))
|
||||
m_words_in_span.back().second > int(m_span.size())) {
|
||||
m_words_in_span.back().second = int(m_span.size());
|
||||
if (--bp < 0)
|
||||
}
|
||||
if (--bp < 0) {
|
||||
bp = 0;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
default:
|
||||
goto breaktrimloop;
|
||||
}
|
||||
}
|
||||
breaktrimloop:
|
||||
if (trimsz > 0) {
|
||||
m_span.resize(m_span.length() - trimsz);
|
||||
}
|
||||
|
||||
if (!words_from_span(bp)) {
|
||||
return false;
|
||||
@ -855,7 +866,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
goto SPACE;
|
||||
|
||||
case '@':
|
||||
case '_':
|
||||
case '_': // If underscoreasletter is set, we'll never get this
|
||||
case '\'':
|
||||
// If in word, potential span: o'brien, jf@dockes.org,
|
||||
// else just ignore
|
||||
|
||||
@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows
|
||||
restoring the previous behaviour.</para></listitem></varlistentry>
|
||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">
|
||||
<term><varname>backslashasletter</varname></term>
|
||||
<listitem><para>Process backslash as normal letter This may make sense for people wanting to index TeX commands as
|
||||
<listitem><para>Process backslash as normal letter. This may make sense for people wanting to index TeX commands as
|
||||
such but is not of much general use.</para></listitem></varlistentry>
|
||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER">
|
||||
<term><varname>underscoreasletter</varname></term>
|
||||
<listitem><para>Process underscore as normal letter. This makes sense in so many cases that one wonders if it should
|
||||
not be the default.</para></listitem></varlistentry>
|
||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH">
|
||||
<term><varname>maxtermlength</varname></term>
|
||||
<listitem><para>Maximum term length. Words longer than this will be discarded.
|
||||
@ -758,7 +762,7 @@ script. Example values: eng, fra... See the tesseract documentation.</para></lis
|
||||
<term><varname>tesseractcmd</varname></term>
|
||||
<listitem><para>Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default
|
||||
tesseract command. e.g. on Windows:
|
||||
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
|
||||
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
|
||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG">
|
||||
<term><varname>abbyylang</varname></term>
|
||||
<listitem><para>Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set
|
||||
|
||||
@ -8933,12 +8933,21 @@ for i in range(nres):
|
||||
id=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER"></a><span class="term"><code class="varname">backslashasletter</code></span></dt>
|
||||
<dd>
|
||||
<p>Process backslash as normal letter This may
|
||||
<p>Process backslash as normal letter. This may
|
||||
make sense for people wanting to index TeX
|
||||
commands as such but is not of much general
|
||||
use.</p>
|
||||
</dd>
|
||||
<dt><a name=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"
|
||||
id=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"></a><span class="term"><code class="varname">underscoreasletter</code></span></dt>
|
||||
<dd>
|
||||
<p>Process underscore as normal letter. This
|
||||
makes sense in so many cases that one wonders if
|
||||
it should not be the default.</p>
|
||||
</dd>
|
||||
<dt><a name=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH" id=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH"></a><span class="term"><code class="varname">maxtermlength</code></span></dt>
|
||||
<dd>
|
||||
@ -9865,7 +9874,7 @@ for i in range(nres):
|
||||
<p>Path for the tesseract command. This is mostly
|
||||
useful on Windows, or for specifying a
|
||||
non-default tesseract command. e.g. on Windows:
|
||||
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</p>
|
||||
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</p>
|
||||
</dd>
|
||||
<dt><a name=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG" id=
|
||||
|
||||
@ -317,12 +317,19 @@ indexStoreDocText = 1
|
||||
#dehyphenate = 1
|
||||
|
||||
# <var name="backslashasletter" type="bool">
|
||||
# <brief>Process backslash as normal letter</brief>
|
||||
# <brief>Process backslash as normal letter.</brief>
|
||||
# <descr>This may make sense for people wanting to index TeX commands as
|
||||
# such but is not of much general use.</descr>
|
||||
# </var>
|
||||
#backslashasletter = 0
|
||||
|
||||
# <var name="underscoreasletter" type="bool">
|
||||
# <brief>Process underscore as normal letter.</brief>
|
||||
# <descr>This makes sense in so many cases that one wonders if it should
|
||||
# not be the default.</descr>
|
||||
# </var>
|
||||
#underscoreasletter = 0
|
||||
|
||||
# <var name="maxtermlength" type="int" values="10 200 40">
|
||||
# <brief>Maximum term length.</brief>
|
||||
# <descr>Words longer than this will be discarded.
|
||||
|
||||
@ -265,6 +265,7 @@ int main(int argc, char **argv)
|
||||
if (!kotagger.empty()) {
|
||||
fprintf(fp, "hangultagger = %s\n", kotagger.c_str());
|
||||
}
|
||||
fprintf(fp, "underscoreasletter = 0\n");
|
||||
fclose(fp);
|
||||
|
||||
Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user