add underscoreasletter config variable to process _ as a letter

This commit is contained in:
Jean-Francois Dockes 2020-09-13 15:40:28 +02:00
parent efaa4796b1
commit df09d65a4e
4 changed files with 33 additions and 6 deletions

View File

@ -118,6 +118,13 @@ void TextSplit::staticConfInit(RclConfig *config)
}
}
bvalue = false;
if (config->getConfParam("underscoreasletter", &bvalue)) {
if (bvalue) {
charclasses[int('_')] = A_LLETTER;
}
}
string kotagger;
config->getConfParam("hangultagger", kotagger);
if (!kotagger.empty()) {
@ -855,7 +862,7 @@ bool TextSplit::text_to_words(const string &in)
goto SPACE;
case '@':
case '_':
case '_': // If underscoreasletter is set, we'll never get this
case '\'':
// If in word, potential span: o'brien, jf@dockes.org,
// else just ignore

View File

@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows
restoring the previous behaviour.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">
<term><varname>backslashasletter</varname></term>
<listitem><para>Process backslash as normal letter This may make sense for people wanting to index TeX commands as
<listitem><para>Process backslash as normal letter. This may make sense for people wanting to index TeX commands as
such but is not of much general use.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER">
<term><varname>underscoreasletter</varname></term>
<listitem><para>Process underscore as normal letter. This makes sense in so many cases that one wonders if it should
not be the default.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH">
<term><varname>maxtermlength</varname></term>
<listitem><para>Maximum term length. Words longer than this will be discarded.
@ -758,7 +762,7 @@ script. Example values: eng, fra... See the tesseract documentation.</para></lis
<term><varname>tesseractcmd</varname></term>
<listitem><para>Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default
tesseract command. e.g. on Windows:
C:/Program&nbsp;Files&nbsp;(x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG">
<term><varname>abbyylang</varname></term>
<listitem><para>Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set

View File

@ -8933,12 +8933,21 @@ for i in range(nres):
id=
"RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER"></a><span class="term"><code class="varname">backslashasletter</code></span></dt>
<dd>
<p>Process backslash as normal letter This may
<p>Process backslash as normal letter. This may
make sense for people wanting to index TeX
commands as such but is not of much general
use.</p>
</dd>
<dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"
id=
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"></a><span class="term"><code class="varname">underscoreasletter</code></span></dt>
<dd>
<p>Process underscore as normal letter. This
makes sense in so many cases that one wonders if
it should not be the default.</p>
</dd>
<dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH" id=
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH"></a><span class="term"><code class="varname">maxtermlength</code></span></dt>
<dd>
@ -9865,7 +9874,7 @@ for i in range(nres):
<p>Path for the tesseract command. This is mostly
useful on Windows, or for specifying a
non-default tesseract command. e.g. on Windows:
C:/Program&nbsp;Files&nbsp;(x86)/Tesseract-OCR/tesseract.exe</p>
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</p>
</dd>
<dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG" id=

View File

@ -317,12 +317,19 @@ indexStoreDocText = 1
#dehyphenate = 1
# <var name="backslashasletter" type="bool">
# <brief>Process backslash as normal letter</brief>
# <brief>Process backslash as normal letter.</brief>
# <descr>This may make sense for people wanting to index TeX commands as
# such but is not of much general use.</descr>
# </var>
#backslashasletter = 0
# <var name="underscoreasletter" type="bool">
# <brief>Process underscore as normal letter.</brief>
# <descr>This makes sense in so many cases that one wonders if it should
# not be the default.</descr>
# </var>
#underscoreasletter = 0
# <var name="maxtermlength" type="int" values="10 200 40">
# <brief>Maximum term length.</brief>
# <descr>Words longer than this will be discarded.