add underscoreasletter config variable to process _ as a letter
This commit is contained in:
parent
efaa4796b1
commit
df09d65a4e
@ -118,6 +118,13 @@ void TextSplit::staticConfInit(RclConfig *config)
|
||||
}
|
||||
}
|
||||
|
||||
bvalue = false;
|
||||
if (config->getConfParam("underscoreasletter", &bvalue)) {
|
||||
if (bvalue) {
|
||||
charclasses[int('_')] = A_LLETTER;
|
||||
}
|
||||
}
|
||||
|
||||
string kotagger;
|
||||
config->getConfParam("hangultagger", kotagger);
|
||||
if (!kotagger.empty()) {
|
||||
@ -855,7 +862,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
goto SPACE;
|
||||
|
||||
case '@':
|
||||
case '_':
|
||||
case '_': // If underscoreasletter is set, we'll never get this
|
||||
case '\'':
|
||||
// If in word, potential span: o'brien, jf@dockes.org,
|
||||
// else just ignore
|
||||
|
||||
@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows
|
||||
restoring the previous behaviour.</para></listitem></varlistentry>
|
||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">
|
||||
<term><varname>backslashasletter</varname></term>
|
||||
<listitem><para>Process backslash as normal letter This may make sense for people wanting to index TeX commands as
|
||||
<listitem><para>Process backslash as normal letter. This may make sense for people wanting to index TeX commands as
|
||||
such but is not of much general use.</para></listitem></varlistentry>
|
||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER">
|
||||
<term><varname>underscoreasletter</varname></term>
|
||||
<listitem><para>Process underscore as normal letter. This makes sense in so many cases that one wonders if it should
|
||||
not be the default.</para></listitem></varlistentry>
|
||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH">
|
||||
<term><varname>maxtermlength</varname></term>
|
||||
<listitem><para>Maximum term length. Words longer than this will be discarded.
|
||||
@ -758,7 +762,7 @@ script. Example values: eng, fra... See the tesseract documentation.</para></lis
|
||||
<term><varname>tesseractcmd</varname></term>
|
||||
<listitem><para>Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default
|
||||
tesseract command. e.g. on Windows:
|
||||
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
|
||||
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
|
||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG">
|
||||
<term><varname>abbyylang</varname></term>
|
||||
<listitem><para>Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set
|
||||
|
||||
@ -8933,12 +8933,21 @@ for i in range(nres):
|
||||
id=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER"></a><span class="term"><code class="varname">backslashasletter</code></span></dt>
|
||||
<dd>
|
||||
<p>Process backslash as normal letter This may
|
||||
<p>Process backslash as normal letter. This may
|
||||
make sense for people wanting to index TeX
|
||||
commands as such but is not of much general
|
||||
use.</p>
|
||||
</dd>
|
||||
<dt><a name=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"
|
||||
id=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"></a><span class="term"><code class="varname">underscoreasletter</code></span></dt>
|
||||
<dd>
|
||||
<p>Process underscore as normal letter. This
|
||||
makes sense in so many cases that one wonders if
|
||||
it should not be the default.</p>
|
||||
</dd>
|
||||
<dt><a name=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH" id=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH"></a><span class="term"><code class="varname">maxtermlength</code></span></dt>
|
||||
<dd>
|
||||
@ -9865,7 +9874,7 @@ for i in range(nres):
|
||||
<p>Path for the tesseract command. This is mostly
|
||||
useful on Windows, or for specifying a
|
||||
non-default tesseract command. e.g. on Windows:
|
||||
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</p>
|
||||
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</p>
|
||||
</dd>
|
||||
<dt><a name=
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG" id=
|
||||
|
||||
@ -317,12 +317,19 @@ indexStoreDocText = 1
|
||||
#dehyphenate = 1
|
||||
|
||||
# <var name="backslashasletter" type="bool">
|
||||
# <brief>Process backslash as normal letter</brief>
|
||||
# <brief>Process backslash as normal letter.</brief>
|
||||
# <descr>This may make sense for people wanting to index TeX commands as
|
||||
# such but is not of much general use.</descr>
|
||||
# </var>
|
||||
#backslashasletter = 0
|
||||
|
||||
# <var name="underscoreasletter" type="bool">
|
||||
# <brief>Process underscore as normal letter.</brief>
|
||||
# <descr>This makes sense in so many cases that one wonders if it should
|
||||
# not be the default.</descr>
|
||||
# </var>
|
||||
#underscoreasletter = 0
|
||||
|
||||
# <var name="maxtermlength" type="int" values="10 200 40">
|
||||
# <brief>Maximum term length.</brief>
|
||||
# <descr>Words longer than this will be discarded.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user