add underscoreasletter config variable to process _ as a letter
This commit is contained in:
parent
efaa4796b1
commit
df09d65a4e
@ -118,6 +118,13 @@ void TextSplit::staticConfInit(RclConfig *config)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bvalue = false;
|
||||||
|
if (config->getConfParam("underscoreasletter", &bvalue)) {
|
||||||
|
if (bvalue) {
|
||||||
|
charclasses[int('_')] = A_LLETTER;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
string kotagger;
|
string kotagger;
|
||||||
config->getConfParam("hangultagger", kotagger);
|
config->getConfParam("hangultagger", kotagger);
|
||||||
if (!kotagger.empty()) {
|
if (!kotagger.empty()) {
|
||||||
@ -855,7 +862,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
goto SPACE;
|
goto SPACE;
|
||||||
|
|
||||||
case '@':
|
case '@':
|
||||||
case '_':
|
case '_': // If underscoreasletter is set, we'll never get this
|
||||||
case '\'':
|
case '\'':
|
||||||
// If in word, potential span: o'brien, jf@dockes.org,
|
// If in word, potential span: o'brien, jf@dockes.org,
|
||||||
// else just ignore
|
// else just ignore
|
||||||
|
|||||||
@ -253,8 +253,12 @@ in version 1.22, and on by default. Setting the variable to off allows
|
|||||||
restoring the previous behaviour.</para></listitem></varlistentry>
|
restoring the previous behaviour.</para></listitem></varlistentry>
|
||||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">
|
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">
|
||||||
<term><varname>backslashasletter</varname></term>
|
<term><varname>backslashasletter</varname></term>
|
||||||
<listitem><para>Process backslash as normal letter This may make sense for people wanting to index TeX commands as
|
<listitem><para>Process backslash as normal letter. This may make sense for people wanting to index TeX commands as
|
||||||
such but is not of much general use.</para></listitem></varlistentry>
|
such but is not of much general use.</para></listitem></varlistentry>
|
||||||
|
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER">
|
||||||
|
<term><varname>underscoreasletter</varname></term>
|
||||||
|
<listitem><para>Process underscore as normal letter. This makes sense in so many cases that one wonders if it should
|
||||||
|
not be the default.</para></listitem></varlistentry>
|
||||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH">
|
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH">
|
||||||
<term><varname>maxtermlength</varname></term>
|
<term><varname>maxtermlength</varname></term>
|
||||||
<listitem><para>Maximum term length. Words longer than this will be discarded.
|
<listitem><para>Maximum term length. Words longer than this will be discarded.
|
||||||
@ -758,7 +762,7 @@ script. Example values: eng, fra... See the tesseract documentation.</para></lis
|
|||||||
<term><varname>tesseractcmd</varname></term>
|
<term><varname>tesseractcmd</varname></term>
|
||||||
<listitem><para>Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default
|
<listitem><para>Path for the tesseract command. This is mostly useful on Windows, or for specifying a non-default
|
||||||
tesseract command. e.g. on Windows:
|
tesseract command. e.g. on Windows:
|
||||||
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
|
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</para></listitem></varlistentry>
|
||||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG">
|
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG">
|
||||||
<term><varname>abbyylang</varname></term>
|
<term><varname>abbyylang</varname></term>
|
||||||
<listitem><para>Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set
|
<listitem><para>Language to assume for abbyy OCR. Important for improving the OCR accuracy. This can also be set
|
||||||
|
|||||||
@ -8933,12 +8933,21 @@ for i in range(nres):
|
|||||||
id=
|
id=
|
||||||
"RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER"></a><span class="term"><code class="varname">backslashasletter</code></span></dt>
|
"RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER"></a><span class="term"><code class="varname">backslashasletter</code></span></dt>
|
||||||
<dd>
|
<dd>
|
||||||
<p>Process backslash as normal letter This may
|
<p>Process backslash as normal letter. This may
|
||||||
make sense for people wanting to index TeX
|
make sense for people wanting to index TeX
|
||||||
commands as such but is not of much general
|
commands as such but is not of much general
|
||||||
use.</p>
|
use.</p>
|
||||||
</dd>
|
</dd>
|
||||||
<dt><a name=
|
<dt><a name=
|
||||||
|
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"
|
||||||
|
id=
|
||||||
|
"RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER"></a><span class="term"><code class="varname">underscoreasletter</code></span></dt>
|
||||||
|
<dd>
|
||||||
|
<p>Process underscore as normal letter. This
|
||||||
|
makes sense in so many cases that one wonders if
|
||||||
|
it should not be the default.</p>
|
||||||
|
</dd>
|
||||||
|
<dt><a name=
|
||||||
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH" id=
|
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH" id=
|
||||||
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH"></a><span class="term"><code class="varname">maxtermlength</code></span></dt>
|
"RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH"></a><span class="term"><code class="varname">maxtermlength</code></span></dt>
|
||||||
<dd>
|
<dd>
|
||||||
@ -9865,7 +9874,7 @@ for i in range(nres):
|
|||||||
<p>Path for the tesseract command. This is mostly
|
<p>Path for the tesseract command. This is mostly
|
||||||
useful on Windows, or for specifying a
|
useful on Windows, or for specifying a
|
||||||
non-default tesseract command. e.g. on Windows:
|
non-default tesseract command. e.g. on Windows:
|
||||||
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</p>
|
C:/Program Files (x86)/Tesseract-OCR/tesseract.exe</p>
|
||||||
</dd>
|
</dd>
|
||||||
<dt><a name=
|
<dt><a name=
|
||||||
"RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG" id=
|
"RCL.INSTALL.CONFIG.RECOLLCONF.ABBYYLANG" id=
|
||||||
|
|||||||
@ -317,12 +317,19 @@ indexStoreDocText = 1
|
|||||||
#dehyphenate = 1
|
#dehyphenate = 1
|
||||||
|
|
||||||
# <var name="backslashasletter" type="bool">
|
# <var name="backslashasletter" type="bool">
|
||||||
# <brief>Process backslash as normal letter</brief>
|
# <brief>Process backslash as normal letter.</brief>
|
||||||
# <descr>This may make sense for people wanting to index TeX commands as
|
# <descr>This may make sense for people wanting to index TeX commands as
|
||||||
# such but is not of much general use.</descr>
|
# such but is not of much general use.</descr>
|
||||||
# </var>
|
# </var>
|
||||||
#backslashasletter = 0
|
#backslashasletter = 0
|
||||||
|
|
||||||
|
# <var name="underscoreasletter" type="bool">
|
||||||
|
# <brief>Process underscore as normal letter.</brief>
|
||||||
|
# <descr>This makes sense in so many cases that one wonders if it should
|
||||||
|
# not be the default.</descr>
|
||||||
|
# </var>
|
||||||
|
#underscoreasletter = 0
|
||||||
|
|
||||||
# <var name="maxtermlength" type="int" values="10 200 40">
|
# <var name="maxtermlength" type="int" values="10 200 40">
|
||||||
# <brief>Maximum term length.</brief>
|
# <brief>Maximum term length.</brief>
|
||||||
# <descr>Words longer than this will be discarded.
|
# <descr>Words longer than this will be discarded.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user