Added option nonumbers not to generate terms for numbers. closes #16
This commit is contained in:
parent
d0e56e361f
commit
48358c8252
@ -203,6 +203,12 @@ bool RclConfig::updateMainConfig()
|
||||
TextSplit::cjkProcessing(true);
|
||||
}
|
||||
}
|
||||
|
||||
bool nonum = false;
|
||||
if (getConfParam("nonumbers", &nonum) && nonum == true) {
|
||||
TextSplit::noNumbers();
|
||||
}
|
||||
|
||||
m_skpnstate.init(this, m_conf, "skippedNames");
|
||||
m_rmtstate.init(this, m_conf, "indexedmimetypes");
|
||||
return true;
|
||||
|
||||
@ -164,6 +164,7 @@ bool TextSplit::isCJK(int c)
|
||||
|
||||
bool TextSplit::o_processCJK = true;
|
||||
unsigned int TextSplit::o_CJKNgramLen = 2;
|
||||
bool TextSplit::o_noNumbers = false;
|
||||
|
||||
// Do some checking (the kind which is simpler to do here than in the
|
||||
// main loop), then send term to our client.
|
||||
@ -212,12 +213,15 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||
*/
|
||||
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
||||
{
|
||||
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d\n",
|
||||
span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
|
||||
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d "
|
||||
"innum %d\n", m_span.c_str(), m_spanpos, m_wordStart,
|
||||
m_wordLen, spanerase, bp, m_inNumber));
|
||||
|
||||
// Emit span. When splitting for query, we only emit final spans
|
||||
bool spanemitted = false;
|
||||
if (!(m_flags & TXTS_NOSPANS) &&
|
||||
!((m_wordLen == m_span.length()) &&
|
||||
(o_noNumbers) && m_inNumber) &&
|
||||
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
|
||||
// Maybe trim at end. These are chars that we would keep inside
|
||||
// a span, but not at the end
|
||||
@ -243,6 +247,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
||||
|
||||
// Emit word if different from span and not 'no words' mode
|
||||
if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen &&
|
||||
!(o_noNumbers && m_inNumber) &&
|
||||
(!spanemitted || m_wordLen != m_span.length())) {
|
||||
string s(m_span.substr(m_wordStart, m_wordLen));
|
||||
if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
|
||||
@ -494,6 +499,9 @@ bool TextSplit::text_to_words(const string &in)
|
||||
|
||||
default:
|
||||
NORMALCHAR:
|
||||
if (m_inNumber && c != 'e' && c != 'E') {
|
||||
m_inNumber = false;
|
||||
}
|
||||
m_wordLen += it.appendchartostring(m_span);
|
||||
break;
|
||||
}
|
||||
@ -746,10 +754,12 @@ class myTextSplit : public TextSplit {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
static string teststring =
|
||||
"Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n"
|
||||
"\"Jean-Francois Dockes\" <jfd@okyz.com>\n"
|
||||
"n@d @net .net t@v@c c# c++ o'brien 'o'brien' l'ami\n"
|
||||
"data123\n"
|
||||
"134 +134 -14 -1.5 +1.5 1.54e10 1,2 1,2e30\n"
|
||||
"@^#$(#$(*)\n"
|
||||
"192.168.4.1 one\n\rtwo\r"
|
||||
@ -762,6 +772,7 @@ static string teststring =
|
||||
" -wl,--export-dynamic "
|
||||
" ~/.xsession-errors "
|
||||
;
|
||||
|
||||
static string teststring1 = " nouvel-an ";
|
||||
|
||||
static string thisprog;
|
||||
@ -771,6 +782,7 @@ static string usage =
|
||||
" -S: no output\n"
|
||||
" -s: only spans\n"
|
||||
" -w: only words\n"
|
||||
" -n: no numbers\n"
|
||||
" -k: preserve wildcards (?*)\n"
|
||||
" -c: just count words\n"
|
||||
" -C [charset] : input charset\n"
|
||||
@ -792,6 +804,7 @@ static int op_flags;
|
||||
#define OPT_c 0x8
|
||||
#define OPT_k 0x10
|
||||
#define OPT_C 0x20
|
||||
#define OPT_n 0x40
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
@ -811,6 +824,7 @@ int main(int argc, char **argv)
|
||||
charset = *(++argv); argc--;
|
||||
goto b1;
|
||||
case 'k': op_flags |= OPT_k; break;
|
||||
case 'n': op_flags |= OPT_n; break;
|
||||
case 's': op_flags |= OPT_s; break;
|
||||
case 'S': op_flags |= OPT_S; break;
|
||||
case 'w': op_flags |= OPT_w; break;
|
||||
@ -829,6 +843,8 @@ int main(int argc, char **argv)
|
||||
flags = TextSplit::TXTS_NOSPANS;
|
||||
if (op_flags & OPT_k)
|
||||
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
|
||||
if (op_flags & OPT_n)
|
||||
TextSplit::noNumbers();
|
||||
|
||||
string odata, reason;
|
||||
if (argc == 1) {
|
||||
|
||||
@ -37,7 +37,8 @@ class Utf8Iter;
|
||||
class TextSplit {
|
||||
public:
|
||||
// Should we activate special processing of Chinese characters ? This
|
||||
// needs a little more cpu, so it can be turned off globally.
|
||||
// needs a little more cpu, so it can be turned off globally. This is set
|
||||
// by rclconfig, changing it means reindexing
|
||||
static bool o_processCJK;
|
||||
static unsigned int o_CJKNgramLen;
|
||||
static const unsigned int o_CJKMaxNgramLen = 5;
|
||||
@ -48,6 +49,13 @@ public:
|
||||
ngramlen : o_CJKMaxNgramLen;
|
||||
}
|
||||
|
||||
// Should we skip generating terms for numbers ? Set by rclconfig. Changing it needs a reindex
|
||||
static bool o_noNumbers;
|
||||
// Turn off term generation for numbers by setting the o_noNumbers
// class-wide flag to true.
static void noNumbers()
|
||||
{
|
||||
o_noNumbers = true;
|
||||
}
|
||||
|
||||
enum Flags {TXTS_NONE = 0,
|
||||
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
||||
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2001, 2003 The FreeBSD Documentation Project
|
||||
* Copyright (c) 2001, 2003, 2010 The FreeBSD Documentation Project
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@ -23,7 +23,7 @@
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $FreeBSD: doc/share/misc/docbook.css,v 1.14 2008/11/21 07:28:34 keramida Exp $
|
||||
* $FreeBSD: doc/share/misc/docbook.css,v 1.15 2010/03/20 04:15:01 hrs Exp $
|
||||
*/
|
||||
|
||||
BODY ADDRESS {
|
||||
@ -201,3 +201,8 @@ BLOCKQUOTE.WARNING {
|
||||
padding: 0.2em 2em;
|
||||
width: 90%;
|
||||
}
|
||||
|
||||
/* Left and right padding for TD cells of TABLE.CALSTABLE elements nested
   inside an .INFORMALTABLE container. */
.INFORMALTABLE TABLE.CALSTABLE TR TD {
|
||||
padding-left: 1em;
|
||||
padding-right: 1em;
|
||||
}
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
<!DOCTYPE BOOK PUBLIC "-//FreeBSD//DTD DocBook V4.1-Based Extension//EN" [
|
||||
|
||||
<!ENTITY RCL "<application>Recoll</application>">
|
||||
<!ENTITY RCLVERSION "1.12-1.13">
|
||||
<!ENTITY XAP "<application>Xapian</application>">
|
||||
|
||||
]>
|
||||
|
||||
<book lang="en">
|
||||
@ -10,7 +10,6 @@
|
||||
<bookinfo>
|
||||
<title>Recoll user manual</title>
|
||||
|
||||
|
||||
<author>
|
||||
<firstname>Jean-Francois</firstname>
|
||||
<surname>Dockes</surname>
|
||||
@ -41,7 +40,7 @@
|
||||
|
||||
<sect1 id="rcl.introduction.tryit">
|
||||
<title>Giving it a try</title>
|
||||
|
||||
|
||||
<para>If you do not like reading manuals (who does?) and would
|
||||
like to give &RCL; a try, just perform <link
|
||||
linkend="rcl.install.binary">installation</link> and start the
|
||||
@ -2956,8 +2955,14 @@ while query.next >= 0 and query.next < nres:
|
||||
configuration, click <guimenu>Cancel</guimenu>, and edit
|
||||
the configuration file before restarting the command. This
|
||||
will start the initial indexing, which may take some time.</para>
|
||||
|
||||
<para>Parameters affecting what we index:</para>
|
||||
|
||||
<para>Most of the following parameters can be changed from the
|
||||
<guilabel>Index Configuration</guilabel> menu in the
|
||||
<command>recoll</command> interface. Some can only be set by
|
||||
editing the configuration file.</para>
|
||||
|
||||
<sect3 id="rcl.install.config.recollconf.files">
|
||||
<title>Parameters affecting what documents we index:</title>
|
||||
|
||||
<variablelist>
|
||||
|
||||
@ -3116,9 +3121,95 @@ skippedPaths = ~/somedir/∗.txt
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</sect3>
|
||||
|
||||
<sect3 id="rcl.install.config.recollconf.terms">
|
||||
<title>Parameters affecting how we generate terms:</title>
|
||||
|
||||
<para>Parameters affecting where and how we store things:</para>
|
||||
<para>Changing some of these parameters will imply a full
|
||||
reindex. Also, when using multiple indexes, it may not make sense
|
||||
to search indexes that don't share the values for these parameters,
|
||||
because they usually affect both search and index operations.</para>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry><term><literal>nonumbers</literal></term>
|
||||
<listitem><para>If this is set to true, no terms will be generated
|
||||
for numbers. For example, "123", "1.5e6" and "192.168.1.4" would not
|
||||
be indexed ("value123" would still be). Numbers are often quite
|
||||
interesting to search for, and this should probably not be set
|
||||
except for special situations, e.g. scientific documents with huge
|
||||
amounts of numbers in them. </para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>nocjk</literal></term>
|
||||
<listitem><para>If this is set to true, specific East Asian
|
||||
(Chinese, Korean, Japanese) character/word splitting is
|
||||
turned off. This will save a small amount of cpu if you
|
||||
have no CJK documents. If your document base does include
|
||||
such text but you are not interested in searching it,
|
||||
setting <literal>nocjk</literal> may be a significant time
|
||||
and space saver.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>cjkngramlen</literal></term>
|
||||
<listitem><para>This lets you adjust the size of n-grams
|
||||
used for indexing CJK text. The default value of 2 is
|
||||
probably appropriate in most cases. A value of 3 would
|
||||
allow more precision and efficiency on longer words, but
|
||||
the index will be approximately twice as large.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry><term><literal>indexstemminglanguages</literal></term>
|
||||
<listitem><para>A list of languages for which the stem
|
||||
expansion databases will be built. See recollindex(1) or
|
||||
use the <literal>recollindex -l</literal> command for
|
||||
possible values. You can add a stem expansion database for
|
||||
a different language by using <command>recollindex
|
||||
-s</command>, but it will be deleted during the next
|
||||
indexing. Only languages listed in the configuration
|
||||
file are permanent.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>defaultcharset</literal></term>
|
||||
<listitem><para>The name of the character set used for
|
||||
files that do not contain a character set definition (ie:
|
||||
plain text files). This can be redefined for any
|
||||
sub-directory. If it is not set at all, the character set
|
||||
used is the one defined by the nls environment (LC_ALL,
|
||||
LC_CTYPE, LANG), or iso8859-1 if nothing is set.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>maildefcharset</literal></term>
|
||||
<listitem><para>This can be used to define the default
|
||||
character set specifically for mail messages which don't
|
||||
specify it. This is mainly useful for readpst (libpst) dumps,
|
||||
which are utf-8 but do not say so.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>localfields</literal></term>
|
||||
<listitem><para>This allows setting fields for all documents
|
||||
under a given directory. Typical usage would be to set an
|
||||
"rclaptg" field, to be used in <filename>mimeview</filename> to
|
||||
select a specific viewer. If several fields are to be set, they
|
||||
should be separated with a colon (':') character (which there
|
||||
is currently no way to escape). Ie:
|
||||
<literal>localfields= rclaptg=gnus:other = val</literal>, then
|
||||
select a specific viewer with
|
||||
<literal>mimetype|tag=...</literal> in
|
||||
<filename>mimeview</filename>.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</sect3>
|
||||
|
||||
<sect3 id="rcl.install.config.recollconf.storage">
|
||||
<title>Parameters affecting where and how we store things:</title>
|
||||
|
||||
<variablelist>
|
||||
<varlistentry><term><literal>dbdir</literal></term>
|
||||
@ -3181,8 +3272,11 @@ skippedPaths = ~/somedir/∗.txt
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</sect3>
|
||||
|
||||
<para>Miscellaneous:</para>
|
||||
|
||||
<sect3 id="rcl.install.config.recollconf.misc">
|
||||
<title>Miscellaneous parameters:</title>
|
||||
|
||||
<variablelist>
|
||||
|
||||
@ -3204,57 +3298,12 @@ skippedPaths = ~/somedir/∗.txt
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>indexstemminglanguages</literal></term>
|
||||
<listitem><para>A list of languages for which the stem
|
||||
expansion databases will be built. See recollindex(1) or
|
||||
use the <literal>recollindex -l</literal> command for
|
||||
possible values. You can add a stem expansion database for
|
||||
a different language by using <command>recollindex
|
||||
-s</command>, but it will be deleted during the next
|
||||
indexing. Only languages listed in the configuration
|
||||
file are permanent.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>defaultcharset</literal></term>
|
||||
<listitem><para>The name of the character set used for
|
||||
files that do not contain a character set definition (ie:
|
||||
plain text files). This can be redefined for any
|
||||
sub-directory. If it is not set at all, the character set
|
||||
used is the one defined by the nls environment (LC_ALL,
|
||||
LC_CTYPE, LANG), or iso8859-1 if nothing is set.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>filtermaxseconds</literal></term>
|
||||
<listitem><para>Maximum filter execution time, after which it
|
||||
is aborted. Some postscript programs just loop...</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>maildefcharset</literal></term>
|
||||
<listitem><para>This can be used to define the default
|
||||
character set specifically for mail messages which don't
|
||||
specify it. This is mainly useful for readpst (libpst) dumps,
|
||||
which are utf-8 but do not say so.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>localfields</literal></term>
|
||||
<listitem><para>This allows setting fields for all
|
||||
documents under a given directory. Typical usage would be
|
||||
to set an "rclaptg" field, to be used in
|
||||
<filename>mimeview</filename> to select a specific
|
||||
viewer. If several fields are to be set, they should be
|
||||
separated with a ':' character (which there is currently no way to
|
||||
escape). Ie:
|
||||
<literal>localfields= rclaptg=gnus:other = val</literal>, then
|
||||
select a specific viewer with
|
||||
<literal>mimetype|tag=...</literal> in
|
||||
<filename>mimeview</filename>.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>filtersdir</literal></term>
|
||||
<listitem><para>A directory to search for the external
|
||||
filter scripts used to index some types of files. The
|
||||
@ -3309,26 +3358,6 @@ skippedPaths = ~/somedir/∗.txt
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>nocjk</literal></term>
|
||||
<listitem><para>If this is set to true, specific East Asian
|
||||
(Chinese, Korean, Japanese) character/word splitting is
|
||||
turned off. This will save a small amount of cpu if you
|
||||
have no CJK documents. If your document base does include
|
||||
such text but you are not interested in searching it,
|
||||
setting <literal>nocjk</literal> may be a significant time
|
||||
and space saver.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>cjkngramlen</literal></term>
|
||||
<listitem><para>This lets you adjust the size of n-grams
|
||||
used for indexing CJK text. The default value of 2 is
|
||||
probably appropriate in most cases. A value of 3 would
|
||||
allow more precision and efficiency on longer words, but
|
||||
the index will be approximately twice as large.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry><term><literal>guesscharset</literal></term>
|
||||
<listitem><para>Decide if we try to guess the character
|
||||
set of files if no internal value is available (ie: for
|
||||
|
||||
@ -82,7 +82,10 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
||||
m_reason = (*it)->getReason();
|
||||
return false;
|
||||
}
|
||||
|
||||
if (nq.empty()) {
|
||||
LOGDEB(("SearchData::toNativeQuery: skipping empty clause\n"));
|
||||
continue;
|
||||
}
|
||||
// If this structure is an AND list, must use AND_NOT for excl clauses.
|
||||
// Else this is an OR list, and there can't be excl clauses (checked by
|
||||
// addClause())
|
||||
@ -191,10 +194,11 @@ void SearchData::getUTerms(vector<string>& terms) const
|
||||
class TextSplitQ : public TextSplit {
|
||||
public:
|
||||
TextSplitQ(Flags flags, const StopList &_stops)
|
||||
: TextSplit(flags), stops(_stops), alltermcount(0)
|
||||
: TextSplit(flags), stops(_stops), alltermcount(0), lastpos(0)
|
||||
{}
|
||||
bool takeword(const std::string &interm, int , int, int) {
|
||||
bool takeword(const std::string &interm, int pos, int, int) {
|
||||
alltermcount++;
|
||||
lastpos = pos
|
||||
LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
|
||||
|
||||
// Check if the first letter is a majuscule in which
|
||||
@ -233,6 +237,7 @@ class TextSplitQ : public TextSplit {
|
||||
// Count of terms including stopwords: this is for adjusting
|
||||
// phrase/near slack
|
||||
int alltermcount;
|
||||
int lastpos;
|
||||
};
|
||||
|
||||
// A class used to translate a user compound string (*not* a query
|
||||
@ -456,8 +461,10 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
||||
|
||||
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
||||
// For phrases, give a relevance boost like we do for original terms
|
||||
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
|
||||
splitData->alltermcount, splitData->lastpos));
|
||||
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
|
||||
splitData->alltermcount + slack);
|
||||
splitData->lastpos + 1 + slack);
|
||||
if (op == Xapian::Query::OP_PHRASE)
|
||||
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
|
||||
original_term_wqf_booster);
|
||||
@ -611,7 +618,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
tr.getTerms(m_terms, m_groups);
|
||||
tr.getUTerms(m_uterms);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user