Added option nonumbers not to generate terms for numbers. closes #16

This commit is contained in:
Jean-Francois Dockes 2010-05-05 10:18:56 +02:00
parent d0e56e361f
commit 48358c8252
6 changed files with 153 additions and 82 deletions

View File

@ -203,6 +203,12 @@ bool RclConfig::updateMainConfig()
TextSplit::cjkProcessing(true);
}
}
bool nonum = false;
if (getConfParam("nonumbers", &nonum) && nonum == true) {
TextSplit::noNumbers();
}
m_skpnstate.init(this, m_conf, "skippedNames");
m_rmtstate.init(this, m_conf, "indexedmimetypes");
return true;

View File

@ -164,6 +164,7 @@ bool TextSplit::isCJK(int c)
bool TextSplit::o_processCJK = true;
unsigned int TextSplit::o_CJKNgramLen = 2;
bool TextSplit::o_noNumbers = false;
// Do some checking (the kind which is simpler to do here than in the
// main loop), then send term to our client.
@ -212,12 +213,15 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
*/
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
{
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d\n",
span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d "
"innum %d\n", m_span.c_str(), m_spanpos, m_wordStart,
m_wordLen, spanerase, bp, m_inNumber));
// Emit span. When splitting for query, we only emit final spans
bool spanemitted = false;
if (!(m_flags & TXTS_NOSPANS) &&
!((m_wordLen == m_span.length()) &&
(o_noNumbers) && m_inNumber) &&
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
// Maybe trim at end. These are chars that we would keep inside
// a span, but not at the end
@ -243,6 +247,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
// Emit word if different from span and not 'no words' mode
if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen &&
!(o_noNumbers && m_inNumber) &&
(!spanemitted || m_wordLen != m_span.length())) {
string s(m_span.substr(m_wordStart, m_wordLen));
if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
@ -494,6 +499,9 @@ bool TextSplit::text_to_words(const string &in)
default:
NORMALCHAR:
if (m_inNumber && c != 'e' && c != 'E') {
m_inNumber = false;
}
m_wordLen += it.appendchartostring(m_span);
break;
}
@ -746,10 +754,12 @@ class myTextSplit : public TextSplit {
}
};
static string teststring =
"Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n"
"\"Jean-Francois Dockes\" <jfd@okyz.com>\n"
"n@d @net .net t@v@c c# c++ o'brien 'o'brien' l'ami\n"
"data123\n"
"134 +134 -14 -1.5 +1.5 1.54e10 1,2 1,2e30\n"
"@^#$(#$(*)\n"
"192.168.4.1 one\n\rtwo\r"
@ -762,6 +772,7 @@ static string teststring =
" -wl,--export-dynamic "
" ~/.xsession-errors "
;
static string teststring1 = " nouvel-an ";
static string thisprog;
@ -771,6 +782,7 @@ static string usage =
" -S: no output\n"
" -s: only spans\n"
" -w: only words\n"
" -n: no numbers\n"
" -k: preserve wildcards (?*)\n"
" -c: just count words\n"
" -C [charset] : input charset\n"
@ -792,6 +804,7 @@ static int op_flags;
#define OPT_c 0x8
#define OPT_k 0x10
#define OPT_C 0x20
#define OPT_n 0x40
int main(int argc, char **argv)
{
@ -811,6 +824,7 @@ int main(int argc, char **argv)
charset = *(++argv); argc--;
goto b1;
case 'k': op_flags |= OPT_k; break;
case 'n': op_flags |= OPT_n; break;
case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; break;
case 'w': op_flags |= OPT_w; break;
@ -829,6 +843,8 @@ int main(int argc, char **argv)
flags = TextSplit::TXTS_NOSPANS;
if (op_flags & OPT_k)
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
if (op_flags & OPT_n)
TextSplit::noNumbers();
string odata, reason;
if (argc == 1) {

View File

@ -37,7 +37,8 @@ class Utf8Iter;
class TextSplit {
public:
// Should we activate special processing of Chinese characters ? This
// needs a little more cpu, so it can be turned off globally.
// needs a little more cpu, so it can be turned off globally. This is set
// by rclconfig, changing it means reindexing
static bool o_processCJK;
static unsigned int o_CJKNgramLen;
static const unsigned int o_CJKMaxNgramLen = 5;
@ -48,6 +49,13 @@ public:
ngramlen : o_CJKMaxNgramLen;
}
// When true, no terms are generated for numbers (the default is false,
// meaning that numbers are indexed). Set by rclconfig from the "nonumbers"
// configuration parameter. Changing the value needs a full reindex.
static bool o_noNumbers;
// Turn off term generation for numbers. This only ever sets the flag to
// true: it is a global (static) setting shared by all TextSplit objects.
static void noNumbers()
{
o_noNumbers = true;
}
enum Flags {TXTS_NONE = 0,
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2001, 2003 The FreeBSD Documentation Project
* Copyright (c) 2001, 2003, 2010 The FreeBSD Documentation Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -23,7 +23,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: doc/share/misc/docbook.css,v 1.14 2008/11/21 07:28:34 keramida Exp $
* $FreeBSD: doc/share/misc/docbook.css,v 1.15 2010/03/20 04:15:01 hrs Exp $
*/
BODY ADDRESS {
@ -201,3 +201,8 @@ BLOCKQUOTE.WARNING {
padding: 0.2em 2em;
width: 90%;
}
/* Add horizontal padding to the cells of CALSTABLE tables which are
   nested inside an INFORMALTABLE element. */
.INFORMALTABLE TABLE.CALSTABLE TR TD {
padding-left: 1em;
padding-right: 1em;
}

View File

@ -1,8 +1,8 @@
<!DOCTYPE BOOK PUBLIC "-//FreeBSD//DTD DocBook V4.1-Based Extension//EN" [
<!ENTITY RCL "<application>Recoll</application>">
<!ENTITY RCLVERSION "1.12-1.13">
<!ENTITY XAP "<application>Xapian</application>">
]>
<book lang="en">
@ -10,7 +10,6 @@
<bookinfo>
<title>Recoll user manual</title>
<author>
<firstname>Jean-Francois</firstname>
<surname>Dockes</surname>
@ -41,7 +40,7 @@
<sect1 id="rcl.introduction.tryit">
<title>Giving it a try</title>
<para>If you do not like reading manuals (who does?) and would
like to give &RCL; a try, just perform <link
linkend="rcl.install.binary">installation</link> and start the
@ -2956,8 +2955,14 @@ while query.next >= 0 and query.next < nres:
configuration, click <guimenu>Cancel</guimenu>, and edit
the configuration file before restarting the command. This
will start the initial indexing, which may take some time.</para>
<para>Parameters affecting what we index:</para>
<para>Most of the following parameters can be changed from the
<guilabel>Index Configuration</guilabel> menu in the
<command>recoll</command> interface. Some can only be set by
editing the configuration file.</para>
<sect3 id="rcl.install.config.recollconf.files">
<title>Parameters affecting what documents we index:</title>
<variablelist>
@ -3116,9 +3121,95 @@ skippedPaths = ~/somedir/&lowast;.txt
</varlistentry>
</variablelist>
</sect3>
<sect3 id="rcl.install.config.recollconf.terms">
<title>Parameters affecting how we generate terms:</title>
<para>Parameters affecting where and how we store things:</para>
<para>Changing some of these parameters will require a full
reindex. Also, when using multiple indexes, it may not make sense
to search indexes that don't share the values for these parameters,
because they usually affect both search and index operations.</para>
<variablelist>
<varlistentry><term><literal>nonumbers</literal></term>
<listitem><para>If this is set to true, no terms will be generated
for numbers. For example "123", "1.5e6" and "192.168.1.4" would not
be indexed ("value123" would still be). Numbers are often quite
interesting to search for, and this should probably not be set
except for special situations, e.g. scientific documents with huge
amounts of numbers in them.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>nocjk</literal></term>
<listitem><para>If this is set to true, specific East Asian
(Chinese, Korean, Japanese) character/word splitting is
turned off. This will save a small amount of cpu if you
have no CJK documents. If your document base does include
such text but you are not interested in searching it,
setting <literal>nocjk</literal> may be a significant time
and space saver.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>cjkngramlen</literal></term>
<listitem><para>This lets you adjust the size of n-grams
used for indexing CJK text. The default value of 2 is
probably appropriate in most cases. A value of 3 would
allow more precision and efficiency on longer words, but
the index will be approximately twice as large.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>indexstemminglanguages</literal></term>
<listitem><para>A list of languages for which the stem
expansion databases will be built. See recollindex(1) or
use the <literal>recollindex -l</literal> command for
possible values. You can add a stem expansion database for
a different language by using <command>recollindex
-s</command>, but it will be deleted during the next
indexing. Only languages listed in the configuration
file are permanent.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>defaultcharset</literal></term>
<listitem><para>The name of the character set used for
files that do not contain a character set definition (ie:
plain text files). This can be redefined for any
sub-directory. If it is not set at all, the character set
used is the one defined by the nls environment (LC_ALL,
LC_CTYPE, LANG), or iso8859-1 if nothing is set.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>maildefcharset</literal></term>
<listitem><para>This can be used to define the default
character set specifically for mail messages which don't
specify it. This is mainly useful for readpst (libpst) dumps,
which are utf-8 but do not say so.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>localfields</literal></term>
<listitem><para>This allows setting fields for all documents
under a given directory. Typical usage would be to set an
"rclaptg" field, to be used in <filename>mimeview</filename> to
select a specific viewer. If several fields are to be set, they
should be separated with a colon (':') character (which there
is currently no way to escape). Ie:
<literal>localfields= rclaptg=gnus:other = val</literal>, then
select the specific viewer with
<literal>mimetype|tag=...</literal> in
<filename>mimeview</filename>.</para>
</listitem>
</varlistentry>
</variablelist>
</sect3>
<sect3 id="rcl.install.config.recollconf.storage">
<title>Parameters affecting where and how we store things:</title>
<variablelist>
<varlistentry><term><literal>dbdir</literal></term>
@ -3181,8 +3272,11 @@ skippedPaths = ~/somedir/&lowast;.txt
</varlistentry>
</variablelist>
</sect3>
<para>Miscellani:</para>
<sect3 id="rcl.install.config.recollconf.misc">
<title>Miscellaneous parameters:</title>
<variablelist>
@ -3204,57 +3298,12 @@ skippedPaths = ~/somedir/&lowast;.txt
</listitem>
</varlistentry>
<varlistentry><term><literal>indexstemminglanguages</literal></term>
<listitem><para>A list of languages for which the stem
expansion databases will be built. See recollindex(1) or
use the <literal>recollindex -l</literal> command for
possible values. You can add a stem expansion database for
a different language by using <command>recollindex
-s</command>, but it will be deleted during the next
indexing. Only languages listed in the configuration
file are permanent.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>defaultcharset</literal></term>
<listitem><para>The name of the character set used for
files that do not contain a character set definition (ie:
plain text files). This can be redefined for any
sub-directory. If it is not set at all, the character set
used is the one defined by the nls environment (LC_ALL,
LC_CTYPE, LANG), or iso8859-1 if nothing is set.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>filtermaxseconds</literal></term>
<listitem><para>Maximum filter execution time, after which it
is aborted. Some postscript programs just loop...</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>maildefcharset</literal></term>
<listitem><para>This can be used to define the default
character set specifically for mail messages which don't
specify it. This is mainly useful for readpst (libpst) dumps,
which are utf-8 but do not say so.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>localfields</literal></term>
<listitem><para>This allows setting fields for all
documents under a given directory. Typical usage would be
to set an "rclaptg" field, to be used in
<filename>mimeview</filename> to select a specific
viewer. If several fields are to be set, they should be
separated with a ':' character (which there is currently no way to
escape). Ie:
<literal>localfields= rclaptg=gnus:other = val</literal>, then
select specifier viewer with
<literal>mimetype|tag=...</literal> in
<filename>mimeview</filename>.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>filtersdir</literal></term>
<listitem><para>A directory to search for the external
filter scripts used to index some types of files. The
@ -3309,26 +3358,6 @@ skippedPaths = ~/somedir/&lowast;.txt
</listitem>
</varlistentry>
<varlistentry><term><literal>nocjk</literal></term>
<listitem><para>If this set to true, specific east asian
(Chinese Korean Japanese) characters/word splitting is
turned off. This will save a small amount of cpu if you
have no CJK documents. If your document base does include
such text but you are not interested in searching it,
setting <literal>nocjk</literal> may be a significant time
and space saver.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>cjkngramlen</literal></term>
<listitem><para>This lets you adjust the size of n-grams
used for indexing CJK text. The default value of 2 is
probably appropriate in most cases. A value of 3 would
allow more precision and efficiency on longer words, but
the index will be approximately twice as large.</para>
</listitem>
</varlistentry>
<varlistentry><term><literal>guesscharset</literal></term>
<listitem><para>Decide if we try to guess the character
set of files if no internal value is available (ie: for

View File

@ -82,7 +82,10 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
m_reason = (*it)->getReason();
return false;
}
if (nq.empty()) {
LOGDEB(("SearchData::toNativeQuery: skipping empty clause\n"));
continue;
}
// If this structure is an AND list, must use AND_NOT for excl clauses.
// Else this is an OR list, and there can't be excl clauses (checked by
// addClause())
@ -191,10 +194,11 @@ void SearchData::getUTerms(vector<string>& terms) const
class TextSplitQ : public TextSplit {
public:
TextSplitQ(Flags flags, const StopList &_stops)
: TextSplit(flags), stops(_stops), alltermcount(0)
: TextSplit(flags), stops(_stops), alltermcount(0), lastpos(0)
{}
bool takeword(const std::string &interm, int , int, int) {
bool takeword(const std::string &interm, int pos, int, int) {
alltermcount++;
lastpos = pos;
LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
// Check if the first letter is a majuscule in which
@ -233,6 +237,7 @@ class TextSplitQ : public TextSplit {
// Count of terms including stopwords: this is for adjusting
// phrase/near slack
int alltermcount;
int lastpos;
};
// A class used to translate a user compound string (*not* a query
@ -456,8 +461,10 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
// Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
splitData->alltermcount, splitData->lastpos));
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
splitData->alltermcount + slack);
splitData->lastpos + 1 + slack);
if (op == Xapian::Query::OP_PHRASE)
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
original_term_wqf_booster);
@ -611,7 +618,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
return false;
if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
return false;
return true;
}
tr.getTerms(m_terms, m_groups);
tr.getUTerms(m_uterms);