Added option nonumbers not to generate terms for numbers. closes #16

2010-05-05 10:18:56 +02:00 · 2010-05-05 10:18:56 +02:00 · 48358c8252
commit 48358c8252
parent d0e56e361f
6 changed files with 153 additions and 82 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -203,6 +203,12 @@ bool RclConfig::updateMainConfig()
 	    TextSplit::cjkProcessing(true);
 	}
    }
+
+    bool nonum = false;
+    if (getConfParam("nonumbers", &nonum) && nonum == true) {
+	TextSplit::noNumbers();
+    }
+
    m_skpnstate.init(this, m_conf, "skippedNames");
    m_rmtstate.init(this, m_conf, "indexedmimetypes");
    return true;
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -164,6 +164,7 @@ bool TextSplit::isCJK(int c)

 bool          TextSplit::o_processCJK = true;
 unsigned int  TextSplit::o_CJKNgramLen = 2;
+bool          TextSplit::o_noNumbers = false;

 // Do some checking (the kind which is simpler to do here than in the
 // main loop), then send term to our client.
@ -212,12 +213,15 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
 */
 inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
 {
-    LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d\n",
-	     span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
+    LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d "
+             "innum %d\n", m_span.c_str(), m_spanpos, m_wordStart, 
+             m_wordLen, spanerase, bp, m_inNumber));

    // Emit span. When splitting for query, we only emit final spans
    bool spanemitted = false;
    if (!(m_flags & TXTS_NOSPANS) && 
+        !((m_wordLen == m_span.length()) && 
+          (o_noNumbers) && m_inNumber) &&
 	((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
 	// Maybe trim at end. These are chars that we would keep inside 
 	// a span, but not at the end
@ -243,6 +247,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)

    // Emit word if different from span and not 'no words' mode
    if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen && 
+        !(o_noNumbers && m_inNumber) &&
 	(!spanemitted || m_wordLen != m_span.length())) {
 	string s(m_span.substr(m_wordStart, m_wordLen));
 	if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
@ -494,6 +499,9 @@ bool TextSplit::text_to_words(const string &in)

 	default:
 	NORMALCHAR:
+            if (m_inNumber && c != 'e' && c != 'E') {
+                m_inNumber = false;
+            }
 	    m_wordLen += it.appendchartostring(m_span);
 	    break;
 	}
@ -746,10 +754,12 @@ class myTextSplit : public TextSplit {
    }
 };

+
 static string teststring = 
 	    "Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n"
 	    "\"Jean-Francois Dockes\" <jfd@okyz.com>\n"
 	    "n@d @net .net t@v@c c# c++ o'brien 'o'brien' l'ami\n"
+            "data123\n"
 	    "134 +134 -14 -1.5 +1.5 1.54e10 1,2 1,2e30\n"
 	    "@^#$(#$(*)\n"
 	    "192.168.4.1 one\n\rtwo\r"
@ -762,6 +772,7 @@ static string teststring =
 	    " -wl,--export-dynamic "
 	    " ~/.xsession-errors "
 ;
+
 static string teststring1 = " nouvel-an ";

 static string thisprog;
@ -771,6 +782,7 @@ static string usage =
    "   -S: no output\n"
    "   -s:  only spans\n"
    "   -w:  only words\n"
+    "   -n:  no numbers\n"
    "   -k:  preserve wildcards (?*)\n"
    "   -c: just count words\n"
    "   -C [charset] : input charset\n"
@ -792,6 +804,7 @@ static int        op_flags;
 #define OPT_c     0x8
 #define OPT_k     0x10
 #define OPT_C     0x20
+#define OPT_n     0x40

 int main(int argc, char **argv)
 {
@ -811,6 +824,7 @@ int main(int argc, char **argv)
                charset = *(++argv); argc--; 
                goto b1;
 	    case 'k':	op_flags |= OPT_k; break;
+	    case 'n':	op_flags |= OPT_n; break;
 	    case 's':	op_flags |= OPT_s; break;
 	    case 'S':	op_flags |= OPT_S; break;
 	    case 'w':	op_flags |= OPT_w; break;
@ -829,6 +843,8 @@ int main(int argc, char **argv)
 	flags = TextSplit::TXTS_NOSPANS;
    if (op_flags & OPT_k) 
 	flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); 
+    if (op_flags & OPT_n)
+	TextSplit::noNumbers();

    string odata, reason;
    if (argc == 1) {
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -37,7 +37,8 @@ class Utf8Iter;
 class TextSplit {
 public:
    // Should we activate special processing of Chinese characters ? This
-    // needs a little more cpu, so it can be turned off globally.
+    // needs a little more cpu, so it can be turned off globally. This is set
+    // by rclconfig; changing it requires reindexing
    static bool o_processCJK;
    static unsigned int  o_CJKNgramLen;
    static const unsigned int o_CJKMaxNgramLen =  5;
@ -48,6 +49,13 @@ public:
 	    ngramlen : o_CJKMaxNgramLen;
    }

+    // If set, emit no terms for numbers. Set by rclconfig. Change needs reindex
+    static bool o_noNumbers;
+    static void noNumbers()
+    {
+	o_noNumbers = true;
+    }
+
    enum Flags {TXTS_NONE = 0, 
 		TXTS_ONLYSPANS = 1,  // Only return maximum spans (a@b.com) 
 		TXTS_NOSPANS = 2,  // Only return atomic words (a, b, com)
--- a/src/doc/user/docbook.css
+++ b/src/doc/user/docbook.css
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2003 The FreeBSD Documentation Project
+ * Copyright (c) 2001, 2003, 2010 The FreeBSD Documentation Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@ -23,7 +23,7 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * $FreeBSD: doc/share/misc/docbook.css,v 1.14 2008/11/21 07:28:34 keramida Exp $
+ * $FreeBSD: doc/share/misc/docbook.css,v 1.15 2010/03/20 04:15:01 hrs Exp $
 */

 BODY ADDRESS {
@ -201,3 +201,8 @@ BLOCKQUOTE.WARNING {
 	padding: 0.2em 2em;
 	width: 90%;
 }
+
+.INFORMALTABLE TABLE.CALSTABLE TR TD {
+        padding-left: 1em;
+        padding-right: 1em;
+}
--- a/src/doc/user/usermanual.sgml
+++ b/src/doc/user/usermanual.sgml
@ -1,8 +1,8 @@
 <!DOCTYPE BOOK PUBLIC "-//FreeBSD//DTD DocBook V4.1-Based Extension//EN" [
+
 <!ENTITY RCL "<application>Recoll</application>">
 <!ENTITY RCLVERSION "1.12-1.13">
 <!ENTITY XAP "<application>Xapian</application>">
-
 ]>
 
 <book lang="en">
@ -10,7 +10,6 @@
  <bookinfo>
    <title>Recoll user manual</title>

-
    <author>
      <firstname>Jean-Francois</firstname>
      <surname>Dockes</surname>
@ -41,7 +40,7 @@

    <sect1 id="rcl.introduction.tryit">
      <title>Giving it a try</title>
-
+      
      <para>If you do not like reading manuals (who does?) and would
      like to give &RCL; a try, just perform <link
      linkend="rcl.install.binary">installation</link> and start the
@ -2956,8 +2955,14 @@ while query.next >= 0 and query.next < nres:
         configuration, click <guimenu>Cancel</guimenu>, and edit
         the configuration file before restarting the command. This
         will start the initial indexing, which may take some time.</para>
-        
-        <para>Paramers affecting what we index:</para>
+
+        <para>Most of the following parameters can be changed from the
+        <guilabel>Index Configuration</guilabel> menu in the
+        <command>recoll</command> interface. Some can only be set by
+        editing the configuration file.</para>
+
+        <sect3 id="rcl.install.config.recollconf.files">
+          <title>Parameters affecting what documents we index:</title>

        <variablelist>

@ -3116,9 +3121,95 @@ skippedPaths = ~/somedir/&lowast;.txt
           </varlistentry>

        </variablelist>
+       </sect3>

+       <sect3 id="rcl.install.config.recollconf.terms">
+	<title>Parameters affecting how we generate terms:</title>

-	<para>Parameters affecting where and how we store things:</para>
+        <para>Changing some of these parameters will require a full
+        reindex. Also, when using multiple indexes, it may not make sense
+        to search indexes that don't share the values for these parameters,
+        because they usually affect both search and index operations.</para>
+
+        <variablelist>
+
+          <varlistentry><term><literal>nonumbers</literal></term>
+            <listitem><para>If this is set to true, no terms will be generated
+            for numbers. For example "123", "1.5e6", "192.168.1.4", would not
+            be indexed ("value123" would still be). Numbers are often quite
+            interesting to search for, and this should probably not be set
+            except for special situations, e.g. scientific documents with huge
+            amounts of numbers in them.</para>
+            </listitem>
+          </varlistentry>
+
+          <varlistentry><term><literal>nocjk</literal></term>
+            <listitem><para>If this is set to true, specific east asian
+            (Chinese, Korean, Japanese) characters/word splitting is
+            turned off. This will save a small amount of cpu if you
+            have no CJK documents. If your document base does include
+            such text but you are not interested in searching it,
+            setting <literal>nocjk</literal> may be a significant time
+            and space saver.</para>
+            </listitem>
+          </varlistentry>
+
+          <varlistentry><term><literal>cjkngramlen</literal></term>
+            <listitem><para>This lets you adjust the size of n-grams
+            used for indexing CJK text. The default value of 2 is
+            probably appropriate in most cases. A value of 3 would
+            allow more precision and efficiency on longer words, but
+            the index will be approximately twice as large.</para>
+            </listitem>
+          </varlistentry>
+          <varlistentry><term><literal>indexstemminglanguages</literal></term>
+            <listitem><para>A list of languages for which the stem
+            expansion databases will be built. See recollindex(1) or
+            use the <literal>recollindex -l</literal> command for
+            possible values. You can add a stem expansion database for
+            a different language by using <command>recollindex
+            -s</command>, but it will be deleted during the next
+            indexing. Only languages listed in the configuration
+            file are permanent.</para>
+            </listitem>
+          </varlistentry>
+         
+          <varlistentry><term><literal>defaultcharset</literal></term>
+            <listitem><para>The name of the character set used for
+            files that do not contain a character set definition (ie:
+            plain text files). This can be redefined for any
+            sub-directory. If it is not set at all, the character set
+            used is the one defined by the nls environment (LC_ALL,
+            LC_CTYPE, LANG), or iso8859-1 if nothing is set.</para> 
+	   </listitem>
+         </varlistentry>
+
+          <varlistentry><term><literal>maildefcharset</literal></term>
+            <listitem><para>This can be used to define the default
+		character set specifically for mail messages which don't
+		specify it. This is mainly useful for readpst (libpst) dumps,
+		which are utf-8 but do not say so.</para>
+            </listitem>
+          </varlistentry>
+
+          <varlistentry><term><literal>localfields</literal></term>
+            <listitem><para>This allows setting fields for all documents
+            under a given directory. Typical usage would be to set an
+            "rclaptg" field, to be used in <filename>mimeview</filename> to
+            select a specific viewer. If several fields are to be set, they
+            should be separated with a colon (':') character (which there
+            is currently no way to escape). Ie:
+		<literal>localfields= rclaptg=gnus:other = val</literal>, then
+		select the specific viewer with
+		<literal>mimetype|tag=...</literal> in
+		<filename>mimeview</filename>.</para>  
+            </listitem>
+           </varlistentry>
+        </variablelist>
+       </sect3>
+
+       <sect3 id="rcl.install.config.recollconf.storage">
+	<title>Parameters affecting where and how we store things:</title>

 	<variablelist>
          <varlistentry><term><literal>dbdir</literal></term>
@ -3181,8 +3272,11 @@ skippedPaths = ~/somedir/&lowast;.txt
          </varlistentry>

        </variablelist>
+       </sect3>

-	<para>Miscellani:</para>
+
+       <sect3 id="rcl.install.config.recollconf.misc">
+	<title>Miscellaneous parameters:</title>

 	 <variablelist>

@ -3204,57 +3298,12 @@ skippedPaths = ~/somedir/&lowast;.txt
            </listitem>
          </varlistentry>

-          <varlistentry><term><literal>indexstemminglanguages</literal></term>
-            <listitem><para>A list of languages for which the stem
-            expansion databases will be built. See recollindex(1) or
-            use the <literal>recollindex -l</literal> command for
-            possible values. You can add a stem expansion database for
-            a different language by using <command>recollindex
-            -s</command>, but it will be deleted during the next
-            indexing. Only languages listed in the configuration
-            file are permanent.</para>
-            </listitem>
-          </varlistentry>
-         
-          <varlistentry><term><literal>defaultcharset</literal></term>
-            <listitem><para>The name of the character set used for
-            files that do not contain a character set definition (ie:
-            plain text files). This can be redefined for any
-            sub-directory. If it is not set at all, the character set
-            used is the one defined by the nls environment (LC_ALL,
-            LC_CTYPE, LANG), or iso8859-1 if nothing is set.</para> 
-	   </listitem>
-         </varlistentry>
-
          <varlistentry><term><literal>filtermaxseconds</literal></term>
            <listitem><para>Maximum filter execution time, after which it
            is aborted. Some postscript programs just loop...</para> 
            </listitem>
           </varlistentry>

-          <varlistentry><term><literal>maildefcharset</literal></term>
-            <listitem><para>This can be used to define the default
-		character set specifically for mail messages which don't
-		specify it. This is mainly useful for readpst (libpst) dumps,
-		which are utf-8 but do not say so.</para>
-            </listitem>
-          </varlistentry>
-
-          <varlistentry><term><literal>localfields</literal></term>
-            <listitem><para>This allows setting fields for all
-            documents under a given directory. Typical usage would be
-            to set an "rclaptg" field, to be used in
-            <filename>mimeview</filename> to select a specific
-            viewer. If several fields are to be set, they should be
-            separated with a ':' character (which there is currently no way to
-            escape). Ie:
-		<literal>localfields= rclaptg=gnus:other = val</literal>, then
-		select specifier viewer with
-		<literal>mimetype|tag=...</literal> in
-		<filename>mimeview</filename>.</para>  
-            </listitem>
-           </varlistentry>
-
          <varlistentry><term><literal>filtersdir</literal></term>
            <listitem><para>A directory to search for the external
            filter scripts used to index some types of files. The
@ -3309,26 +3358,6 @@ skippedPaths = ~/somedir/&lowast;.txt
            </listitem>
          </varlistentry>

-          <varlistentry><term><literal>nocjk</literal></term>
-            <listitem><para>If this set to true, specific east asian
-            (Chinese Korean Japanese) characters/word splitting is
-            turned off. This will save a small amount of cpu if you
-            have no CJK documents. If your document base does include
-            such text but you are not interested in searching it,
-            setting <literal>nocjk</literal> may be a significant time
-            and space saver.</para>
-            </listitem>
-          </varlistentry>
-
-          <varlistentry><term><literal>cjkngramlen</literal></term>
-            <listitem><para>This lets you adjust the size of n-grams
-            used for indexing CJK text. The default value of 2 is
-            probably appropriate in most cases. A value of 3 would
-            allow more precision and efficiency on longer words, but
-            the index will be approximately twice as large.</para>
-            </listitem>
-          </varlistentry>
-
          <varlistentry><term><literal>guesscharset</literal></term>
            <listitem><para>Decide if we try to guess the character
            set of files if no internal value is available (ie: for
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -82,7 +82,10 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
 	    m_reason = (*it)->getReason();
 	    return false;
 	}	    
-
+        if (nq.empty()) {
+            LOGDEB(("SearchData::toNativeQuery: skipping empty clause\n"));
+            continue;
+        }
 	// If this structure is an AND list, must use AND_NOT for excl clauses.
 	// Else this is an OR list, and there can't be excl clauses (checked by
 	// addClause())
@ -191,10 +194,11 @@ void SearchData::getUTerms(vector<string>& terms) const
 class TextSplitQ : public TextSplit {
 public:
    TextSplitQ(Flags flags, const StopList &_stops) 
-	: TextSplit(flags), stops(_stops), alltermcount(0)
+	: TextSplit(flags), stops(_stops), alltermcount(0), lastpos(0)
    {}
-    bool takeword(const std::string &interm, int , int, int) {
+    bool takeword(const std::string &interm, int pos, int, int) {
 	alltermcount++;
+        lastpos = pos;
 	LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));

 	// Check if the first letter is a majuscule in which
@ -233,6 +237,7 @@ class TextSplitQ : public TextSplit {
    // Count of terms including stopwords: this is for adjusting
    // phrase/near slack
    int alltermcount; 
+    int lastpos;
 };

 // A class used to translate a user compound string (*not* a query
@ -456,8 +461,10 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,

    // Generate an appropriate PHRASE/NEAR query with adjusted slack
    // For phrases, give a relevance boost like we do for original terms
+    LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n", 
+             splitData->alltermcount, splitData->lastpos));
    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
-		     splitData->alltermcount + slack);
+		     splitData->lastpos + 1 + slack);
    if (op == Xapian::Query::OP_PHRASE)
 	xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, 
 			   original_term_wqf_booster);
@ -611,7 +618,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
 	return false;
    if (pqueries.empty()) {
 	LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
-	return false;
+	return true;
    }
    tr.getTerms(m_terms, m_groups);
    tr.getUTerms(m_uterms);