diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index c2149757..2719ee1d 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -203,6 +203,12 @@ bool RclConfig::updateMainConfig() TextSplit::cjkProcessing(true); } } + + bool nonum = false; + if (getConfParam("nonumbers", &nonum) && nonum == true) { + TextSplit::noNumbers(); + } + m_skpnstate.init(this, m_conf, "skippedNames"); m_rmtstate.init(this, m_conf, "indexedmimetypes"); return true; diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 0b91ad6c..73666b3f 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -164,6 +164,7 @@ bool TextSplit::isCJK(int c) bool TextSplit::o_processCJK = true; unsigned int TextSplit::o_CJKNgramLen = 2; +bool TextSplit::o_noNumbers = false; // Do some checking (the kind which is simpler to do here than in the // main loop), then send term to our client. @@ -212,12 +213,15 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos, */ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) { - LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d\n", - span.c_str(), spanpos, wordStart, wordLen, spanerase, bp)); + LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d " + "innum %d\n", m_span.c_str(), m_spanpos, m_wordStart, + m_wordLen, spanerase, bp, m_inNumber)); // Emit span. When splitting for query, we only emit final spans bool spanemitted = false; if (!(m_flags & TXTS_NOSPANS) && + !((m_wordLen == m_span.length()) && + (o_noNumbers) && m_inNumber) && ((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) { // Maybe trim at end. 
These are chars that we would keep inside // a span, but not at the end @@ -243,6 +247,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) // Emit word if different from span and not 'no words' mode if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen && + !(o_noNumbers && m_inNumber) && (!spanemitted || m_wordLen != m_span.length())) { string s(m_span.substr(m_wordStart, m_wordLen)); if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp)) @@ -494,6 +499,9 @@ bool TextSplit::text_to_words(const string &in) default: NORMALCHAR: + if (m_inNumber && c != 'e' && c != 'E') { + m_inNumber = false; + } m_wordLen += it.appendchartostring(m_span); break; } @@ -746,10 +754,12 @@ class myTextSplit : public TextSplit { } }; + static string teststring = "Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n" "\"Jean-Francois Dockes\" \n" "n@d @net .net t@v@c c# c++ o'brien 'o'brien' l'ami\n" + "data123\n" "134 +134 -14 -1.5 +1.5 1.54e10 1,2 1,2e30\n" "@^#$(#$(*)\n" "192.168.4.1 one\n\rtwo\r" @@ -762,6 +772,7 @@ static string teststring = " -wl,--export-dynamic " " ~/.xsession-errors " ; + static string teststring1 = " nouvel-an "; static string thisprog; @@ -771,6 +782,7 @@ static string usage = " -S: no output\n" " -s: only spans\n" " -w: only words\n" + " -n: no numbers\n" " -k: preserve wildcards (?*)\n" " -c: just count words\n" " -C [charset] : input charset\n" @@ -792,6 +804,7 @@ static int op_flags; #define OPT_c 0x8 #define OPT_k 0x10 #define OPT_C 0x20 +#define OPT_n 0x40 int main(int argc, char **argv) { @@ -811,6 +824,7 @@ int main(int argc, char **argv) charset = *(++argv); argc--; goto b1; case 'k': op_flags |= OPT_k; break; + case 'n': op_flags |= OPT_n; break; case 's': op_flags |= OPT_s; break; case 'S': op_flags |= OPT_S; break; case 'w': op_flags |= OPT_w; break; @@ -829,6 +843,8 @@ int main(int argc, char **argv) flags = TextSplit::TXTS_NOSPANS; if (op_flags & OPT_k) flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); + if (op_flags 
& OPT_n) + TextSplit::noNumbers(); string odata, reason; if (argc == 1) { diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 7f2aa47a..8b152ee3 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -37,7 +37,8 @@ class Utf8Iter; class TextSplit { public: // Should we activate special processing of Chinese characters ? This - // needs a little more cpu, so it can be turned off globally. + // needs a little more cpu, so it can be turned off globally. This is set + // by rclconfig, changing it means reindexing static bool o_processCJK; static unsigned int o_CJKNgramLen; static const unsigned int o_CJKMaxNgramLen = 5; @@ -48,6 +49,13 @@ public: ngramlen : o_CJKMaxNgramLen; } + // Are we indexing numbers ? Set by rclconfig. Change needs reindex + static bool o_noNumbers; + static void noNumbers() + { + o_noNumbers = true; + } + enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com) TXTS_NOSPANS = 2, // Only return atomic words (a, b, com) diff --git a/src/doc/user/docbook.css b/src/doc/user/docbook.css index 55a9fa43..3d40fa70 100644 --- a/src/doc/user/docbook.css +++ b/src/doc/user/docbook.css @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2003 The FreeBSD Documentation Project + * Copyright (c) 2001, 2003, 2010 The FreeBSD Documentation Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: doc/share/misc/docbook.css,v 1.14 2008/11/21 07:28:34 keramida Exp $ + * $FreeBSD: doc/share/misc/docbook.css,v 1.15 2010/03/20 04:15:01 hrs Exp $ */ BODY ADDRESS { @@ -201,3 +201,8 @@ BLOCKQUOTE.WARNING { padding: 0.2em 2em; width: 90%; } + +.INFORMALTABLE TABLE.CALSTABLE TR TD { + padding-left: 1em; + padding-right: 1em; +} diff --git a/src/doc/user/usermanual.sgml b/src/doc/user/usermanual.sgml index 19f4df6e..6cd26572 100644 --- a/src/doc/user/usermanual.sgml +++ b/src/doc/user/usermanual.sgml @@ -1,8 +1,8 @@ Recoll"> Xapian"> - ]> @@ -10,7 +10,6 @@ Recoll user manual - Jean-Francois Dockes @@ -41,7 +40,7 @@ Giving it a try - + If you do not like reading manuals (who does?) and would like to give &RCL; a try, just perform installation and start the @@ -2956,8 +2955,14 @@ while query.next >= 0 and query.next < nres: configuration, click Cancel, and edit the configuration file before restarting the command. This will start the initial indexing, which may take some time. - - Paramers affecting what we index: + + Most of the following parameters can be changed from the + Index Configuration menu in the + recoll interface. Some can only be set by + editing the configuration file. + + + Parameters affecting what documents we index: @@ -3116,9 +3121,95 @@ skippedPaths = ~/somedir/∗.txt + + + Parameters affecting how we generate terms: - Parameters affecting where and how we store things: + Changing some of these parameters will require a full + reindex. Also, when using multiple indexes, it may not make sense + to search indexes that don't share the values for these parameters, + because they usually affect both search and index operations. + + + + nonumbers + If this is set to true, no terms will be generated + for numbers. For example "123", "1.5e6", "192.168.1.4", would not + be indexed ("value123" would still be). 
Numbers are often quite + interesting to search for, and this should probably not be set + except for special situations, e.g. scientific documents with huge + amounts of numbers in them. + + + + nocjk + If this is set to true, specific East Asian + (Chinese, Korean, Japanese) character/word splitting is + turned off. This will save a small amount of CPU if you + have no CJK documents. If your document base does include + such text but you are not interested in searching it, + setting nocjk may be a significant time + and space saver. + + + + cjkngramlen + This lets you adjust the size of n-grams + used for indexing CJK text. The default value of 2 is + probably appropriate in most cases. A value of 3 would + allow more precision and efficiency on longer words, but + the index will be approximately twice as large. + + + indexstemminglanguages + A list of languages for which the stem + expansion databases will be built. See recollindex(1) or + use the recollindex -l command for + possible values. You can add a stem expansion database for + a different language by using recollindex + -s, but it will be deleted during the next + indexing. Only languages listed in the configuration + file are permanent. + + + + defaultcharset + The name of the character set used for + files that do not contain a character set definition (e.g. + plain text files). This can be redefined for any + sub-directory. If it is not set at all, the character set + used is the one defined by the NLS environment (LC_ALL, + LC_CTYPE, LANG), or iso8859-1 if nothing is set. + + + + maildefcharset + This can be used to define the default + character set specifically for mail messages which don't + specify it. This is mainly useful for readpst (libpst) dumps, + which are utf-8 but do not say so. + + + + localfields + This allows setting fields for all documents + under a given directory. Typical usage would be to set an + "rclaptg" field, to be used in mimeview to + select a specific viewer. 
If several fields are to be set, they + should be separated with a colon (':') character (which there + is currently no way to escape). E.g.: + localfields= rclaptg=gnus:other = val, then + select the specific viewer with + mimetype|tag=... in + mimeview. + + + + + + + Parameters affecting where and how we store things: dbdir @@ -3181,8 +3272,11 @@ skippedPaths = ~/somedir/∗.txt + - Miscellani: + + + Miscellaneous parameters: @@ -3204,57 +3298,12 @@ skippedPaths = ~/somedir/∗.txt - indexstemminglanguages - A list of languages for which the stem - expansion databases will be built. See recollindex(1) or - use the recollindex -l command for - possible values. You can add a stem expansion database for - a different language by using recollindex - -s, but it will be deleted during the next - indexing. Only languages listed in the configuration - file are permanent. - - - - defaultcharset - The name of the character set used for - files that do not contain a character set definition (ie: - plain text files). This can be redefined for any - sub-directory. If it is not set at all, the character set - used is the one defined by the nls environment (LC_ALL, - LC_CTYPE, LANG), or iso8859-1 if nothing is set. - - - filtermaxseconds Maximum filter execution time, after which it is aborted. Some postscript programs just loop... - maildefcharset - This can be used to define the default - character set specifically for mail messages which don't - specify it. This is mainly useful for readpst (libpst) dumps, - which are utf-8 but do not say so. - - - - localfields - This allows setting fields for all - documents under a given directory. Typical usage would be - to set an "rclaptg" field, to be used in - mimeview to select a specific - viewer. If several fields are to be set, they should be - separated with a ':' character (which there is currently no way to - escape). Ie: - localfields= rclaptg=gnus:other = val, then - select specifier viewer with - mimetype|tag=... in - mimeview. 
- - - filtersdir A directory to search for the external filter scripts used to index some types of files. The @@ -3309,26 +3358,6 @@ skippedPaths = ~/somedir/∗.txt - nocjk - If this set to true, specific east asian - (Chinese Korean Japanese) characters/word splitting is - turned off. This will save a small amount of cpu if you - have no CJK documents. If your document base does include - such text but you are not interested in searching it, - setting nocjk may be a significant time - and space saver. - - - - cjkngramlen - This lets you adjust the size of n-grams - used for indexing CJK text. The default value of 2 is - probably appropriate in most cases. A value of 3 would - allow more precision and efficiency on longer words, but - the index will be approximately twice as large. - - - guesscharset Decide if we try to guess the character set of files if no internal value is available (ie: for diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 88d239ba..d3f8cca1 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -82,7 +82,10 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d) m_reason = (*it)->getReason(); return false; } - + if (nq.empty()) { + LOGDEB(("SearchData::toNativeQuery: skipping empty clause\n")); + continue; + } // If this structure is an AND list, must use AND_NOT for excl clauses. 
// Else this is an OR list, and there can't be excl clauses (checked by // addClause()) @@ -191,10 +194,11 @@ void SearchData::getUTerms(vector& terms) const class TextSplitQ : public TextSplit { public: TextSplitQ(Flags flags, const StopList &_stops) - : TextSplit(flags), stops(_stops), alltermcount(0) + : TextSplit(flags), stops(_stops), alltermcount(0), lastpos(0) {} - bool takeword(const std::string &interm, int , int, int) { + bool takeword(const std::string &interm, int pos, int, int) { alltermcount++; + lastpos = pos; /* remember the position of the latest term: processPhraseOrNear() passes lastpos + 1 + slack as the last Xapian::Query argument */ LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str())); // Check if the first letter is a majuscule in which @@ -233,6 +237,7 @@ class TextSplitQ : public TextSplit { // Count of terms including stopwords: this is for adjusting // phrase/near slack int alltermcount; + int lastpos; /* position of the last term passed to takeword() */ }; // A class used to translate a user compound string (*not* a query @@ -456,8 +461,10 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, // Generate an appropriate PHRASE/NEAR query with adjusted slack // For phrases, give a relevance boost like we do for original terms + LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n", + splitData->alltermcount, splitData->lastpos)); Xapian::Query xq(op, orqueries.begin(), orqueries.end(), - splitData->alltermcount + slack); + splitData->lastpos + 1 + slack); if (op == Xapian::Query::OP_PHRASE) xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, original_term_wqf_booster); @@ -611,7 +618,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, return false; if (pqueries.empty()) { LOGERR(("SearchDataClauseSimple: resolved to null query\n")); - return false; + return true; } tr.getTerms(m_terms, m_groups); tr.getUTerms(m_uterms);