From d12021b22cb40d3c40234653a0d4c6756edb9a39 Mon Sep 17 00:00:00 2001
From: dockes <none@none>
Date: Thu, 18 Jan 2007 12:09:58 +0000
Subject: [PATCH] handle wildcards in search terms

---
 src/common/textsplit.cpp | 22 ++++++++++--
 src/common/textsplit.h   |  5 +--
 src/rcldb/searchdata.cpp | 73 ++++++++++++++++++++++++++++------------
 3 files changed, 74 insertions(+), 26 deletions(-)
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 24615dc1..076d220b 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.27 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.28 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
  *   This program is free software; you can redistribute it and/or modify
@@ -52,7 +52,7 @@ using namespace std;
 // The array is actually a remnant of the original version which did no utf8
 // It could be reduced to 128, because real (over 128) utf8 chars are now 
 // handled with a set holding all the separator values.
-enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
+enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259};
 static int charclasses[256];
 
 static set<unsigned int> unicign;
@@ -76,10 +76,14 @@ static void setcharclasses()
     for (i = 0; i < strlen(blankspace); i++)
 	charclasses[int(blankspace[i])] = SPACE;
 
-    char seps[] = "!\"$%&()/<=>[\\]^{|}~:;*`?";
+    char seps[] = "!\"$%&()/<=>[\\]^{|}~:;`";
     for (i = 0; i  < strlen(seps); i++)
 	charclasses[int(seps[i])] = SPACE;
 
+    char wild[] = "*?";
+    for (i = 0; i  < strlen(wild); i++)
+	charclasses[int(wild[i])] = WILD;
+
     char special[] = ".@+-,#'\n\r";
     for (i = 0; i  < strlen(special); i++)
 	charclasses[int(special[i])] = special[i];
@@ -244,6 +248,12 @@ bool TextSplit::text_to_words(const string &in)
 		number = false;
 	    }
 	    break;
+	case WILD:
+	    if (m_flags & TXTS_KEEPWILD)
+		goto NORMALCHAR;
+	    else
+		goto SPACE;
+	    break;
 	case '-':
 	case '+':
 	    if (wordLen == 0) {
@@ -338,6 +348,7 @@ bool TextSplit::text_to_words(const string &in)
 	    break;
 
 	default:
+	NORMALCHAR:
 	    wordLen += it.appendchartostring(span);
 	    break;
 	}
@@ -426,6 +437,7 @@ static string usage =
     "   -S: no output\n"
     "   -s:  only spans\n"
     "   -w:  only words\n"
+    "   -k:  preserve wildcards (?*)\n"
     "   -c: just count words\n"
     " if filename is 'stdin', will read stdin for data (end with ^D)\n"
     "  \n\n"
@@ -443,6 +455,7 @@ static int        op_flags;
 #define OPT_w	  0x2
 #define OPT_S	  0x4
 #define OPT_c     0x8
+#define OPT_k     0x10
 
 int main(int argc, char **argv)
 {
@@ -457,6 +470,7 @@ int main(int argc, char **argv)
 	while (**argv)
 	    switch (*(*argv)++) {
 	    case 'c':	op_flags |= OPT_c; break;
+	    case 'k':	op_flags |= OPT_k; break;
 	    case 's':	op_flags |= OPT_s; break;
 	    case 'S':	op_flags |= OPT_S; break;
 	    case 'w':	op_flags |= OPT_w; break;
@@ -477,6 +491,8 @@ int main(int argc, char **argv)
 	flags = TextSplit::TXTS_ONLYSPANS;
     else if (op_flags&OPT_w)
 	flags = TextSplit::TXTS_NOSPANS;
+    if (op_flags & OPT_k) 
+	flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); 
 
     string data;
     if (argc == 1) {
diff --git a/src/common/textsplit.h b/src/common/textsplit.h
index 50bcabaf..b3da5dc3 100644
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@@ -16,7 +16,7 @@
  */
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_
-/* @(#$Id: textsplit.h,v 1.15 2006-12-08 07:11:17 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: textsplit.h,v 1.16 2007-01-18 12:09:58 dockes Exp $  (C) 2004 J.F.Dockes */
 
 #include <string>
 #ifndef NO_NAMESPACES
@@ -44,7 +44,8 @@ public:
  */
 class TextSplit {
 public:
-    enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2};
+    enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2, 
+		TXTS_KEEPWILD = 4};
     /**
      * Constructor: just store callback object
      */
diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
index d51bce3d..8daae5e4 100644
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.9 2007-01-18 12:09:58 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
  *   This program is free software; you can redistribute it and/or modify
@@ -152,7 +152,6 @@ public:
 	: m_db(db), m_stemlang(stmlng) 
     { }
 
-
     bool translate(const string &iq,
 		   const string &prefix,
 		   string &ermsg,
@@ -168,7 +167,8 @@ public:
     }
 
 private:
-    void maybeStemExp(bool dont, const string& term, list<string>& exp);
+    void maybeStemExp(bool dont, const string& term, list<string>& exp, 
+		      string& sterm);
 
     Db&           m_db;
     const string& m_stemlang;
@@ -178,22 +178,37 @@ private:
     vector<vector<string> > m_groups; 
 };
 
-/** Make term dumb and possibly expand it into its stem siblings. */
+/** Unaccent and lowercase term, possibly expand stem and wildcards
+ *
+ * @param nostemexp don't perform stem expansion. This is mainly used to
+ *   prevent stem expansion inside phrases. 2 other factors can turn
+ *   stem expansion off: a null stemlang, resulting from a global user
+ *   preference, or a capitalized term.
+ * @param term input single word
+ * @param exp output expansion list
+ * @param sterm output lower-cased+unaccented version of the input term 
+ *              (only if stem expansion actually occured, else empty)
+ */
 void StringToXapianQ::maybeStemExp(bool nostemexp, 
 				   const string& term, 
-				   list<string>& exp)
+				   list<string>& exp,
+				   string &sterm)
 {
     LOGDEB2(("maybeStemExp: term [%s] stemlang [%s] nostemexp %d\n", 
 	    term.c_str(), m_stemlang.c_str(), nostemexp));
+    sterm.erase();
     if (term.empty()) {
 	exp.clear();
 	return;
     }
-
+    // term1 is lowercase and without diacritics
     string term1;
     dumb_string(term, term1);
 
-    if (m_stemlang.empty())
+    bool haswild = term.find_first_of("*?") != string::npos;
+
+    // No stemming if there are wildcards or prevented globally.
+    if (haswild || m_stemlang.empty())
 	nostemexp = true;
 
     if (!nostemexp) {
@@ -201,7 +216,7 @@ void StringToXapianQ::maybeStemExp(bool nostemexp,
 	// case we do not want to do stem expansion. Note that
 	// the test is convoluted and possibly problematic
 
-	string noacterm,noaclowterm;
+	string noacterm, noaclowterm;
 	if (unacmaybefold(term, noacterm, "UTF-8", false) &&
 	    unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
 	    Utf8Iter it1(noacterm);
@@ -209,14 +224,20 @@ void StringToXapianQ::maybeStemExp(bool nostemexp,
 	    if (*it1 != *it2)
 		nostemexp = true;
 	}
-	LOGDEB1(("Term: %s stem expansion: %s\n", term.c_str()));
     }
 
-    if (nostemexp) {
-	exp = list<string>(1, term1);
+    if (nostemexp && !haswild) {
+	// Neither stemming nor wildcard expansion: just the word
+	exp.push_front(term1);
+	exp.resize(1);
     } else {
 	list<TermMatchEntry> l;
-	m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
+	if (haswild) {
+	    m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);
+	} else {
+	    sterm = term1;
+	    m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
+	}
 	for (list<TermMatchEntry>::const_iterator it = l.begin(); 
 	     it != l.end(); it++) {
 	    exp.push_back(it->term);
@@ -285,7 +306,6 @@ bool StringToXapianQ::translate(const string &iq,
 				list<Xapian::Query> &pqueries,
 				int slack, bool useNear)
 {
-    string qstring = iq;
     LOGDEB2(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
     ermsg.erase();
     m_terms.clear();
@@ -293,7 +313,7 @@ bool StringToXapianQ::translate(const string &iq,
 
     // Split into words and phrases (word1 word2 "this is a phrase"):
     list<string> phrases;
-    stringToStrings(qstring, phrases);
+    stringToStrings(iq, phrases);
 
     // Then process each word/phrase: split into terms and transform
     // into appropriate Xapian Query
@@ -307,9 +327,13 @@ bool StringToXapianQ::translate(const string &iq,
 	    // a span would fail if we didn't adjust the proximity to
 	    // account for the additional span term which is complicated.
 	    wsQData splitDataS, splitDataW;
-	    TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS);
+	    TextSplit splitterS(&splitDataS, (TextSplit::Flags)
+				(TextSplit::TXTS_ONLYSPANS | 
+				 TextSplit::TXTS_KEEPWILD));
 	    splitterS.text_to_words(*it);
-	    TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
+	    TextSplit splitterW(&splitDataW, (TextSplit::Flags)
+				(TextSplit::TXTS_NOSPANS | 
+				 TextSplit::TXTS_KEEPWILD));
 	    splitterW.text_to_words(*it);
 	    wsQData *splitData = &splitDataS;
 	    if (splitDataS.terms.size() > 1 && 
@@ -324,12 +348,19 @@ bool StringToXapianQ::translate(const string &iq,
 		{
 		    string term = splitData->terms.front();
 		    list<string> exp;  
-		    maybeStemExp(false, term, exp);
+		    string sterm;
+		    maybeStemExp(false, term, exp, sterm);
 		    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
 		    // Push either term or OR of stem-expanded set
 		    addPrefix(exp, prefix);
-		    pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
-						     exp.begin(), exp.end()));
+		    Xapian::Query xq(Xapian::Query::OP_OR, 
+				     exp.begin(), exp.end());
+		    // Give a relevance boost to the original term
+		    if (!sterm.empty()) {
+			xq = Xapian::Query(Xapian::Query::OP_OR, 
+					   xq, Xapian::Query(sterm, 10));
+		    }
+		    pqueries.push_back(xq);
 		}
 		break;
 
@@ -347,9 +378,9 @@ bool StringToXapianQ::translate(const string &iq,
 		    bool nostemexp = 
 			(op == Xapian::Query::OP_PHRASE || hadmultiple) ?
 			true : false;
-
+		    string sterm;
 		    list<string>exp;
-		    maybeStemExp(nostemexp, *it, exp);
+		    maybeStemExp(nostemexp, *it, exp, sterm);
 
 		    groups.push_back(vector<string>(exp.begin(), exp.end()));
 		    addPrefix(exp, prefix);