Modified the point at which we unaccent terms so that the Capitalized->nostemming test can be applied to single words (this had been broken by the change of unaccent/split order done earlier to get Japanese to work).
This commit is contained in:
parent
30c46709ba
commit
7dcc7c61c8
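The substance of the change, before the hunks: previously the whole input buffer was unaccented and case-folded (via the now-removed dumb_string() helper) and then handed to the word splitter, so the takeword() callbacks only ever saw folded terms. After this commit the raw text is split first, and each term is unaccented/folded individually inside the callbacks, which lets the query side still see the original capitalization of a single word before folding it. A minimal sketch of the two orderings, where fold() is a hypothetical ASCII-only stand-in for unacmaybefold() (the real function strips accents and handles full Unicode):

#include <algorithm>
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// ASCII-only stand-in for unacmaybefold(in, out, "UTF-8", true).
static std::string fold(const std::string& in) {
    std::string out(in);
    std::transform(out.begin(), out.end(), out.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    return out;
}

// Trivial whitespace splitter standing in for TextSplit.
static std::vector<std::string> split(const std::string& text) {
    std::istringstream is(text);
    std::vector<std::string> words;
    for (std::string w; is >> w; )
        words.push_back(w);
    return words;
}

int main() {
    std::string text = "Paris in the Spring";

    // Old order: fold the whole buffer, then split. Capitalization is
    // gone before the splitter ever sees a word.
    for (const auto& w : split(fold(text)))
        std::cout << w << " ";              // paris in the spring
    std::cout << "\n";

    // New order: split the raw text, fold per term. The callback can
    // still observe that "Paris" and "Spring" were capitalized.
    for (const auto& w : split(text)) {
        bool capitalized = !w.empty() && std::isupper((unsigned char)w[0]);
        std::cout << fold(w) << (capitalized ? "*" : "") << " ";
    }
    std::cout << "\n";                      // paris* in the spring*
    return 0;
}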
@@ -42,6 +42,7 @@ using std::set;
 #include "smallut.h"
 #include "plaintorich.h"
 #include "cancelcheck.h"
+#include "unacpp.h"
 
 const string PlainToRich::snull = "";
 
@@ -84,7 +85,10 @@ class myTextSplitCB : public TextSplitCB {
     // Callback called by the text-to-words breaker for each word
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
         string dumb;
-        Rcl::dumb_string(term, dumb);
+        if (!unacmaybefold(term, dumb, "UTF-8", true)) {
+            LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
+            return true;
+        }
         //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
         //         pos, bts, bte));
@@ -784,27 +784,33 @@ private:
 };
 
 // Callback for the document to word splitting class during indexation
-bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
+bool mySplitterCB::takeword(const std::string &_term, int pos, int, int)
 {
 #if 0
-    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
-    string printable;
-    if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
-        LOGDEB((" [%s]\n", printable.c_str()));
-    }
+    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", _term.c_str()));
 #endif
+    string term;
+    if (!unacmaybefold(_term, term, "UTF-8", true)) {
+        LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", _term.c_str()));
+        term.clear();
+        // We don't generate a fatal error because of a bad term
+        return true;
+    }
+
+    if (stops.hasStops() && stops.isStop(term)) {
+        LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
+        return true;
+    }
+
+    // Compute absolute position (pos is relative to current segment),
+    // and remember relative.
+    curpos = pos;
+    pos += basepos;
     string ermsg;
     try {
-        if (stops.hasStops() && stops.isStop(term)) {
-            LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
-            return true;
-        }
         // Note: 1 is the within document frequency increment. It would
         // be possible to assign different weigths to doc parts (ie title)
         // by using a higher value
-        curpos = pos;
-        pos += basepos;
         doc.add_posting(term, pos, 1);
         if (!prefix.empty()) {
             doc.add_posting(prefix + term, pos, 1);
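For reference, a minimal compilable illustration of the posting calls at the end of this hunk. It requires Xapian; the term value, position, and the "S" field prefix are made-up example data, not values taken from Recoll:

#include <xapian.h>
#include <string>

int main() {
    Xapian::Document doc;
    std::string term = "ecologie";    // a term already unaccented/folded
    Xapian::termpos pos = 42;         // absolute position: basepos + curpos
    std::string prefix = "S";         // hypothetical field prefix

    // 1 is the within-document frequency increment; as the comment in
    // the hunk notes, a title term could pass a higher value for weight.
    doc.add_posting(term, pos, 1);
    doc.add_posting(prefix + term, pos, 1);
    return 0;
}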
@@ -815,28 +821,6 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
     return false;
 }
 
-// Unaccent and lowercase data, replace \n\r with spaces
-// Removing crlfs is so that we can use the text in the document data fields.
-// Use unac (with folding extension) for removing accents and casefolding
-//
-// Note that we always return true (but set out to "" on error). We don't
-// want to stop indexation because of a bad string
-bool dumb_string(const string &in, string &out)
-{
-    out.clear();
-    if (in.empty())
-        return true;
-
-    string s1 = neutchars(in, "\n\r");
-    if (!unacmaybefold(s1, out, "UTF-8", true)) {
-        LOGINFO(("dumb_string: unac failed for [%s]\n", in.c_str()));
-        out.clear();
-        // See comment at start of func
-        return true;
-    }
-    return true;
-}
-
 // Let our user set the parameters for abstract processing
 void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
 {
@@ -891,14 +875,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
     Xapian::Document newdocument;
     mySplitterCB splitData(newdocument, m_stops);
     TextSplit splitter(&splitData);
-    string noacc;
 
     // Split and index file name as document term(s)
     LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
-    if (dumb_string(doc.utf8fn, noacc)) {
-        splitter.text_to_words(noacc);
-        splitData.basepos += splitData.curpos + 100;
-    }
+    splitter.text_to_words(doc.utf8fn);
+    splitData.basepos += splitData.curpos + 100;
 
     // Index textual metadata. These are all indexed as text with
     // positions, as we may want to do phrase searches with them (this
@@ -918,12 +899,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
         LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
                  meta_it->first.c_str(), pfx.c_str(),
                  meta_it->second.c_str()));
-        if (!dumb_string(meta_it->second, noacc)) {
-            LOGERR(("Db::add: dumb_string failed\n"));
-            return false;
-        }
         splitData.setprefix(pfx); // Subject
-        splitter.text_to_words(noacc);
+        splitter.text_to_words(meta_it->second);
         splitData.setprefix(string());
         splitData.basepos += splitData.curpos + 100;
     }
@@ -936,31 +913,28 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 
     // Split and index body text
     LOGDEB2(("Db::add: split body\n"));
-    if (!dumb_string(doc.text, noacc)) {
-        LOGERR(("Db::add: dumb_string failed\n"));
-        return false;
-    }
-    splitter.text_to_words(noacc);
+    splitter.text_to_words(doc.text);
 
     ////// Special terms for other metadata. No positions for these.
     // Mime type
     newdocument.add_term("T" + doc.mimetype);
 
-    // Simple file name. This is used for file name searches only. We index
-    // it with a term prefix. utf8fn used to be the full path, but it's now
-    // the simple file name.
+    // Simple file name indexed for file name searches with a term prefix
     // We also add a term for the filename extension if any.
-    if (dumb_string(doc.utf8fn, noacc) && !noacc.empty()) {
-        // We should truncate after extracting the extension, but this is
-        // a pathological case anyway
-        if (noacc.size() > 230)
-            utf8truncate(noacc, 230);
-        string::size_type pos = noacc.rfind('.');
-        if (pos != string::npos && pos != noacc.length() -1) {
-            newdocument.add_term(string("XE") + noacc.substr(pos+1));
-        }
-        noacc = string("XSFN") + noacc;
-        newdocument.add_term(noacc);
-    }
+    if (!doc.utf8fn.empty()) {
+        string fn;
+        if (unacmaybefold(doc.utf8fn, fn, "UTF-8", true)) {
+            // We should truncate after extracting the extension, but this is
+            // a pathological case anyway
+            if (fn.size() > 230)
+                utf8truncate(fn, 230);
+            string::size_type pos = fn.rfind('.');
+            if (pos != string::npos && pos != fn.length() - 1) {
+                newdocument.add_term(string("XE") + fn.substr(pos + 1));
+            }
+            fn = string("XSFN") + fn;
+            newdocument.add_term(fn);
+        }
+    }
 
     // Udi unique term: this is used for file existence/uptodate
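A worked example of the terms generated above: assuming unac maps é to e and folding lowercases, a file named Résumé.PDF leaves fn == "resume.pdf", hence the extension term XEpdf and the file name term XSFNresume.pdf. A standalone sketch of just the term derivation:

#include <iostream>
#include <string>

int main() {
    // fn as unacmaybefold() would leave it for "Résumé.PDF" (assumption:
    // unac strips the accents and folding lowercases).
    std::string fn = "resume.pdf";
    std::string::size_type pos = fn.rfind('.');
    if (pos != std::string::npos && pos != fn.length() - 1)
        std::cout << "XE" << fn.substr(pos + 1) << "\n"; // XEpdf
    std::cout << "XSFN" << fn << "\n";                   // XSFNresume.pdf
    return 0;
}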
@@ -1329,7 +1303,10 @@ bool Db::purgeFile(const string &udi)
 bool Db::filenameWildExp(const string& fnexp, list<string>& names)
 {
     string pattern;
-    dumb_string(fnexp, pattern);
+    if (!unacmaybefold(fnexp, pattern, "UTF-8", true)) {
+        LOGERR(("Db::filenameWildExp: unac error for [%s]\n", fnexp.c_str()));
+        return false;
+    }
     names.clear();
 
     // If pattern is not quoted, and has no wildcards, we add * at
@@ -1415,7 +1392,10 @@ bool Db::termMatch(MatchType typ, const string &lang,
 
     // Get rid of capitals and accents
     string droot;
-    dumb_string(root, droot);
+    if (!unacmaybefold(root, droot, "UTF-8", true)) {
+        LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
+        return false;
+    }
     string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
 
     string prefix;
@@ -243,9 +243,6 @@ private:
     Db& operator=(const Db &) {return *this;};
 };
 
-// Unaccent and lowercase data.
-extern bool dumb_string(const string &in, string &out);
-
 #ifndef NO_NAMESPACES
 }
 #endif // NO_NAMESPACES
@@ -183,17 +183,39 @@ class wsQData : public TextSplitCB {
     wsQData(const StopList &_stops)
         : stops(_stops), alltermcount(0)
     {}
-    vector<string> terms;
-    bool takeword(const std::string &term, int , int, int) {
+    bool takeword(const std::string &interm, int , int, int) {
         alltermcount++;
-        LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
-        if (stops.hasStops() && stops.isStop(term)) {
-            LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
+        LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
+
+        // Check if the first letter is a majuscule in which
+        // case we do not want to do stem expansion. Note that
+        // the test is convoluted and possibly problematic
+        string noacterm, noaclowterm;
+        if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
+            LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
+            return true;
+        }
+        if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
+            LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
             return true;
         }
-        terms.push_back(term);
+        bool nostemexp = false;
+        Utf8Iter it1(noacterm);
+        Utf8Iter it2(noaclowterm);
+        if (*it1 != *it2)
+            nostemexp = true;
+
+        if (stops.hasStops() && stops.isStop(noaclowterm)) {
+            LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
+            return true;
+        }
+        terms.push_back(noaclowterm);
+        nostemexps.push_back(nostemexp);
         return true;
     }
 
+    vector<string> terms;
+    vector<bool> nostemexps;
     const StopList &stops;
     // Count of terms including stopwords: this is for adjusting
     // phrase/near slack
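This hunk is the point of the commit: unac is applied twice to the raw query term, once without folding (accents stripped, case preserved) and once with folding, and the first code points of the two results are compared. If they differ, the word started with a capital letter, so it is recorded with nostemexp set and stem expansion will be skipped for it (a capitalized query word like "Rose" is probably a proper noun and should not match "roses"). A standalone sketch of the same test, with hypothetical ASCII-only stand-ins for unacmaybefold() and Utf8Iter:

#include <cctype>
#include <iostream>
#include <string>

// ASCII-only stand-ins: unaccent() plays the role of
// unacmaybefold(in, out, "UTF-8", false), unaccentFold() the role of
// unacmaybefold(in, out, "UTF-8", true). The real code compares the
// first *code points* through Utf8Iter, not raw bytes.
static std::string unaccent(const std::string& s) { return s; }
static std::string unaccentFold(const std::string& s) {
    std::string out(s);
    for (char& c : out)
        c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    return out;
}

static bool noStemExpansion(const std::string& term) {
    std::string noac = unaccent(term);        // accents off, case kept
    std::string noaclow = unaccentFold(noac); // accents off, case folded
    // First letters differ <=> the word was capitalized.
    return !noac.empty() && noac[0] != noaclow[0];
}

int main() {
    std::cout << noStemExpansion("Rose") << "\n"; // 1: skip stem expansion
    std::cout << noStemExpansion("rose") << "\n"; // 0: expand normally
    return 0;
}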
@@ -232,7 +254,7 @@ private:
     void expandTerm(bool dont, const string& term, list<string>& exp,
                     string& sterm);
     // After splitting entry on whitespace: process non-phrase element
-    void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
+    void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
     // Process phrase/near element
     void processPhraseOrNear(wsQData *splitData,
                              list<Xapian::Query> &pqueries,
@@ -279,18 +301,6 @@ void StringToXapianQ::expandTerm(bool nostemexp,
         nostemexp = true;
 
-    if (!nostemexp) {
-        // Check if the first letter is a majuscule in which
-        // case we do not want to do stem expansion. Note that
-        // the test is convoluted and possibly problematic
-
-        string noacterm, noaclowterm;
-        if (unacmaybefold(term, noacterm, "UTF-8", false) &&
-            unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
-            Utf8Iter it1(noacterm);
-            Utf8Iter it2(noaclowterm);
-            if (*it1 != *it2)
-                nostemexp = true;
-        }
-    }
-
     if (nostemexp && !haswild) {
@@ -356,12 +366,12 @@ static void addPrefix(list<string>& terms, const string& prefix)
         it->insert(0, prefix);
 }
 
-void StringToXapianQ::processSimpleSpan(const string& span,
+void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
                                         list<Xapian::Query> &pqueries)
 {
     list<string> exp;
     string sterm; // dumb version of user term
-    expandTerm(false, span, exp, sterm);
+    expandTerm(nostemexp, span, exp, sterm);
     m_terms.insert(m_terms.end(), exp.begin(), exp.end());
     addPrefix(exp, m_prefix);
     // Push either term or OR of stem-expanded set
@@ -396,12 +406,13 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
     vector<vector<string> >groups;
 
     // Go through the list and perform stem/wildcard expansion for each element
+    vector<bool>::iterator nxit = splitData->nostemexps.begin();
     for (vector<string>::iterator it = splitData->terms.begin();
-         it != splitData->terms.end(); it++) {
+         it != splitData->terms.end(); it++, nxit++) {
         // Adjust when we do stem expansion. Not inside phrases, and
         // some versions of xapian will accept only one OR clause
         // inside NEAR, all others must be leafs.
-        bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
+        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
 
         string sterm;
         list<string>exp;
@@ -434,7 +445,10 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
 
 /**
  * Turn user entry string (NOT query language) into a list of xapian queries.
- * We just separate words and phrases, and do wildcard and stemp expansion,
+ * We just separate words and phrases, and do wildcard and stem expansion,
+ *
+ * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
+ * the GUI.
  *
  * The final list contains one query for each term or phrase
  *    - Elements corresponding to a stem-expanded part are an OP_OR
@@ -444,7 +458,7 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
  * @return the subquery count (either or'd stem-expanded terms or phrase word
  * count)
  */
-bool StringToXapianQ::processUserString(const string &_iq,
+bool StringToXapianQ::processUserString(const string &iq,
                                         string &ermsg,
                                         list<Xapian::Query> &pqueries,
                                         const StopList& stops,
@@ -452,25 +466,18 @@ bool StringToXapianQ::processUserString(const string &_iq,
                                         bool useNear
     )
 {
-    LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
+    LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
     ermsg.erase();
     m_terms.clear();
     m_groups.clear();
 
-    // First unaccent/normalize the input: do it first so that it
-    // happens in the same order as when indexing: unac then split. As
-    // the character count can change during normalisation, this is
-    // specially important for cjk because the artificial cjk split is
-    // based on character counts
-    string iq;
-    dumb_string(_iq, iq);
-
     // Simple whitespace-split input into user-level words and
-    // double-quoted phrases: word1 word2 "this is a phrase". The text
-    // splitter may further still decide that the resulting "words"
-    // are really phrases, this depends on separators: [paul@dom.net]
-    // would still be a word (span), but [about:me] will probably be
-    // handled as a phrase.
+    // double-quoted phrases: word1 word2 "this is a phrase".
+    //
+    // The text splitter may further still decide that the resulting
+    // "words" are really phrases, this depends on separators:
+    // [paul@dom.net] would still be a word (span), but [about:me]
+    // will probably be handled as a phrase.
     list<string> phrases;
     TextSplit::stringToStrings(iq, phrases);
 
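The pre-split described in the comment above can be pictured with a simplified stand-in for TextSplit::stringToStrings() (an assumption-laden sketch: it only handles ASCII whitespace and double quotes, while the real function deals with many more cases):

#include <cctype>
#include <iostream>
#include <list>
#include <string>

// Simplified stand-in: whitespace-split, but keep "double quoted"
// sequences together as single phrase entries.
static void stringToStrings(const std::string& in, std::list<std::string>& out) {
    std::string cur;
    bool inquote = false;
    for (char c : in) {
        if (c == '"') {
            inquote = !inquote;
        } else if (!inquote && std::isspace(static_cast<unsigned char>(c))) {
            if (!cur.empty()) { out.push_back(cur); cur.clear(); }
        } else {
            cur += c;
        }
    }
    if (!cur.empty()) out.push_back(cur);
}

int main() {
    std::list<std::string> phrases;
    stringToStrings("word1 word2 \"this is a phrase\"", phrases);
    for (const std::string& p : phrases)
        std::cout << "[" << p << "]\n"; // [word1] [word2] [this is a phrase]
    return 0;
}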
@@ -516,7 +523,7 @@ bool StringToXapianQ::processUserString(const string &_iq,
         case 0:
             continue;// ??
         case 1:
-            processSimpleSpan(splitData->terms.front(), pqueries);
+            processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
             break;
         default:
             processPhraseOrNear(splitData, pqueries, useNear, slack);