Modified the time at which we unaccent, so that the Capitalized->nostemming test can be done on single words (this had been broken by the change of noac/split order done earlier to get Japanese to work).

dockes 2009-01-26 18:30:48 +00:00
parent 30c46709ba
commit 7dcc7c61c8
4 changed files with 96 additions and 108 deletions
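
At the heart of the commit is the capitalization test: a term is unaccented twice, once without and once with case folding, and the first characters of the two results are compared. If they differ, the word started with an uppercase letter and stem expansion is skipped. A minimal sketch of the idea, using the unacmaybefold() and Utf8Iter interfaces as they appear in the diffs below (the header names are assumptions):

    #include <string>
    #include "unacpp.h"   // unacmaybefold(); header name assumed
    #include "utf8iter.h" // Utf8Iter; header name assumed

    // Return true if a UTF-8 term should skip stem expansion because its
    // first letter is uppercase (accents included: "École" -> true).
    static bool capitalized(const std::string& term)
    {
        std::string noac, noaclow;
        // Unaccent without case folding, then unaccent and fold
        if (!unacmaybefold(term, noac, "UTF-8", false) ||
            !unacmaybefold(noac, noaclow, "UTF-8", true))
            return false; // on unac failure, allow normal stem expansion
        // If folding changed the first character, the term was capitalized
        Utf8Iter it1(noac);
        Utf8Iter it2(noaclow);
        return *it1 != *it2;
    }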

View File

@@ -42,6 +42,7 @@ using std::set;
 #include "smallut.h"
 #include "plaintorich.h"
 #include "cancelcheck.h"
+#include "unacpp.h"

 const string PlainToRich::snull = "";
@@ -84,7 +85,10 @@ class myTextSplitCB : public TextSplitCB {
     // Callback called by the text-to-words breaker for each word
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
         string dumb;
-        Rcl::dumb_string(term, dumb);
+        if (!unacmaybefold(term, dumb, "UTF-8", true)) {
+            LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
+            return true;
+        }
         //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
         //        pos, bts, bte));

View File

@@ -784,27 +784,33 @@ private:
 };

 // Callback for the document to word splitting class during indexation
-bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
+bool mySplitterCB::takeword(const std::string &_term, int pos, int, int)
 {
 #if 0
-    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
-    string printable;
-    if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
-        LOGDEB((" [%s]\n", printable.c_str()));
-    }
+    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", _term.c_str()));
 #endif
+    string term;
+    if (!unacmaybefold(_term, term, "UTF-8", true)) {
+        LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", _term.c_str()));
+        term.clear();
+        // We don't generate a fatal error because of a bad term
+        return true;
+    }
+    if (stops.hasStops() && stops.isStop(term)) {
+        LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
+        return true;
+    }
+    // Compute absolute position (pos is relative to current segment),
+    // and remember relative.
+    curpos = pos;
+    pos += basepos;
     string ermsg;
     try {
-        if (stops.hasStops() && stops.isStop(term)) {
-            LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
-            return true;
-        }
         // Note: 1 is the within document frequency increment. It would
         // be possible to assign different weights to doc parts (ie title)
         // by using a higher value
-        curpos = pos;
-        pos += basepos;
         doc.add_posting(term, pos, 1);
         if (!prefix.empty()) {
             doc.add_posting(prefix + term, pos, 1);
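
The note above about the frequency increment refers to the third argument of Xapian::Document::add_posting(), the wdf increase. A hypothetical sketch of how document parts could be weighted differently (the boost value of 2 is illustrative, not something this commit does):

    #include <xapian.h>
    #include <string>

    // Hypothetical helper: give title terms a higher within-document
    // frequency increment than body terms, increasing their weight.
    static void add_weighted_posting(Xapian::Document& doc,
                                     const std::string& term,
                                     Xapian::termpos pos, bool intitle)
    {
        doc.add_posting(term, pos, intitle ? 2 : 1);
    }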
@@ -815,28 +821,6 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
     return false;
 }

-// Unaccent and lowercase data, replace \n\r with spaces
-// Removing crlfs is so that we can use the text in the document data fields.
-// Use unac (with folding extension) for removing accents and casefolding
-//
-// Note that we always return true (but set out to "" on error). We don't
-// want to stop indexation because of a bad string
-bool dumb_string(const string &in, string &out)
-{
-    out.clear();
-    if (in.empty())
-        return true;
-    string s1 = neutchars(in, "\n\r");
-    if (!unacmaybefold(s1, out, "UTF-8", true)) {
-        LOGINFO(("dumb_string: unac failed for [%s]\n", in.c_str()));
-        out.clear();
-        // See comment at start of func
-        return true;
-    }
-    return true;
-}

 // Let our user set the parameters for abstract processing
 void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
 {
@@ -891,14 +875,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
     Xapian::Document newdocument;
     mySplitterCB splitData(newdocument, m_stops);
     TextSplit splitter(&splitData);
-    string noacc;

     // Split and index file name as document term(s)
     LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
-    if (dumb_string(doc.utf8fn, noacc)) {
-        splitter.text_to_words(noacc);
-        splitData.basepos += splitData.curpos + 100;
-    }
+    splitter.text_to_words(doc.utf8fn);
+    splitData.basepos += splitData.curpos + 100;

     // Index textual metadata. These are all indexed as text with
     // positions, as we may want to do phrase searches with them (this
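
The basepos/curpos arithmetic keeps a gap of 100 positions between successive sections (file name, metadata fields, body), so that phrase and proximity searches cannot match across a section boundary. Schematically (values invented):

    // Positions assigned to terms are basepos + relative position.
    int basepos = 1;  // absolute base of the current section
    int curpos = 0;   // last relative position seen by the splitter

    // After the file name is split into, say, 3 words (relative 0..2):
    curpos = 2;
    basepos += curpos + 100;  // next section starts at 103, gap of 100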
@@ -918,12 +899,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
             LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
                      meta_it->first.c_str(), pfx.c_str(),
                      meta_it->second.c_str()));
-            if (!dumb_string(meta_it->second, noacc)) {
-                LOGERR(("Db::add: dumb_string failed\n"));
-                return false;
-            }
             splitData.setprefix(pfx); // Subject
-            splitter.text_to_words(noacc);
+            splitter.text_to_words(meta_it->second);
             splitData.setprefix(string());
             splitData.basepos += splitData.curpos + 100;
         }
@@ -936,31 +913,28 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
     // Split and index body text
     LOGDEB2(("Db::add: split body\n"));
-    if (!dumb_string(doc.text, noacc)) {
-        LOGERR(("Db::add: dumb_string failed\n"));
-        return false;
-    }
-    splitter.text_to_words(noacc);
+    splitter.text_to_words(doc.text);

     ////// Special terms for other metadata. No positions for these.
     // Mime type
     newdocument.add_term("T" + doc.mimetype);

-    // Simple file name. This is used for file name searches only. We index
-    // it with a term prefix. utf8fn used to be the full path, but it's now
-    // the simple file name.
+    // Simple file name indexed for file name searches with a term prefix
     // We also add a term for the filename extension if any.
-    if (dumb_string(doc.utf8fn, noacc) && !noacc.empty()) {
-        // We should truncate after extracting the extension, but this is
-        // a pathological case anyway
-        if (noacc.size() > 230)
-            utf8truncate(noacc, 230);
-        string::size_type pos = noacc.rfind('.');
-        if (pos != string::npos && pos != noacc.length() - 1) {
-            newdocument.add_term(string("XE") + noacc.substr(pos + 1));
+    if (!doc.utf8fn.empty()) {
+        string fn;
+        if (unacmaybefold(doc.utf8fn, fn, "UTF-8", true)) {
+            // We should truncate after extracting the extension, but this is
+            // a pathological case anyway
+            if (fn.size() > 230)
+                utf8truncate(fn, 230);
+            string::size_type pos = fn.rfind('.');
+            if (pos != string::npos && pos != fn.length() - 1) {
+                newdocument.add_term(string("XE") + fn.substr(pos + 1));
+            }
+            fn = string("XSFN") + fn;
+            newdocument.add_term(fn);
         }
-        noacc = string("XSFN") + noacc;
-        newdocument.add_term(noacc);
     }

     // Udi unique term: this is used for file existence/uptodate
@@ -1329,7 +1303,10 @@ bool Db::purgeFile(const string &udi)
 bool Db::filenameWildExp(const string& fnexp, list<string>& names)
 {
     string pattern;
-    dumb_string(fnexp, pattern);
+    if (!unacmaybefold(fnexp, pattern, "UTF-8", true)) {
+        LOGERR(("Db::filenameWildExp: unac error for [%s]\n", fnexp.c_str()));
+        return false;
+    }
     names.clear();

     // If pattern is not quoted, and has no wildcards, we add * at
@@ -1415,7 +1392,10 @@ bool Db::termMatch(MatchType typ, const string &lang,
     // Get rid of capitals and accents
     string droot;
-    dumb_string(root, droot);
+    if (!unacmaybefold(root, droot, "UTF-8", true)) {
+        LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
+        return false;
+    }
     string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
     string prefix;

View File

@@ -243,9 +243,6 @@ private:
     Db& operator=(const Db &) {return *this;};
 };

-// Unaccent and lowercase data.
-extern bool dumb_string(const string &in, string &out);

 #ifndef NO_NAMESPACES
 }
 #endif // NO_NAMESPACES

View File

@@ -183,17 +183,39 @@ class wsQData : public TextSplitCB {
     wsQData(const StopList &_stops)
         : stops(_stops), alltermcount(0)
     {}
-    vector<string> terms;
-    bool takeword(const std::string &term, int , int, int) {
+    bool takeword(const std::string &interm, int , int, int) {
         alltermcount++;
-        LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
-        if (stops.hasStops() && stops.isStop(term)) {
-            LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
+        LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
+        // Check if the first letter is a majuscule in which
+        // case we do not want to do stem expansion. Note that
+        // the test is convoluted and possibly problematic
+        string noacterm, noaclowterm;
+        if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
+            LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
+            return true;
+        }
+        if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
+            LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
             return true;
         }
-        terms.push_back(term);
+        bool nostemexp = false;
+        Utf8Iter it1(noacterm);
+        Utf8Iter it2(noaclowterm);
+        if (*it1 != *it2)
+            nostemexp = true;
+        if (stops.hasStops() && stops.isStop(noaclowterm)) {
+            LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
+            return true;
+        }
+        terms.push_back(noaclowterm);
+        nostemexps.push_back(nostemexp);
         return true;
     }
+    vector<string> terms;
+    vector<bool> nostemexps;
     const StopList &stops;

     // Count of terms including stopwords: this is for adjusting
     // phrase/near slack
@@ -232,7 +254,7 @@ private:
     void expandTerm(bool dont, const string& term, list<string>& exp,
                     string& sterm);
     // After splitting entry on whitespace: process non-phrase element
-    void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
+    void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
     // Process phrase/near element
     void processPhraseOrNear(wsQData *splitData,
                              list<Xapian::Query> &pqueries,
@@ -279,18 +301,6 @@ void StringToXapianQ::expandTerm(bool nostemexp,
         nostemexp = true;

-    if (!nostemexp) {
-        // Check if the first letter is a majuscule in which
-        // case we do not want to do stem expansion. Note that
-        // the test is convoluted and possibly problematic
-        string noacterm, noaclowterm;
-        if (unacmaybefold(term, noacterm, "UTF-8", false) &&
-            unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
-            Utf8Iter it1(noacterm);
-            Utf8Iter it2(noaclowterm);
-            if (*it1 != *it2)
-                nostemexp = true;
-        }
-    }

     if (nostemexp && !haswild) {
@@ -356,12 +366,12 @@ static void addPrefix(list<string>& terms, const string& prefix)
         it->insert(0, prefix);
 }

-void StringToXapianQ::processSimpleSpan(const string& span,
+void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
                                         list<Xapian::Query> &pqueries)
 {
     list<string> exp;
     string sterm; // dumb version of user term
-    expandTerm(false, span, exp, sterm);
+    expandTerm(nostemexp, span, exp, sterm);
     m_terms.insert(m_terms.end(), exp.begin(), exp.end());
     addPrefix(exp, m_prefix);
     // Push either term or OR of stem-expanded set
@@ -396,12 +406,13 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
     vector<vector<string> > groups;

     // Go through the list and perform stem/wildcard expansion for each element
+    vector<bool>::iterator nxit = splitData->nostemexps.begin();
     for (vector<string>::iterator it = splitData->terms.begin();
-         it != splitData->terms.end(); it++) {
+         it != splitData->terms.end(); it++, nxit++) {
         // Adjust when we do stem expansion. Not inside phrases, and
         // some versions of xapian will accept only one OR clause
         // inside NEAR, all others must be leafs.
-        bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
+        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;

         string sterm;
         list<string> exp;
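
The remark about some Xapian versions accepting only one OR clause inside NEAR concerns composite queries such as OP_NEAR, whose other subqueries must be plain term leaves. A generic sketch of building a NEAR query from a list of terms, not code from this commit (the window computation is an assumption):

    #include <xapian.h>
    #include <list>
    #include <string>

    // Build an OP_NEAR query over plain term leaves; the window is the
    // term count plus the slack allowed between them.
    static Xapian::Query near_query(const std::list<std::string>& terms,
                                    unsigned slack)
    {
        std::list<Xapian::Query> leaves;
        for (std::list<std::string>::const_iterator it = terms.begin();
             it != terms.end(); it++)
            leaves.push_back(Xapian::Query(*it));
        return Xapian::Query(Xapian::Query::OP_NEAR,
                             leaves.begin(), leaves.end(),
                             leaves.size() + slack);
    }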
@@ -434,7 +445,10 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
 /**
  * Turn user entry string (NOT query language) into a list of xapian queries.
- * We just separate words and phrases, and do wildcard and stemp expansion,
+ * We just separate words and phrases, and do wildcard and stem expansion,
+ *
+ * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
+ * the GUI.
  *
  * The final list contains one query for each term or phrase
  *    - Elements corresponding to a stem-expanded part are an OP_OR
@@ -444,7 +458,7 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
  * @return the subquery count (either or'd stem-expanded terms or phrase word
  *         count)
  */
-bool StringToXapianQ::processUserString(const string &_iq,
+bool StringToXapianQ::processUserString(const string &iq,
                                         string &ermsg,
                                         list<Xapian::Query> &pqueries,
                                         const StopList& stops,
@@ -452,25 +466,18 @@ bool StringToXapianQ::processUserString(const string &_iq,
                                         bool useNear
     )
 {
-    LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
+    LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
     ermsg.erase();
     m_terms.clear();
     m_groups.clear();

-    // First unaccent/normalize the input: do it first so that it
-    // happens in the same order as when indexing: unac then split. As
-    // the character count can change during normalisation, this is
-    // specially important for cjk because the artificial cjk split is
-    // based on character counts
-    string iq;
-    dumb_string(_iq, iq);
-
     // Simple whitespace-split input into user-level words and
-    // double-quoted phrases: word1 word2 "this is a phrase". The text
-    // splitter may further still decide that the resulting "words"
-    // are really phrases, this depends on separators: [paul@dom.net]
-    // would still be a word (span), but [about:me] will probably be
-    // handled as a phrase.
+    // double-quoted phrases: word1 word2 "this is a phrase".
+    //
+    // The text splitter may further still decide that the resulting
+    // "words" are really phrases, this depends on separators:
+    // [paul@dom.net] would still be a word (span), but [about:me]
+    // will probably be handled as a phrase.

     list<string> phrases;
     TextSplit::stringToStrings(iq, phrases);
@@ -516,7 +523,7 @@ bool StringToXapianQ::processUserString(const string &_iq,
         case 0:
             continue; // ??
         case 1:
-            processSimpleSpan(splitData->terms.front(), pqueries);
+            processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
             break;
         default:
             processPhraseOrNear(splitData, pqueries, useNear, slack);