Handle the case where unac produces whitespace, which may occur with letter-less accents
This commit is contained in:
parent
94eb3119ce
commit
04cd868950
@ -14,13 +14,12 @@
|
|||||||
* Free Software Foundation, Inc.,
|
* Free Software Foundation, Inc.,
|
||||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
#ifndef _TERMPROC_H_INCLUDED_
|
#ifndef _TERMPROC_H_INCLUDED_
|
||||||
#define _TERMPROC_H_INCLUDED_
|
#define _TERMPROC_H_INCLUDED_
|
||||||
|
|
||||||
#include "textsplit.h"
|
#include "textsplit.h"
|
||||||
#include "stoplist.h"
|
#include "stoplist.h"
|
||||||
|
#include "smallut.h"
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
@ -140,14 +139,43 @@ public:
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// It may happen in some weird cases that the output from unac is
|
|
||||||
// empty (if the word actually consisted entirely of diacritics ...)
|
if (otrm.empty()) {
|
||||||
// The consequence is that a phrase search won't work without additional
|
// It may happen in some weird cases that the output from
|
||||||
// slack.
|
// unac is empty (if the word actually consisted entirely
|
||||||
if (otrm.empty())
|
// of diacritics ...) The consequence is that a phrase
|
||||||
|
// search won't work without additional slack.
|
||||||
return true;
|
return true;
|
||||||
else
|
}
|
||||||
return TermProc::takeword(otrm, pos, bs, be);
|
|
||||||
|
// It may also occur that unac introduces spaces in the string
|
||||||
|
// (when removing isolated accents, may happen for Greek
|
||||||
|
// for example). This is a pathological situation. We
|
||||||
|
// index all the resulting terms at the same pos because
|
||||||
|
// the surrounding code is not designed to handle a pos
|
||||||
|
// change in here. This means that phrase searches and
|
||||||
|
// snippets will be wrong, but at least searching for the
|
||||||
|
// terms will work.
|
||||||
|
bool hasspace = false;
|
||||||
|
for (string::const_iterator it = otrm.begin();it < otrm.end();it++) {
|
||||||
|
if (*it == ' ') {
|
||||||
|
hasspace=true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (hasspace) {
|
||||||
|
vector<string> terms;
|
||||||
|
stringToTokens(otrm, terms, " ", true);
|
||||||
|
for (vector<string>::const_iterator it = terms.begin();
|
||||||
|
it < terms.end(); it++) {
|
||||||
|
if (!TermProc::takeword(*it, pos, bs, be)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return TermProc::takeword(otrm, pos, bs, be);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool flush()
|
virtual bool flush()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user