stoplist: use stringToStrings in place of splitter to support quoted space-containing entries

This commit is contained in:
Jean-Francois Dockes 2011-10-04 16:04:28 +02:00
parent c25272a0d8
commit e4eba0de97
2 changed files with 26 additions and 27 deletions

View File

@ -19,7 +19,7 @@
#include "debuglog.h"
#include "readfile.h"
#include "unacpp.h"
#include "textsplit.h"
#include "smallut.h"
#include "stoplist.h"
#ifndef NO_NAMESPACES
@ -27,40 +27,33 @@ namespace Rcl
{
#endif
class TextSplitSW : public TextSplit {
public:
set<string>& stops;
TextSplitSW(Flags flags, set<string>& stps)
: TextSplit(flags), stops(stps)
{}
virtual bool takeword(const string& term, int, int, int)
{
string dterm;
unacmaybefold(term, dterm, "UTF-8", true);
stops.insert(dterm);
return true;
}
};
// Load the stop list from a file.
//
// Resets m_hasStops and m_stops, then reads the whole file into memory with
// file_to_string(). If the read fails, the reason is logged and false is
// returned with m_stops left empty. Otherwise the text is converted into
// terms which are inserted into m_stops, and true is returned.
//
// NOTE(review): this listing appears to show the old and the new lines of a
// change together: the read failure is logged twice (LOGDEB and LOGDEB0) and
// the text is processed both through TextSplitSW and through
// stringToStrings(). Confirm which lines are meant to be live before
// modifying this function.
bool StopList::setFile(const string &filename)
{
m_hasStops = false;
m_stops.clear();
string stoptext, reason;
if (!file_to_string(filename, stoptext, &reason)) {
// The message text says "StopList::StopList" although this is setFile().
LOGDEB(("StopList::StopList: file_to_string(%s) failed: %s\n",
filename.c_str(), reason.c_str()));
LOGDEB0(("StopList::StopList: file_to_string(%s) failed: %s\n",
filename.c_str(), reason.c_str()));
return false;
}
// Splitter pass: TextSplitSW::takeword() runs each term through
// unacmaybefold() and inserts the result directly into m_stops.
TextSplitSW ts(TextSplit::TXTS_ONLYSPANS, m_stops);
ts.text_to_words(stoptext);
// Evaluated before the stringToStrings() entries below are inserted.
m_hasStops = !m_stops.empty();
// stringToStrings() pass: entries are first collected into a local set
// (presumably honouring double quotes so that an entry may contain
// spaces -- confirm against smallut.h), then each one is run through
// unacmaybefold() with "UTF-8" as the encoding and added to m_stops.
set<string> stops;
stringToStrings(stoptext, stops);
for (set<string>::iterator it = stops.begin();
it != stops.end(); it++) {
string dterm;
unacmaybefold(*it, dterm, "UTF-8", true);
m_stops.insert(dterm);
}
return true;
}
// Most sites will have an empty stop list. We try to optimize the
// empty set case as much as possible. empty() is probably slightly faster than
// find() in this case.
//
// Returns true when the list is non-empty and term is found in m_stops,
// false otherwise. The term is looked up exactly as given; no folding is
// applied here.
//
// NOTE(review): two return statements follow each other; as written only the
// first one (based on m_hasStops) can ever execute. This looks like the old
// and the new line of a change shown together -- confirm which is intended.
bool StopList::isStop(const string &term) const
{
return m_hasStops ? m_stops.find(term) != m_stops.end() : false;
return m_stops.empty() ? false : m_stops.find(term) != m_stops.end();
}
@ -97,7 +90,7 @@ Usage(void)
}
// Sample terms for the test code (presumably looked up in the stop list by
// the test driver -- the code using them is not visible here). Some entries
// contain a space.
//
// NOTE(review): the initializer shows two lines that look like the old and
// the new version of the same line. There is no comma after the first
// "xweird", so if both lines are compiled the adjacent literals merge into
// the single string "xweirdthe". Confirm which line is meant to be live.
const string tstwords[] = {
"the", "is", "xweird"
"the", "is", "xweird", "autre", "autre double", "mot1", "mot double",
};
// Number of elements in tstwords.
const int tstsz = sizeof(tstwords) / sizeof(string);

View File

@ -27,18 +27,24 @@ namespace Rcl
{
#endif
/**
* A StopList is just a bunch of strings read from a file.
*
* Some of the strings may contain whitespace (that's for experimentation with
* stop n-grams), so we take care of dquotes while reading the file. We also
* lowercase and remove accents. The source file should be utf-8.
*
* NOTE(review): the default constructor and hasStops() each appear twice
* below, once using the m_hasStops flag and once relying on m_stops alone.
* This looks like the old and the new lines of a change shown together;
* confirm which set is meant to be live.
*/
class StopList {
public:
// Build an empty list (first form also initialises m_hasStops to false).
StopList() : m_hasStops(false) {}
StopList() {}
// Build a list from the contents of filename. The boolean result of
// setFile() is not checked here.
StopList(const string &filename) {setFile(filename);}
virtual ~StopList() {}
// Replace the current contents with the terms read from filename.
bool setFile(const string &filename);
// Tell whether term is one of the stored stop terms.
bool isStop(const string &term) const;
// Tell whether the list is non-empty: the first form reports the
// m_hasStops flag, the second tests m_stops directly.
bool hasStops() const {return m_hasStops;}
bool hasStops() const {return !m_stops.empty();}
private:
// Flag reported by the first hasStops() form above.
bool m_hasStops;
// The stop terms.
set<string> m_stops;
};