stoplist: use stringToStrings in place of splitter to support quoted space-containing entries
This commit is contained in:
parent
c25272a0d8
commit
e4eba0de97
@ -19,7 +19,7 @@
|
||||
#include "debuglog.h"
|
||||
#include "readfile.h"
|
||||
#include "unacpp.h"
|
||||
#include "textsplit.h"
|
||||
#include "smallut.h"
|
||||
#include "stoplist.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
@ -27,40 +27,33 @@ namespace Rcl
|
||||
{
|
||||
#endif
|
||||
|
||||
class TextSplitSW : public TextSplit {
|
||||
public:
|
||||
set<string>& stops;
|
||||
TextSplitSW(Flags flags, set<string>& stps)
|
||||
: TextSplit(flags), stops(stps)
|
||||
{}
|
||||
virtual bool takeword(const string& term, int, int, int)
|
||||
{
|
||||
string dterm;
|
||||
unacmaybefold(term, dterm, "UTF-8", true);
|
||||
stops.insert(dterm);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
bool StopList::setFile(const string &filename)
|
||||
{
|
||||
m_hasStops = false;
|
||||
m_stops.clear();
|
||||
string stoptext, reason;
|
||||
if (!file_to_string(filename, stoptext, &reason)) {
|
||||
LOGDEB(("StopList::StopList: file_to_string(%s) failed: %s\n",
|
||||
filename.c_str(), reason.c_str()));
|
||||
LOGDEB0(("StopList::StopList: file_to_string(%s) failed: %s\n",
|
||||
filename.c_str(), reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
TextSplitSW ts(TextSplit::TXTS_ONLYSPANS, m_stops);
|
||||
ts.text_to_words(stoptext);
|
||||
m_hasStops = !m_stops.empty();
|
||||
set<string> stops;
|
||||
stringToStrings(stoptext, stops);
|
||||
for (set<string>::iterator it = stops.begin();
|
||||
it != stops.end(); it++) {
|
||||
string dterm;
|
||||
unacmaybefold(*it, dterm, "UTF-8", true);
|
||||
m_stops.insert(dterm);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Most sites will have an empty stop list. We try to optimize the
|
||||
// empty set case as much as possible. empty() is probably slightly faster than
|
||||
// find() in this case.
|
||||
bool StopList::isStop(const string &term) const
|
||||
{
|
||||
return m_hasStops ? m_stops.find(term) != m_stops.end() : false;
|
||||
return m_stops.empty() ? false : m_stops.find(term) != m_stops.end();
|
||||
}
|
||||
|
||||
|
||||
@ -97,7 +90,7 @@ Usage(void)
|
||||
}
|
||||
|
||||
const string tstwords[] = {
|
||||
"the", "is", "xweird"
|
||||
"the", "is", "xweird", "autre", "autre double", "mot1", "mot double",
|
||||
};
|
||||
const int tstsz = sizeof(tstwords) / sizeof(string);
|
||||
|
||||
|
||||
@ -27,18 +27,24 @@ namespace Rcl
|
||||
{
|
||||
#endif
|
||||
|
||||
/**
|
||||
* A StopList is just a bunch of strings read from a file.
|
||||
*
|
||||
* Some of the strings may contain whitespace (that's for experimentation with
|
||||
* stop n-grams), so we take care of dquotes while reading the file. We also
|
||||
* lowercase and remove accents. The source file should be utf-8.
|
||||
*/
|
||||
class StopList {
|
||||
public:
|
||||
StopList() : m_hasStops(false) {}
|
||||
StopList() {}
|
||||
StopList(const string &filename) {setFile(filename);}
|
||||
virtual ~StopList() {}
|
||||
|
||||
bool setFile(const string &filename);
|
||||
bool isStop(const string &term) const;
|
||||
bool hasStops() const {return m_hasStops;}
|
||||
bool hasStops() const {return !m_stops.empty();}
|
||||
|
||||
private:
|
||||
bool m_hasStops;
|
||||
set<string> m_stops;
|
||||
};
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user