stoplist: use stringToStrings in place of splitter to support quoted space-containing entries
This commit is contained in:
parent
c25272a0d8
commit
e4eba0de97
@ -19,7 +19,7 @@
|
|||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
#include "unacpp.h"
|
#include "unacpp.h"
|
||||||
#include "textsplit.h"
|
#include "smallut.h"
|
||||||
#include "stoplist.h"
|
#include "stoplist.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
@ -27,40 +27,33 @@ namespace Rcl
|
|||||||
{
|
{
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
class TextSplitSW : public TextSplit {
|
|
||||||
public:
|
|
||||||
set<string>& stops;
|
|
||||||
TextSplitSW(Flags flags, set<string>& stps)
|
|
||||||
: TextSplit(flags), stops(stps)
|
|
||||||
{}
|
|
||||||
virtual bool takeword(const string& term, int, int, int)
|
|
||||||
{
|
|
||||||
string dterm;
|
|
||||||
unacmaybefold(term, dterm, "UTF-8", true);
|
|
||||||
stops.insert(dterm);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
bool StopList::setFile(const string &filename)
|
bool StopList::setFile(const string &filename)
|
||||||
{
|
{
|
||||||
m_hasStops = false;
|
|
||||||
m_stops.clear();
|
m_stops.clear();
|
||||||
string stoptext, reason;
|
string stoptext, reason;
|
||||||
if (!file_to_string(filename, stoptext, &reason)) {
|
if (!file_to_string(filename, stoptext, &reason)) {
|
||||||
LOGDEB(("StopList::StopList: file_to_string(%s) failed: %s\n",
|
LOGDEB0(("StopList::StopList: file_to_string(%s) failed: %s\n",
|
||||||
filename.c_str(), reason.c_str()));
|
filename.c_str(), reason.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
TextSplitSW ts(TextSplit::TXTS_ONLYSPANS, m_stops);
|
set<string> stops;
|
||||||
ts.text_to_words(stoptext);
|
stringToStrings(stoptext, stops);
|
||||||
m_hasStops = !m_stops.empty();
|
for (set<string>::iterator it = stops.begin();
|
||||||
|
it != stops.end(); it++) {
|
||||||
|
string dterm;
|
||||||
|
unacmaybefold(*it, dterm, "UTF-8", true);
|
||||||
|
m_stops.insert(dterm);
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Most sites will have an empty stop list. We try to optimize the
|
||||||
|
// empty set case as much as possible. empty() is probably slightly faster than
|
||||||
|
// find() in this case.
|
||||||
bool StopList::isStop(const string &term) const
|
bool StopList::isStop(const string &term) const
|
||||||
{
|
{
|
||||||
return m_hasStops ? m_stops.find(term) != m_stops.end() : false;
|
return m_stops.empty() ? false : m_stops.find(term) != m_stops.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -97,7 +90,7 @@ Usage(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
const string tstwords[] = {
|
const string tstwords[] = {
|
||||||
"the", "is", "xweird"
|
"the", "is", "xweird", "autre", "autre double", "mot1", "mot double",
|
||||||
};
|
};
|
||||||
const int tstsz = sizeof(tstwords) / sizeof(string);
|
const int tstsz = sizeof(tstwords) / sizeof(string);
|
||||||
|
|
||||||
|
|||||||
@ -27,18 +27,24 @@ namespace Rcl
|
|||||||
{
|
{
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A StopList is just a bunch of strings read from a file.
|
||||||
|
*
|
||||||
|
* Some of the strings may contain whitespace (that's for experimentation with
|
||||||
|
* stop n-grams), so we take care of dquotes while reading the file. We also
|
||||||
|
* lowercase and remove accents. The source file should be utf-8.
|
||||||
|
*/
|
||||||
class StopList {
|
class StopList {
|
||||||
public:
|
public:
|
||||||
StopList() : m_hasStops(false) {}
|
StopList() {}
|
||||||
StopList(const string &filename) {setFile(filename);}
|
StopList(const string &filename) {setFile(filename);}
|
||||||
virtual ~StopList() {}
|
virtual ~StopList() {}
|
||||||
|
|
||||||
bool setFile(const string &filename);
|
bool setFile(const string &filename);
|
||||||
bool isStop(const string &term) const;
|
bool isStop(const string &term) const;
|
||||||
bool hasStops() const {return m_hasStops;}
|
bool hasStops() const {return !m_stops.empty();}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool m_hasStops;
|
|
||||||
set<string> m_stops;
|
set<string> m_stops;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user