diff --git a/src/rcldb/stoplist.cpp b/src/rcldb/stoplist.cpp index 571b7080..dda7c3d9 100644 --- a/src/rcldb/stoplist.cpp +++ b/src/rcldb/stoplist.cpp @@ -19,7 +19,7 @@ #include "debuglog.h" #include "readfile.h" #include "unacpp.h" -#include "textsplit.h" +#include "smallut.h" #include "stoplist.h" #ifndef NO_NAMESPACES @@ -27,40 +27,33 @@ namespace Rcl { #endif -class TextSplitSW : public TextSplit { -public: - set& stops; - TextSplitSW(Flags flags, set& stps) - : TextSplit(flags), stops(stps) - {} - virtual bool takeword(const string& term, int, int, int) - { - string dterm; - unacmaybefold(term, dterm, "UTF-8", true); - stops.insert(dterm); - return true; - } -}; - bool StopList::setFile(const string &filename) { - m_hasStops = false; m_stops.clear(); string stoptext, reason; if (!file_to_string(filename, stoptext, &reason)) { - LOGDEB(("StopList::StopList: file_to_string(%s) failed: %s\n", - filename.c_str(), reason.c_str())); + LOGDEB0(("StopList::StopList: file_to_string(%s) failed: %s\n", + filename.c_str(), reason.c_str())); return false; } - TextSplitSW ts(TextSplit::TXTS_ONLYSPANS, m_stops); - ts.text_to_words(stoptext); - m_hasStops = !m_stops.empty(); + set stops; + stringToStrings(stoptext, stops); + for (set::iterator it = stops.begin(); + it != stops.end(); it++) { + string dterm; + unacmaybefold(*it, dterm, "UTF-8", true); + m_stops.insert(dterm); + } + return true; } +// Most sites will have an empty stop list. We try to optimize the +// empty set case as much as possible. empty() is probably slightly faster than +// find() in this case. bool StopList::isStop(const string &term) const { - return m_hasStops ? m_stops.find(term) != m_stops.end() : false; + return m_stops.empty() ? 
false : m_stops.find(term) != m_stops.end(); } @@ -97,7 +90,7 @@ Usage(void) } const string tstwords[] = { - "the", "is", "xweird" + "the", "is", "xweird", "autre", "autre double", "mot1", "mot double", }; const int tstsz = sizeof(tstwords) / sizeof(string); diff --git a/src/rcldb/stoplist.h b/src/rcldb/stoplist.h index de4dd0ea..c035e33b 100644 --- a/src/rcldb/stoplist.h +++ b/src/rcldb/stoplist.h @@ -27,18 +27,24 @@ namespace Rcl { #endif +/** + * A StopList is just a bunch of strings read from a file. + * + * Some of the strings may contain whitespace (that's for experimentation with + * stop n-grams), so we take care of double quotes while reading the file. We also + * lowercase and remove accents. The source file should be UTF-8. + */ class StopList { public: - StopList() : m_hasStops(false) {} + StopList() {} StopList(const string &filename) {setFile(filename);} virtual ~StopList() {} bool setFile(const string &filename); bool isStop(const string &term) const; - bool hasStops() const {return m_hasStops;} + bool hasStops() const {return !m_stops.empty();} private: - bool m_hasStops; set m_stops; };