minimal experimental stopword functionality

This commit is contained in:
dockes 2007-06-02 08:30:42 +00:00
parent fcb2762048
commit 0f1b917b7b
14 changed files with 259 additions and 35 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.43 2007-02-07 17:17:11 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.44 2007-06-02 08:30:41 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -461,6 +461,11 @@ string RclConfig::getDbDir()
return path_canon(dbdir); return path_canon(dbdir);
} }
// Name of the stop-word file: "stoplist.txt" joined onto the directory
// returned by getConfDir(), using path_cat().
string RclConfig::getStopfile()
{
    const string confdir = getConfDir();
    return path_cat(confdir, "stoplist.txt");
}
list<string> RclConfig::getSkippedNames() list<string> RclConfig::getSkippedNames()
{ {
list<string> skpl; list<string> skpl;

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _RCLCONFIG_H_INCLUDED_ #ifndef _RCLCONFIG_H_INCLUDED_
#define _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_
/* @(#$Id: rclconfig.h,v 1.31 2007-02-02 10:12:58 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: rclconfig.h,v 1.32 2007-06-02 08:30:41 dockes Exp $ (C) 2004 J.F.Dockes */
#include <list> #include <list>
#include <string> #include <string>
@ -78,8 +78,10 @@ class RclConfig {
/** Get database directory */ /** Get database directory */
string getDbDir(); string getDbDir();
/** Get stoplist file name */
string getStopfile();
/** Get list of skipped names for current keydir */ /** Get list of skipped file names for current keydir */
list<string> getSkippedNames(); list<string> getSkippedNames();
/** Get list of skipped paths patterns. Doesn't depend on the keydir */ /** Get list of skipped paths patterns. Doesn't depend on the keydir */

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.56 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: indexer.cpp,v 1.57 2007-06-02 08:30:41 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -181,7 +181,7 @@ bool DbIndexer::init(bool resetbefore, bool rdonly)
} }
Rcl::Db::OpenMode mode = rdonly ? Rcl::Db::DbRO : Rcl::Db::OpenMode mode = rdonly ? Rcl::Db::DbRO :
resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd; resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
if (!m_db.open(m_dbdir, mode)) { if (!m_db.open(m_dbdir, m_config->getStopfile(), mode)) {
LOGERR(("DbIndexer: error opening database in %s\n", m_dbdir.c_str())); LOGERR(("DbIndexer: error opening database in %s\n", m_dbdir.c_str()));
return false; return false;
} }

View File

@ -8,8 +8,8 @@ LIBS = librcl.a
all: $(LIBS) all: $(LIBS)
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
librcl.a : $(DEPS) $(OBJS) unac.o librcl.a : $(DEPS) $(OBJS) unac.o
ar ru librcl.a $(OBJS) unac.o ar ru librcl.a $(OBJS) unac.o
@ -73,6 +73,8 @@ searchdata.o : ../rcldb/searchdata.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/searchdata.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/searchdata.cpp
stemdb.o : ../rcldb/stemdb.cpp stemdb.o : ../rcldb/stemdb.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/stemdb.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/stemdb.cpp
stoplist.o : ../rcldb/stoplist.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/stoplist.cpp
base64.o : ../utils/base64.cpp base64.o : ../utils/base64.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../utils/base64.cpp $(CXX) $(ALL_CXXFLAGS) -c ../utils/base64.cpp
conftree.o : ../utils/conftree.cpp conftree.o : ../utils/conftree.cpp
@ -193,6 +195,9 @@ searchdata.dep.stamp : ../rcldb/searchdata.cpp
stemdb.dep.stamp : ../rcldb/stemdb.cpp stemdb.dep.stamp : ../rcldb/stemdb.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/stemdb.cpp > stemdb.dep $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/stemdb.cpp > stemdb.dep
touch stemdb.dep.stamp touch stemdb.dep.stamp
stoplist.dep.stamp : ../rcldb/stoplist.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/stoplist.cpp > stoplist.dep
touch stoplist.dep.stamp
base64.dep.stamp : ../utils/base64.cpp base64.dep.stamp : ../utils/base64.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../utils/base64.cpp > base64.dep $(CXX) -M $(ALL_CXXFLAGS) ../utils/base64.cpp > base64.dep
touch base64.dep.stamp touch base64.dep.stamp
@ -266,6 +271,7 @@ include pathhash.dep
include rcldb.dep include rcldb.dep
include searchdata.dep include searchdata.dep
include stemdb.dep include stemdb.dep
include stoplist.dep
include base64.dep include base64.dep
include conftree.dep include conftree.dep
include copyfile.dep include copyfile.dep

View File

@ -32,6 +32,7 @@ ${depth}/rcldb/pathhash.cpp \
${depth}/rcldb/rcldb.cpp \ ${depth}/rcldb/rcldb.cpp \
${depth}/rcldb/searchdata.cpp \ ${depth}/rcldb/searchdata.cpp \
${depth}/rcldb/stemdb.cpp \ ${depth}/rcldb/stemdb.cpp \
${depth}/rcldb/stoplist.cpp \
${depth}/utils/base64.cpp \ ${depth}/utils/base64.cpp \
${depth}/utils/conftree.cpp \ ${depth}/utils/conftree.cpp \
${depth}/utils/copyfile.cpp \ ${depth}/utils/copyfile.cpp \

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: main.cpp,v 1.59 2007-05-21 13:30:21 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: main.cpp,v 1.60 2007-06-02 08:30:41 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -102,7 +102,8 @@ bool maybeOpenDb(string &reason, bool force)
LOGDEB(("main: adding [%s]\n", it->c_str())); LOGDEB(("main: adding [%s]\n", it->c_str()));
rcldb->addQueryDb(*it); rcldb->addQueryDb(*it);
} }
if (!rcldb->isopen() && !rcldb->open(dbdir, Rcl::Db::DbRO, qopts)) { if (!rcldb->isopen() && !rcldb->open(dbdir, rclconfig->getStopfile(),
Rcl::Db::DbRO, qopts)) {
reason = "Could not open database in " + reason = "Could not open database in " +
dbdir + " wait for indexing to complete?"; dbdir + " wait for indexing to complete?";
return false; return false;

24
src/rcldb/Makefile Normal file
View File

@ -0,0 +1,24 @@
# @(#$Id: Makefile,v 1.1 2007-06-02 08:30:42 dockes Exp $ (C) 2005 J.F.Dockes
depth = ..
include $(depth)/mk/sysconf
# Only test executables get built in here
PROGS = stoplist
all: $(BIGLIB) $(PROGS)
# $(BIGLIB) depends on "force", a target with no prerequisites and no recipe,
# so the sub-make in $(depth)/lib is run on every invocation.
$(BIGLIB): force
cd $(depth)/lib;$(MAKE)
force:
# Test driver for the stop list: trstoplist.o is stoplist.cpp compiled with
# -DTEST_STOPLIST, which selects the main() section of that file. It is
# linked against $(BIGLIB) (presumably the library holding the regular
# StopList code -- confirm against mk/sysconf).
STOPLIST_OBJS= trstoplist.o $(BIGLIB)
stoplist : $(STOPLIST_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o stoplist $(STOPLIST_OBJS) \
$(LIBICONV) $(LIBSYS)
trstoplist.o : stoplist.cpp
$(CXX) $(ALL_CXXFLAGS) -DTEST_STOPLIST -c -o trstoplist.o \
stoplist.cpp
# Remove object files and the test executables
clean:
rm -f *.o $(PROGS)

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.110 2007-05-30 12:30:38 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.111 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -528,7 +528,7 @@ Db::~Db()
LOGERR(("Db::~Db: got exception: %s\n", ermsg)); LOGERR(("Db::~Db: got exception: %s\n", ermsg));
} }
bool Db::open(const string& dir, OpenMode mode, int qops) bool Db::open(const string& dir, const string &stops, OpenMode mode, int qops)
{ {
bool keep_updated = (qops & QO_KEEP_UPDATED) != 0; bool keep_updated = (qops & QO_KEEP_UPDATED) != 0;
qops &= ~QO_KEEP_UPDATED; qops &= ~QO_KEEP_UPDATED;
@ -543,6 +543,9 @@ bool Db::open(const string& dir, OpenMode mode, int qops)
if (!close()) if (!close())
return false; return false;
} }
if (!stops.empty())
m_stops.setFile(stops);
const char *ermsg = "Unknown"; const char *ermsg = "Unknown";
try { try {
switch (mode) { switch (mode) {
@ -652,7 +655,7 @@ bool Db::reOpen()
if (m_ndb && m_ndb->m_isopen) { if (m_ndb && m_ndb->m_isopen) {
if (!close()) if (!close())
return false; return false;
if (!open(m_basedir, m_mode, m_qOpts | QO_KEEP_UPDATED)) { if (!open(m_basedir, "", m_mode, m_qOpts | QO_KEEP_UPDATED)) {
return false; return false;
} }
} }
@ -737,8 +740,9 @@ class mySplitterCB : public TextSplitCB {
Xapian::termpos basepos; // Base for document section Xapian::termpos basepos; // Base for document section
Xapian::termpos curpos; // Current position. Used to set basepos for the Xapian::termpos curpos; // Current position. Used to set basepos for the
// following section // following section
mySplitterCB(Xapian::Document &d) StopList &stops;
: doc(d), basepos(1), curpos(0) mySplitterCB(Xapian::Document &d, StopList &_stops)
: doc(d), basepos(1), curpos(0), stops(_stops)
{} {}
bool takeword(const std::string &term, int pos, int, int); bool takeword(const std::string &term, int pos, int, int);
void setprefix(const string& pref) {prefix = pref;} void setprefix(const string& pref) {prefix = pref;}
@ -762,6 +766,10 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
const char *ermsg; const char *ermsg;
try { try {
if (stops.hasStops() && stops.isStop(term)) {
LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
return true;
}
// Note: 1 is the within document frequency increment. It would // Note: 1 is the within document frequency increment. It would
// be possible to assign different weights to doc parts (ie title) // be possible to assign different weights to doc parts (ie title)
// by using a higher value // by using a higher value
@ -849,7 +857,7 @@ bool Db::add(const string &fn, const Doc &idoc,
Xapian::Document newdocument; Xapian::Document newdocument;
mySplitterCB splitData(newdocument); mySplitterCB splitData(newdocument, m_stops);
TextSplit splitter(&splitData); TextSplit splitter(&splitData);

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _DB_H_INCLUDED_ #ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_ #define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.47 2007-05-22 07:40:00 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: rcldb.h,v 1.48 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
@ -24,6 +24,7 @@
#include "refcntr.h" #include "refcntr.h"
#include "rcldoc.h" #include "rcldoc.h"
#include "stoplist.h"
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
using std::string; using std::string;
@ -77,7 +78,8 @@ class Db {
// KEEP_UPDATED is internal use by reOpen() only // KEEP_UPDATED is internal use by reOpen() only
enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_KEEP_UPDATED = 8}; enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_KEEP_UPDATED = 8};
bool open(const string &dbdir, OpenMode mode, int qops = QO_NONE); bool open(const string &dbdir, const string &stoplistfn,
OpenMode mode, int qops = QO_NONE);
bool close(); bool close();
bool isopen(); bool isopen();
@ -172,11 +174,13 @@ class Db {
/** Filename wildcard expansion */ /** Filename wildcard expansion */
bool filenameWildExp(const string& exp, list<string>& names); bool filenameWildExp(const string& exp, list<string>& names);
string getReason(){return m_reason;} string getReason() const {return m_reason;}
/** Adjust flush threshold */ /** Adjust flush threshold */
void setFlushMb(int mb) {m_flushmb = mb;} void setFlushMb(int mb) {m_flushmb = mb;}
const StopList& getStopList() const {return m_stops;}
private: private:
string m_filterTopDir; // Current query filter on subtree top directory string m_filterTopDir; // Current query filter on subtree top directory
@ -217,6 +221,8 @@ private:
vector<bool> updated; vector<bool> updated;
StopList m_stops;
bool reOpen(); // Close/open, same mode/opts bool reOpen(); // Close/open, same mode/opts
bool stemExpand(const string &lang, const string &s, bool stemExpand(const string &lang, const string &s,
list<TermMatchEntry>& result, int max = -1); list<TermMatchEntry>& result, int max = -1);

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.13 2007-02-13 10:58:31 dockes Exp $ (C) 2006 J.F.Dockes"; static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.14 2007-06-02 08:30:42 dockes Exp $ (C) 2006 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -32,6 +32,7 @@ static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.13 2007-02-13 10:58:31 dockes
#include "textsplit.h" #include "textsplit.h"
#include "unacpp.h" #include "unacpp.h"
#include "utf8iter.h" #include "utf8iter.h"
#include "stoplist.h"
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
using namespace std; using namespace std;
@ -136,12 +137,23 @@ bool SearchData::getTerms(vector<string>& terms,
// terms and phrases. // terms and phrases.
class wsQData : public TextSplitCB { class wsQData : public TextSplitCB {
public: public:
wsQData(const StopList &_stops)
: stops(_stops), alltermcount(0)
{}
vector<string> terms; vector<string> terms;
bool takeword(const std::string &term, int , int, int) { bool takeword(const std::string &term, int , int, int) {
alltermcount++;
LOGDEB1(("wsQData::takeword: %s\n", term.c_str())); LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
if (stops.hasStops() && stops.isStop(term)) {
LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
return true;
}
terms.push_back(term); terms.push_back(term);
return true; return true;
} }
const StopList &stops;
int alltermcount; // Count of terms including stopwords: this is
// for adjusting phrase/near slack
}; };
/** /**
@ -158,10 +170,11 @@ public:
{ } { }
bool processUserString(const string &iq, bool processUserString(const string &iq,
const string &prefix, const string &prefix,
string &ermsg, string &ermsg,
list<Xapian::Query> &pqueries, list<Xapian::Query> &pqueries,
int slack = 0, bool useNear = false); const StopList &stops,
int slack = 0, bool useNear = false);
bool getTerms(vector<string>& terms, bool getTerms(vector<string>& terms,
vector<vector<string> >& groups) vector<vector<string> >& groups)
@ -313,7 +326,9 @@ bool StringToXapianQ::processUserString(const string &iq,
const string &prefix, const string &prefix,
string &ermsg, string &ermsg,
list<Xapian::Query> &pqueries, list<Xapian::Query> &pqueries,
int slack, bool useNear) const StopList& stops,
int slack, bool useNear
)
{ {
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str())); LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
ermsg.erase(); ermsg.erase();
@ -339,7 +354,7 @@ bool StringToXapianQ::processUserString(const string &iq,
// we need to use a word split, else a phrase query including // we need to use a word split, else a phrase query including
// a span would fail if we didn't adjust the proximity to // a span would fail if we didn't adjust the proximity to
// account for the additional span term which is complicated. // account for the additional span term which is complicated.
wsQData splitDataS, splitDataW; wsQData splitDataS(stops), splitDataW(stops);
TextSplit splitterS(&splitDataS, (TextSplit::Flags) TextSplit splitterS(&splitDataS, (TextSplit::Flags)
(TextSplit::TXTS_ONLYSPANS | (TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD)); TextSplit::TXTS_KEEPWILD));
@ -418,7 +433,8 @@ bool StringToXapianQ::processUserString(const string &iq,
pqueries.push_back(Xapian::Query(op, pqueries.push_back(Xapian::Query(op,
orqueries.begin(), orqueries.begin(),
orqueries.end(), orqueries.end(),
splitData->terms.size() + slack)); splitData->alltermcount
+ slack));
// Add NEAR/PHRASE groups to the highlighting data. Must // Add NEAR/PHRASE groups to the highlighting data. Must
// push all combinations // push all combinations
vector<vector<string> > allcombs; vector<vector<string> > allcombs;
@ -508,7 +524,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
(m_parentSearch == 0 && !m_haveWildCards); (m_parentSearch == 0 && !m_haveWildCards);
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm); StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(m_text, prefix, m_reason, pqueries)) if (!tr.processUserString(m_text, prefix, m_reason, pqueries,
db.getStopList()))
return false; return false;
if (pqueries.empty()) { if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n")); LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
@ -570,7 +587,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
string s = string("\"") + m_text + string("\""); string s = string("\"") + m_text + string("\"");
bool useNear = (m_tp == SCLT_NEAR); bool useNear = (m_tp == SCLT_NEAR);
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm); StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(s, prefix, m_reason, pqueries, m_slack, useNear)) if (!tr.processUserString(s, prefix, m_reason, pqueries, db.getStopList(),
m_slack, useNear))
return false; return false;
if (pqueries.empty()) { if (pqueries.empty()) {
LOGERR(("SearchDataClauseDist: resolved to null query\n")); LOGERR(("SearchDataClauseDist: resolved to null query\n"));

104
src/rcldb/stoplist.cpp Normal file
View File

@ -0,0 +1,104 @@
#ifndef lint
static char rcsid[] = "@(#$Id: stoplist.cpp,v 1.1 2007-06-02 08:30:42 dockes Exp $ (C) 2007 J.F.Dockes";
#endif
#ifndef TEST_STOPLIST
#include "debuglog.h"
#include "readfile.h"
#include "unacpp.h"
#include "stoplist.h"
#ifndef NO_NAMESPACES
namespace Rcl
{
#endif
bool StopList::setFile(const string &filename)
{
m_hasStops = false;
m_stops.clear();
string stoptext, reason;
if (!file_to_string(filename, stoptext, &reason)) {
LOGDEB(("StopList::StopList: file_to_string(%s) failed: %s\n",
filename.c_str(), reason.c_str()));
return false;
}
TextSplit ts(this, TextSplit::TXTS_ONLYSPANS);
ts.text_to_words(stoptext);
return true;
}
bool StopList::takeword(const string& term, int, int, int)
{
string dterm;
unacmaybefold(term, dterm, "UTF-8", true);
LOGDEB2(("StopList::takeword: inserting [%s]\n", dterm.c_str()));
m_hasStops = true;
m_stops.insert(dterm);
return true;
}
/**
 * Tell whether a term belongs to the stop list.
 *
 * The term is looked up exactly as given; no transformation is applied here.
 *
 * @return true if the term is in the set, false if it is not or if no
 *         stop word was ever loaded
 */
bool StopList::isStop(const string &term) const
{
    if (!m_hasStops)
        return false;
    return m_stops.find(term) != m_stops.end();
}
#ifndef NO_NAMESPACES
}
#endif
#else // TEST_STOPLIST
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <string>
#include <iostream>
#include "stoplist.h"
using namespace std;
using namespace Rcl;
// Program name as invoked; set at the top of main()
static char *thisprog;

// Command-line synopsis printed by Usage()
static char usage[] = "trstoplist stopstermsfile\n\n";

// Print the program name and synopsis on stderr, then exit with status 1.
static void Usage(void)
{
    fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
    exit(1);
}
// Fixed probe words for the test driver
const string tstwords[] = {"the", "is", "xweird"};
// Number of entries in tstwords
const int tstsz = sizeof(tstwords) / sizeof(tstwords[0]);
/*
 * Test driver for StopList.
 *
 * Expects exactly one argument, the stop-word file name; any other argument
 * count goes through Usage(), which exits with status 1. The file is loaded
 * into a StopList and each entry of tstwords is reported on stdout as being
 * in or not in the list.
 */
int main(int argc, char **argv)
{
    thisprog = argv[0];
    argc--; argv++;

    if (argc != 1)
        Usage();
    string filename = argv[0]; argc--;

    // If the file cannot be read the list stays empty and every probe word
    // is reported as "not in stop list".
    StopList sl(filename);

    for (int i = 0; i < tstsz; i++) {
        const string &tst = tstwords[i];
        cout << "[" << tst << "] " <<
            (sl.isStop(tst) ? "in stop list" : "not in stop list") << endl;
    }
    exit(0);
}
#endif // TEST_STOPLIST

37
src/rcldb/stoplist.h Normal file
View File

@ -0,0 +1,37 @@
#ifndef _STOPLIST_H_INCLUDED_
#define _STOPLIST_H_INCLUDED_
/* @(#$Id: stoplist.h,v 1.1 2007-06-02 08:30:42 dockes Exp $ (C) 2006 J.F.Dockes */
#include <set>
#include <string>
#include "textsplit.h"
#ifndef NO_NAMESPACES
using std::set;
using std::string;
namespace Rcl
{
#endif
/**
 * A set of stop words loaded from a text file.
 *
 * The class is its own TextSplitCB: setFile() hands `this` to a TextSplit,
 * and takeword() stores each term it receives.
 */
class StopList : public TextSplitCB {
public:
    /** Build an empty list: hasStops() returns false. */
    StopList() : m_hasStops(false) {}

    /**
     * Build a list from the contents of filename, see setFile().
     * m_hasStops is initialised here so that the flag has a defined value
     * regardless of what setFile() does first.
     */
    StopList(const string &filename) : m_hasStops(false) {setFile(filename);}

    virtual ~StopList() {}

    /**
     * Drop the current contents and load the list from filename.
     * @return false if the file could not be read, true otherwise
     */
    bool setFile(const string &filename);

    /**
     * @return true if term is in the list. The term is compared as given;
     *         always false while hasStops() is false.
     */
    bool isStop(const string &term) const;

    /** @return true once at least one term has been stored by takeword() */
    bool hasStops() const {return m_hasStops;}

    /**
     * TextSplitCB callback used while loading: stores the term after passing
     * it through unacmaybefold(). The position arguments are not used.
     * @return always true
     */
    virtual bool takeword(const string& term, int pos, int bts, int bte);

private:
    bool m_hasStops;     // Set by takeword(), reset by setFile()
    set<string> m_stops; // The stored terms
};
#ifndef NO_NAMESPACES
}
#endif
#endif /* _STOPLIST_H_INCLUDED_ */

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: readfile.cpp,v 1.3 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: readfile.cpp,v 1.4 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -32,20 +32,28 @@ using std::string;
#include "readfile.h" #include "readfile.h"
bool file_to_string(const string &fn, string &data) bool file_to_string(const string &fn, string &data, string *reason)
{ {
#define ERRBUFSZ 200
char errbuf[ERRBUFSZ];
bool ret = false; bool ret = false;
int fd = open(fn.c_str(), O_RDONLY|O_STREAMING); int fd = open(fn.c_str(), O_RDONLY|O_STREAMING);
if (fd < 0) { if (fd < 0) {
// perror("open"); if (reason) {
strerror_r(errno, errbuf, ERRBUFSZ);
*reason += string("file_to_string: open failed: ") + errbuf;
}
return false; return false;
} }
char buf[4096]; char buf[4096];
for (;;) { for (;;) {
int n = read(fd, buf, 4096); int n = read(fd, buf, 4096);
if (n < 0) { if (n < 0) {
// perror("read"); if (reason) {
strerror_r(errno, errbuf, ERRBUFSZ);
*reason += string("file_to_string: read failed: ") + errbuf;
}
goto out; goto out;
} }
if (n == 0) if (n == 0)
@ -54,7 +62,10 @@ bool file_to_string(const string &fn, string &data)
try { try {
data.append(buf, n); data.append(buf, n);
} catch (...) { } catch (...) {
// fprintf(stderr, "file_to_string: out of memory\n"); if (reason) {
strerror_r(errno, errbuf, ERRBUFSZ);
*reason += string("file_to_string: out of memory? : ") +errbuf;
}
goto out; goto out;
} }
} }

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _READFILE_H_INCLUDED_ #ifndef _READFILE_H_INCLUDED_
#define _READFILE_H_INCLUDED_ #define _READFILE_H_INCLUDED_
/* @(#$Id: readfile.h,v 1.2 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: readfile.h,v 1.3 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
@ -24,6 +24,7 @@
* Read whole file into string. * Read whole file into string.
* @return true for ok, false else * @return true for ok, false else
*/ */
bool file_to_string(const std::string &filename, std::string &data); bool file_to_string(const std::string &filename, std::string &data,
std::string *reason = 0);
#endif /* _READFILE_H_INCLUDED_ */ #endif /* _READFILE_H_INCLUDED_ */