minimal experimental stopword functionality
This commit is contained in:
parent
fcb2762048
commit
0f1b917b7b
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.43 2007-02-07 17:17:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.44 2007-06-02 08:30:41 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -461,6 +461,11 @@ string RclConfig::getDbDir()
|
||||
return path_canon(dbdir);
|
||||
}
|
||||
|
||||
// Return the path of the stop word list file: the fixed name
// "stoplist.txt" joined to the configuration directory with path_cat().
string RclConfig::getStopfile()
{
    string confdir = getConfDir();
    return path_cat(confdir, "stoplist.txt");
}
|
||||
|
||||
list<string> RclConfig::getSkippedNames()
|
||||
{
|
||||
list<string> skpl;
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _RCLCONFIG_H_INCLUDED_
|
||||
#define _RCLCONFIG_H_INCLUDED_
|
||||
/* @(#$Id: rclconfig.h,v 1.31 2007-02-02 10:12:58 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: rclconfig.h,v 1.32 2007-06-02 08:30:41 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <list>
|
||||
#include <string>
|
||||
@ -78,8 +78,10 @@ class RclConfig {
|
||||
|
||||
/** Get database directory */
|
||||
string getDbDir();
|
||||
/** Get stoplist file name */
|
||||
string getStopfile();
|
||||
|
||||
/** Get list of skipped names for current keydir */
|
||||
/** Get list of skipped file names for current keydir */
|
||||
list<string> getSkippedNames();
|
||||
|
||||
/** Get list of skipped paths patterns. Doesn't depend on the keydir */
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.56 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.57 2007-06-02 08:30:41 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -181,7 +181,7 @@ bool DbIndexer::init(bool resetbefore, bool rdonly)
|
||||
}
|
||||
Rcl::Db::OpenMode mode = rdonly ? Rcl::Db::DbRO :
|
||||
resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
|
||||
if (!m_db.open(m_dbdir, mode)) {
|
||||
if (!m_db.open(m_dbdir, m_config->getStopfile(), mode)) {
|
||||
LOGERR(("DbIndexer: error opening database in %s\n", m_dbdir.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -8,8 +8,8 @@ LIBS = librcl.a
|
||||
|
||||
all: $(LIBS)
|
||||
|
||||
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
|
||||
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
|
||||
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
|
||||
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
|
||||
|
||||
librcl.a : $(DEPS) $(OBJS) unac.o
|
||||
ar ru librcl.a $(OBJS) unac.o
|
||||
@ -73,6 +73,8 @@ searchdata.o : ../rcldb/searchdata.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/searchdata.cpp
|
||||
stemdb.o : ../rcldb/stemdb.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/stemdb.cpp
|
||||
stoplist.o : ../rcldb/stoplist.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/stoplist.cpp
|
||||
base64.o : ../utils/base64.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../utils/base64.cpp
|
||||
conftree.o : ../utils/conftree.cpp
|
||||
@ -193,6 +195,9 @@ searchdata.dep.stamp : ../rcldb/searchdata.cpp
|
||||
stemdb.dep.stamp : ../rcldb/stemdb.cpp
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/stemdb.cpp > stemdb.dep
|
||||
touch stemdb.dep.stamp
|
||||
stoplist.dep.stamp : ../rcldb/stoplist.cpp
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/stoplist.cpp > stoplist.dep
|
||||
touch stoplist.dep.stamp
|
||||
base64.dep.stamp : ../utils/base64.cpp
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../utils/base64.cpp > base64.dep
|
||||
touch base64.dep.stamp
|
||||
@ -266,6 +271,7 @@ include pathhash.dep
|
||||
include rcldb.dep
|
||||
include searchdata.dep
|
||||
include stemdb.dep
|
||||
include stoplist.dep
|
||||
include base64.dep
|
||||
include conftree.dep
|
||||
include copyfile.dep
|
||||
|
||||
@ -32,6 +32,7 @@ ${depth}/rcldb/pathhash.cpp \
|
||||
${depth}/rcldb/rcldb.cpp \
|
||||
${depth}/rcldb/searchdata.cpp \
|
||||
${depth}/rcldb/stemdb.cpp \
|
||||
${depth}/rcldb/stoplist.cpp \
|
||||
${depth}/utils/base64.cpp \
|
||||
${depth}/utils/conftree.cpp \
|
||||
${depth}/utils/copyfile.cpp \
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: main.cpp,v 1.59 2007-05-21 13:30:21 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: main.cpp,v 1.60 2007-06-02 08:30:41 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -102,7 +102,8 @@ bool maybeOpenDb(string &reason, bool force)
|
||||
LOGDEB(("main: adding [%s]\n", it->c_str()));
|
||||
rcldb->addQueryDb(*it);
|
||||
}
|
||||
if (!rcldb->isopen() && !rcldb->open(dbdir, Rcl::Db::DbRO, qopts)) {
|
||||
if (!rcldb->isopen() && !rcldb->open(dbdir, rclconfig->getStopfile(),
|
||||
Rcl::Db::DbRO, qopts)) {
|
||||
reason = "Could not open database in " +
|
||||
dbdir + " wait for indexing to complete?";
|
||||
return false;
|
||||
|
||||
24
src/rcldb/Makefile
Normal file
24
src/rcldb/Makefile
Normal file
@ -0,0 +1,24 @@
|
||||
# @(#$Id: Makefile,v 1.1 2007-06-02 08:30:42 dockes Exp $ (C) 2005 J.F.Dockes
|
||||
depth = ..
|
||||
include $(depth)/mk/sysconf
|
||||
|
||||
# Only test executables get built in here
|
||||
PROGS = stoplist
|
||||
|
||||
all: $(BIGLIB) $(PROGS)
|
||||
|
||||
$(BIGLIB): force
|
||||
cd $(depth)/lib;$(MAKE)
|
||||
force:
|
||||
|
||||
STOPLIST_OBJS= trstoplist.o $(BIGLIB)
|
||||
stoplist : $(STOPLIST_OBJS)
|
||||
$(CXX) $(ALL_CXXFLAGS) -o stoplist $(STOPLIST_OBJS) \
|
||||
$(LIBICONV) $(LIBSYS)
|
||||
trstoplist.o : stoplist.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -DTEST_STOPLIST -c -o trstoplist.o \
|
||||
stoplist.cpp
|
||||
|
||||
clean:
|
||||
rm -f *.o $(PROGS)
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.110 2007-05-30 12:30:38 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.111 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -528,7 +528,7 @@ Db::~Db()
|
||||
LOGERR(("Db::~Db: got exception: %s\n", ermsg));
|
||||
}
|
||||
|
||||
bool Db::open(const string& dir, OpenMode mode, int qops)
|
||||
bool Db::open(const string& dir, const string &stops, OpenMode mode, int qops)
|
||||
{
|
||||
bool keep_updated = (qops & QO_KEEP_UPDATED) != 0;
|
||||
qops &= ~QO_KEEP_UPDATED;
|
||||
@ -543,6 +543,9 @@ bool Db::open(const string& dir, OpenMode mode, int qops)
|
||||
if (!close())
|
||||
return false;
|
||||
}
|
||||
if (!stops.empty())
|
||||
m_stops.setFile(stops);
|
||||
|
||||
const char *ermsg = "Unknown";
|
||||
try {
|
||||
switch (mode) {
|
||||
@ -652,7 +655,7 @@ bool Db::reOpen()
|
||||
if (m_ndb && m_ndb->m_isopen) {
|
||||
if (!close())
|
||||
return false;
|
||||
if (!open(m_basedir, m_mode, m_qOpts | QO_KEEP_UPDATED)) {
|
||||
if (!open(m_basedir, "", m_mode, m_qOpts | QO_KEEP_UPDATED)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -737,8 +740,9 @@ class mySplitterCB : public TextSplitCB {
|
||||
Xapian::termpos basepos; // Base for document section
|
||||
Xapian::termpos curpos; // Current position. Used to set basepos for the
|
||||
// following section
|
||||
mySplitterCB(Xapian::Document &d)
|
||||
: doc(d), basepos(1), curpos(0)
|
||||
StopList &stops;
|
||||
mySplitterCB(Xapian::Document &d, StopList &_stops)
|
||||
: doc(d), basepos(1), curpos(0), stops(_stops)
|
||||
{}
|
||||
bool takeword(const std::string &term, int pos, int, int);
|
||||
void setprefix(const string& pref) {prefix = pref;}
|
||||
@ -762,6 +766,10 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
|
||||
|
||||
const char *ermsg;
|
||||
try {
|
||||
if (stops.hasStops() && stops.isStop(term)) {
|
||||
LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
|
||||
return true;
|
||||
}
|
||||
// Note: 1 is the within document frequency increment. It would
|
||||
// be possible to assign different weigths to doc parts (ie title)
|
||||
// by using a higher value
|
||||
@ -849,7 +857,7 @@ bool Db::add(const string &fn, const Doc &idoc,
|
||||
|
||||
Xapian::Document newdocument;
|
||||
|
||||
mySplitterCB splitData(newdocument);
|
||||
mySplitterCB splitData(newdocument, m_stops);
|
||||
|
||||
TextSplit splitter(&splitData);
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _DB_H_INCLUDED_
|
||||
#define _DB_H_INCLUDED_
|
||||
/* @(#$Id: rcldb.h,v 1.47 2007-05-22 07:40:00 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: rcldb.h,v 1.48 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
@ -24,6 +24,7 @@
|
||||
|
||||
#include "refcntr.h"
|
||||
#include "rcldoc.h"
|
||||
#include "stoplist.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
@ -77,7 +78,8 @@ class Db {
|
||||
// KEEP_UPDATED is internal use by reOpen() only
|
||||
enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_KEEP_UPDATED = 8};
|
||||
|
||||
bool open(const string &dbdir, OpenMode mode, int qops = QO_NONE);
|
||||
bool open(const string &dbdir, const string &stoplistfn,
|
||||
OpenMode mode, int qops = QO_NONE);
|
||||
bool close();
|
||||
bool isopen();
|
||||
|
||||
@ -172,11 +174,13 @@ class Db {
|
||||
|
||||
/** Filename wildcard expansion */
|
||||
bool filenameWildExp(const string& exp, list<string>& names);
|
||||
string getReason(){return m_reason;}
|
||||
string getReason() const {return m_reason;}
|
||||
|
||||
/** Adjust flush threshold */
|
||||
void setFlushMb(int mb) {m_flushmb = mb;}
|
||||
|
||||
const StopList& getStopList() const {return m_stops;}
|
||||
|
||||
private:
|
||||
|
||||
string m_filterTopDir; // Current query filter on subtree top directory
|
||||
@ -217,6 +221,8 @@ private:
|
||||
|
||||
vector<bool> updated;
|
||||
|
||||
StopList m_stops;
|
||||
|
||||
bool reOpen(); // Close/open, same mode/opts
|
||||
bool stemExpand(const string &lang, const string &s,
|
||||
list<TermMatchEntry>& result, int max = -1);
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.13 2007-02-13 10:58:31 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.14 2007-06-02 08:30:42 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -32,6 +32,7 @@ static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.13 2007-02-13 10:58:31 dockes
|
||||
#include "textsplit.h"
|
||||
#include "unacpp.h"
|
||||
#include "utf8iter.h"
|
||||
#include "stoplist.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
@ -136,12 +137,23 @@ bool SearchData::getTerms(vector<string>& terms,
|
||||
// terms and phrases.
|
||||
class wsQData : public TextSplitCB {
|
||||
public:
|
||||
wsQData(const StopList &_stops)
|
||||
: stops(_stops), alltermcount(0)
|
||||
{}
|
||||
vector<string> terms;
|
||||
bool takeword(const std::string &term, int , int, int) {
|
||||
alltermcount++;
|
||||
LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
|
||||
if (stops.hasStops() && stops.isStop(term)) {
|
||||
LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
|
||||
return true;
|
||||
}
|
||||
terms.push_back(term);
|
||||
return true;
|
||||
}
|
||||
const StopList &stops;
|
||||
int alltermcount; // Count of terms including stopwords: this is
|
||||
// for adjusting phrase/near slack
|
||||
};
|
||||
|
||||
/**
|
||||
@ -158,10 +170,11 @@ public:
|
||||
{ }
|
||||
|
||||
bool processUserString(const string &iq,
|
||||
const string &prefix,
|
||||
string &ermsg,
|
||||
list<Xapian::Query> &pqueries,
|
||||
int slack = 0, bool useNear = false);
|
||||
const string &prefix,
|
||||
string &ermsg,
|
||||
list<Xapian::Query> &pqueries,
|
||||
const StopList &stops,
|
||||
int slack = 0, bool useNear = false);
|
||||
|
||||
bool getTerms(vector<string>& terms,
|
||||
vector<vector<string> >& groups)
|
||||
@ -313,7 +326,9 @@ bool StringToXapianQ::processUserString(const string &iq,
|
||||
const string &prefix,
|
||||
string &ermsg,
|
||||
list<Xapian::Query> &pqueries,
|
||||
int slack, bool useNear)
|
||||
const StopList& stops,
|
||||
int slack, bool useNear
|
||||
)
|
||||
{
|
||||
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
||||
ermsg.erase();
|
||||
@ -339,7 +354,7 @@ bool StringToXapianQ::processUserString(const string &iq,
|
||||
// we need to use a word split, else a phrase query including
|
||||
// a span would fail if we didn't adjust the proximity to
|
||||
// account for the additional span term which is complicated.
|
||||
wsQData splitDataS, splitDataW;
|
||||
wsQData splitDataS(stops), splitDataW(stops);
|
||||
TextSplit splitterS(&splitDataS, (TextSplit::Flags)
|
||||
(TextSplit::TXTS_ONLYSPANS |
|
||||
TextSplit::TXTS_KEEPWILD));
|
||||
@ -418,7 +433,8 @@ bool StringToXapianQ::processUserString(const string &iq,
|
||||
pqueries.push_back(Xapian::Query(op,
|
||||
orqueries.begin(),
|
||||
orqueries.end(),
|
||||
splitData->terms.size() + slack));
|
||||
splitData->alltermcount
|
||||
+ slack));
|
||||
// Add NEAR/PHRASE groups to the highlighting data. Must
|
||||
// push all combinations
|
||||
vector<vector<string> > allcombs;
|
||||
@ -508,7 +524,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
(m_parentSearch == 0 && !m_haveWildCards);
|
||||
|
||||
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
|
||||
if (!tr.processUserString(m_text, prefix, m_reason, pqueries))
|
||||
if (!tr.processUserString(m_text, prefix, m_reason, pqueries,
|
||||
db.getStopList()))
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||
@ -570,7 +587,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
||||
string s = string("\"") + m_text + string("\"");
|
||||
bool useNear = (m_tp == SCLT_NEAR);
|
||||
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
|
||||
if (!tr.processUserString(s, prefix, m_reason, pqueries, m_slack, useNear))
|
||||
if (!tr.processUserString(s, prefix, m_reason, pqueries, db.getStopList(),
|
||||
m_slack, useNear))
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
||||
|
||||
104
src/rcldb/stoplist.cpp
Normal file
104
src/rcldb/stoplist.cpp
Normal file
@ -0,0 +1,104 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: stoplist.cpp,v 1.1 2007-06-02 08:30:42 dockes Exp $ (C) 2007 J.F.Dockes";
|
||||
#endif
|
||||
#ifndef TEST_STOPLIST
|
||||
#include "debuglog.h"
|
||||
#include "readfile.h"
|
||||
#include "unacpp.h"
|
||||
#include "stoplist.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
namespace Rcl
|
||||
{
|
||||
#endif
|
||||
|
||||
bool StopList::setFile(const string &filename)
|
||||
{
|
||||
m_hasStops = false;
|
||||
m_stops.clear();
|
||||
string stoptext, reason;
|
||||
if (!file_to_string(filename, stoptext, &reason)) {
|
||||
LOGDEB(("StopList::StopList: file_to_string(%s) failed: %s\n",
|
||||
filename.c_str(), reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
TextSplit ts(this, TextSplit::TXTS_ONLYSPANS);
|
||||
ts.text_to_words(stoptext);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool StopList::takeword(const string& term, int, int, int)
|
||||
{
|
||||
string dterm;
|
||||
unacmaybefold(term, dterm, "UTF-8", true);
|
||||
LOGDEB2(("StopList::takeword: inserting [%s]\n", dterm.c_str()));
|
||||
m_hasStops = true;
|
||||
m_stops.insert(dterm);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool StopList::isStop(const string &term) const
|
||||
{
|
||||
return m_hasStops ? m_stops.find(term) != m_stops.end() : false;
|
||||
}
|
||||
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
}
|
||||
#endif
|
||||
|
||||
#else // TEST_STOPLIST
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
#include "stoplist.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace Rcl;
|
||||
|
||||
static char *thisprog;
|
||||
|
||||
static char usage [] =
|
||||
"trstoplist stopstermsfile\n\n"
|
||||
;
|
||||
static void
|
||||
Usage(void)
|
||||
{
|
||||
fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const string tstwords[] = {
|
||||
"the", "is", "xweird"
|
||||
};
|
||||
const int tstsz = sizeof(tstwords) / sizeof(string);
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int count = 10;
|
||||
|
||||
thisprog = argv[0];
|
||||
argc--; argv++;
|
||||
|
||||
if (argc != 1)
|
||||
Usage();
|
||||
string filename = argv[0]; argc--;
|
||||
|
||||
StopList sl(filename);
|
||||
|
||||
for (int i = 0; i < tstsz; i++) {
|
||||
const string &tst = tstwords[i];
|
||||
cout << "[" << tst << "] " <<
|
||||
(sl.isStop(tst) ? "in stop list" : "not in stop list") << endl;
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
|
||||
#endif // TEST_STOPLIST
|
||||
37
src/rcldb/stoplist.h
Normal file
37
src/rcldb/stoplist.h
Normal file
@ -0,0 +1,37 @@
|
||||
#ifndef _STOPLIST_H_INCLUDED_
|
||||
#define _STOPLIST_H_INCLUDED_
|
||||
/* @(#$Id: stoplist.h,v 1.1 2007-06-02 08:30:42 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
||||
#include "textsplit.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::set;
|
||||
using std::string;
|
||||
namespace Rcl
|
||||
{
|
||||
#endif
|
||||
|
||||
class StopList : public TextSplitCB {
|
||||
public:
|
||||
StopList() : m_hasStops(false) {}
|
||||
StopList(const string &filename) {setFile(filename);}
|
||||
virtual ~StopList() {}
|
||||
|
||||
bool setFile(const string &filename);
|
||||
bool isStop(const string &term) const;
|
||||
bool hasStops() const {return m_hasStops;}
|
||||
virtual bool takeword(const string& term, int pos, int bts, int bte);
|
||||
|
||||
private:
|
||||
bool m_hasStops;
|
||||
set<string> m_stops;
|
||||
};
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _STOPLIST_H_INCLUDED_ */
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: readfile.cpp,v 1.3 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: readfile.cpp,v 1.4 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -32,20 +32,28 @@ using std::string;
|
||||
|
||||
#include "readfile.h"
|
||||
|
||||
bool file_to_string(const string &fn, string &data)
|
||||
bool file_to_string(const string &fn, string &data, string *reason)
|
||||
{
|
||||
#define ERRBUFSZ 200
|
||||
char errbuf[ERRBUFSZ];
|
||||
bool ret = false;
|
||||
|
||||
int fd = open(fn.c_str(), O_RDONLY|O_STREAMING);
|
||||
if (fd < 0) {
|
||||
// perror("open");
|
||||
if (reason) {
|
||||
strerror_r(errno, errbuf, ERRBUFSZ);
|
||||
*reason += string("file_to_string: open failed: ") + errbuf;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
char buf[4096];
|
||||
for (;;) {
|
||||
int n = read(fd, buf, 4096);
|
||||
if (n < 0) {
|
||||
// perror("read");
|
||||
if (reason) {
|
||||
strerror_r(errno, errbuf, ERRBUFSZ);
|
||||
*reason += string("file_to_string: read failed: ") + errbuf;
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
if (n == 0)
|
||||
@ -54,7 +62,10 @@ bool file_to_string(const string &fn, string &data)
|
||||
try {
|
||||
data.append(buf, n);
|
||||
} catch (...) {
|
||||
// fprintf(stderr, "file_to_string: out of memory\n");
|
||||
if (reason) {
|
||||
strerror_r(errno, errbuf, ERRBUFSZ);
|
||||
*reason += string("file_to_string: out of memory? : ") +errbuf;
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _READFILE_H_INCLUDED_
|
||||
#define _READFILE_H_INCLUDED_
|
||||
/* @(#$Id: readfile.h,v 1.2 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: readfile.h,v 1.3 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
|
||||
@ -24,6 +24,7 @@
|
||||
* Read whole file into string.
|
||||
* @return true for ok, false else
|
||||
*/
|
||||
bool file_to_string(const std::string &filename, std::string &data);
|
||||
bool file_to_string(const std::string &filename, std::string &data,
|
||||
std::string *reason = 0);
|
||||
|
||||
#endif /* _READFILE_H_INCLUDED_ */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user