minimal experimental stopword functionality
This commit is contained in:
parent
fcb2762048
commit
0f1b917b7b
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.43 2007-02-07 17:17:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.44 2007-06-02 08:30:41 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -461,6 +461,11 @@ string RclConfig::getDbDir()
|
|||||||
return path_canon(dbdir);
|
return path_canon(dbdir);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return the path of the stop word list file: the fixed name "stoplist.txt"
// appended by path_cat() to the value of getConfDir().
string RclConfig::getStopfile()
{
    const char *const stopfilename = "stoplist.txt";
    return path_cat(getConfDir(), stopfilename);
}
|
||||||
|
|
||||||
list<string> RclConfig::getSkippedNames()
|
list<string> RclConfig::getSkippedNames()
|
||||||
{
|
{
|
||||||
list<string> skpl;
|
list<string> skpl;
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _RCLCONFIG_H_INCLUDED_
|
#ifndef _RCLCONFIG_H_INCLUDED_
|
||||||
#define _RCLCONFIG_H_INCLUDED_
|
#define _RCLCONFIG_H_INCLUDED_
|
||||||
/* @(#$Id: rclconfig.h,v 1.31 2007-02-02 10:12:58 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: rclconfig.h,v 1.32 2007-06-02 08:30:41 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <list>
|
#include <list>
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -78,8 +78,10 @@ class RclConfig {
|
|||||||
|
|
||||||
/** Get database directory */
|
/** Get database directory */
|
||||||
string getDbDir();
|
string getDbDir();
|
||||||
|
/** Get stoplist file name */
|
||||||
|
string getStopfile();
|
||||||
|
|
||||||
/** Get list of skipped names for current keydir */
|
/** Get list of skipped file names for current keydir */
|
||||||
list<string> getSkippedNames();
|
list<string> getSkippedNames();
|
||||||
|
|
||||||
/** Get list of skipped paths patterns. Doesn't depend on the keydir */
|
/** Get list of skipped paths patterns. Doesn't depend on the keydir */
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.56 2007-05-30 12:31:19 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.57 2007-06-02 08:30:41 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -181,7 +181,7 @@ bool DbIndexer::init(bool resetbefore, bool rdonly)
|
|||||||
}
|
}
|
||||||
Rcl::Db::OpenMode mode = rdonly ? Rcl::Db::DbRO :
|
Rcl::Db::OpenMode mode = rdonly ? Rcl::Db::DbRO :
|
||||||
resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
|
resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
|
||||||
if (!m_db.open(m_dbdir, mode)) {
|
if (!m_db.open(m_dbdir, m_config->getStopfile(), mode)) {
|
||||||
LOGERR(("DbIndexer: error opening database in %s\n", m_dbdir.c_str()));
|
LOGERR(("DbIndexer: error opening database in %s\n", m_dbdir.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -8,8 +8,8 @@ LIBS = librcl.a
|
|||||||
|
|
||||||
all: $(LIBS)
|
all: $(LIBS)
|
||||||
|
|
||||||
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
|
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o docseqdb.o docseqhist.o history.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o stoplist.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
|
||||||
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
|
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp history.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
|
||||||
|
|
||||||
librcl.a : $(DEPS) $(OBJS) unac.o
|
librcl.a : $(DEPS) $(OBJS) unac.o
|
||||||
ar ru librcl.a $(OBJS) unac.o
|
ar ru librcl.a $(OBJS) unac.o
|
||||||
@ -73,6 +73,8 @@ searchdata.o : ../rcldb/searchdata.cpp
|
|||||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/searchdata.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/searchdata.cpp
|
||||||
stemdb.o : ../rcldb/stemdb.cpp
|
stemdb.o : ../rcldb/stemdb.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/stemdb.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/stemdb.cpp
|
||||||
|
stoplist.o : ../rcldb/stoplist.cpp
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/stoplist.cpp
|
||||||
base64.o : ../utils/base64.cpp
|
base64.o : ../utils/base64.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../utils/base64.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../utils/base64.cpp
|
||||||
conftree.o : ../utils/conftree.cpp
|
conftree.o : ../utils/conftree.cpp
|
||||||
@ -193,6 +195,9 @@ searchdata.dep.stamp : ../rcldb/searchdata.cpp
|
|||||||
stemdb.dep.stamp : ../rcldb/stemdb.cpp
|
stemdb.dep.stamp : ../rcldb/stemdb.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/stemdb.cpp > stemdb.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/stemdb.cpp > stemdb.dep
|
||||||
touch stemdb.dep.stamp
|
touch stemdb.dep.stamp
|
||||||
|
stoplist.dep.stamp : ../rcldb/stoplist.cpp
|
||||||
|
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/stoplist.cpp > stoplist.dep
|
||||||
|
touch stoplist.dep.stamp
|
||||||
base64.dep.stamp : ../utils/base64.cpp
|
base64.dep.stamp : ../utils/base64.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../utils/base64.cpp > base64.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../utils/base64.cpp > base64.dep
|
||||||
touch base64.dep.stamp
|
touch base64.dep.stamp
|
||||||
@ -266,6 +271,7 @@ include pathhash.dep
|
|||||||
include rcldb.dep
|
include rcldb.dep
|
||||||
include searchdata.dep
|
include searchdata.dep
|
||||||
include stemdb.dep
|
include stemdb.dep
|
||||||
|
include stoplist.dep
|
||||||
include base64.dep
|
include base64.dep
|
||||||
include conftree.dep
|
include conftree.dep
|
||||||
include copyfile.dep
|
include copyfile.dep
|
||||||
|
|||||||
@ -32,6 +32,7 @@ ${depth}/rcldb/pathhash.cpp \
|
|||||||
${depth}/rcldb/rcldb.cpp \
|
${depth}/rcldb/rcldb.cpp \
|
||||||
${depth}/rcldb/searchdata.cpp \
|
${depth}/rcldb/searchdata.cpp \
|
||||||
${depth}/rcldb/stemdb.cpp \
|
${depth}/rcldb/stemdb.cpp \
|
||||||
|
${depth}/rcldb/stoplist.cpp \
|
||||||
${depth}/utils/base64.cpp \
|
${depth}/utils/base64.cpp \
|
||||||
${depth}/utils/conftree.cpp \
|
${depth}/utils/conftree.cpp \
|
||||||
${depth}/utils/copyfile.cpp \
|
${depth}/utils/copyfile.cpp \
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: main.cpp,v 1.59 2007-05-21 13:30:21 dockes Exp $ (C) 2005 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: main.cpp,v 1.60 2007-06-02 08:30:41 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -102,7 +102,8 @@ bool maybeOpenDb(string &reason, bool force)
|
|||||||
LOGDEB(("main: adding [%s]\n", it->c_str()));
|
LOGDEB(("main: adding [%s]\n", it->c_str()));
|
||||||
rcldb->addQueryDb(*it);
|
rcldb->addQueryDb(*it);
|
||||||
}
|
}
|
||||||
if (!rcldb->isopen() && !rcldb->open(dbdir, Rcl::Db::DbRO, qopts)) {
|
if (!rcldb->isopen() && !rcldb->open(dbdir, rclconfig->getStopfile(),
|
||||||
|
Rcl::Db::DbRO, qopts)) {
|
||||||
reason = "Could not open database in " +
|
reason = "Could not open database in " +
|
||||||
dbdir + " wait for indexing to complete?";
|
dbdir + " wait for indexing to complete?";
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
24
src/rcldb/Makefile
Normal file
24
src/rcldb/Makefile
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
# @(#$Id: Makefile,v 1.1 2007-06-02 08:30:42 dockes Exp $ (C) 2005 J.F.Dockes
depth = ..
include $(depth)/mk/sysconf

# Only test executables get built in here
PROGS = stoplist

all: $(BIGLIB) $(PROGS)

# None of these names a real file: never let a stray file of the same name
# shadow them.
.PHONY: all clean force

# The "force" prerequisite makes this rule run on every invocation, so the
# decision to rebuild is always delegated to the makefile in $(depth)/lib.
# Use && so that $(MAKE) is not run in this directory (re-entering this
# very makefile) if the cd fails.
$(BIGLIB): force
	cd $(depth)/lib && $(MAKE)
force:

# Stop list test driver: stoplist.cpp compiled with TEST_STOPLIST defined,
# linked with $(BIGLIB).
STOPLIST_OBJS= trstoplist.o $(BIGLIB)
stoplist : $(STOPLIST_OBJS)
	$(CXX) $(ALL_CXXFLAGS) -o stoplist $(STOPLIST_OBJS) \
	    $(LIBICONV) $(LIBSYS)
trstoplist.o : stoplist.cpp
	$(CXX) $(ALL_CXXFLAGS) -DTEST_STOPLIST -c -o trstoplist.o \
	    stoplist.cpp

clean:
	rm -f *.o $(PROGS)
|
||||||
|
|
||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.110 2007-05-30 12:30:38 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.111 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -528,7 +528,7 @@ Db::~Db()
|
|||||||
LOGERR(("Db::~Db: got exception: %s\n", ermsg));
|
LOGERR(("Db::~Db: got exception: %s\n", ermsg));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Db::open(const string& dir, OpenMode mode, int qops)
|
bool Db::open(const string& dir, const string &stops, OpenMode mode, int qops)
|
||||||
{
|
{
|
||||||
bool keep_updated = (qops & QO_KEEP_UPDATED) != 0;
|
bool keep_updated = (qops & QO_KEEP_UPDATED) != 0;
|
||||||
qops &= ~QO_KEEP_UPDATED;
|
qops &= ~QO_KEEP_UPDATED;
|
||||||
@ -543,6 +543,9 @@ bool Db::open(const string& dir, OpenMode mode, int qops)
|
|||||||
if (!close())
|
if (!close())
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (!stops.empty())
|
||||||
|
m_stops.setFile(stops);
|
||||||
|
|
||||||
const char *ermsg = "Unknown";
|
const char *ermsg = "Unknown";
|
||||||
try {
|
try {
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
@ -652,7 +655,7 @@ bool Db::reOpen()
|
|||||||
if (m_ndb && m_ndb->m_isopen) {
|
if (m_ndb && m_ndb->m_isopen) {
|
||||||
if (!close())
|
if (!close())
|
||||||
return false;
|
return false;
|
||||||
if (!open(m_basedir, m_mode, m_qOpts | QO_KEEP_UPDATED)) {
|
if (!open(m_basedir, "", m_mode, m_qOpts | QO_KEEP_UPDATED)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -737,8 +740,9 @@ class mySplitterCB : public TextSplitCB {
|
|||||||
Xapian::termpos basepos; // Base for document section
|
Xapian::termpos basepos; // Base for document section
|
||||||
Xapian::termpos curpos; // Current position. Used to set basepos for the
|
Xapian::termpos curpos; // Current position. Used to set basepos for the
|
||||||
// following section
|
// following section
|
||||||
mySplitterCB(Xapian::Document &d)
|
StopList &stops;
|
||||||
: doc(d), basepos(1), curpos(0)
|
mySplitterCB(Xapian::Document &d, StopList &_stops)
|
||||||
|
: doc(d), basepos(1), curpos(0), stops(_stops)
|
||||||
{}
|
{}
|
||||||
bool takeword(const std::string &term, int pos, int, int);
|
bool takeword(const std::string &term, int pos, int, int);
|
||||||
void setprefix(const string& pref) {prefix = pref;}
|
void setprefix(const string& pref) {prefix = pref;}
|
||||||
@ -762,6 +766,10 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
|
|||||||
|
|
||||||
const char *ermsg;
|
const char *ermsg;
|
||||||
try {
|
try {
|
||||||
|
if (stops.hasStops() && stops.isStop(term)) {
|
||||||
|
LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
// Note: 1 is the within document frequency increment. It would
|
// Note: 1 is the within document frequency increment. It would
|
||||||
// be possible to assign different weights to doc parts (ie title)
|
// be possible to assign different weights to doc parts (ie title)
|
||||||
// by using a higher value
|
// by using a higher value
|
||||||
@ -849,7 +857,7 @@ bool Db::add(const string &fn, const Doc &idoc,
|
|||||||
|
|
||||||
Xapian::Document newdocument;
|
Xapian::Document newdocument;
|
||||||
|
|
||||||
mySplitterCB splitData(newdocument);
|
mySplitterCB splitData(newdocument, m_stops);
|
||||||
|
|
||||||
TextSplit splitter(&splitData);
|
TextSplit splitter(&splitData);
|
||||||
|
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _DB_H_INCLUDED_
|
#ifndef _DB_H_INCLUDED_
|
||||||
#define _DB_H_INCLUDED_
|
#define _DB_H_INCLUDED_
|
||||||
/* @(#$Id: rcldb.h,v 1.47 2007-05-22 07:40:00 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: rcldb.h,v 1.48 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
@ -24,6 +24,7 @@
|
|||||||
|
|
||||||
#include "refcntr.h"
|
#include "refcntr.h"
|
||||||
#include "rcldoc.h"
|
#include "rcldoc.h"
|
||||||
|
#include "stoplist.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using std::string;
|
using std::string;
|
||||||
@ -77,7 +78,8 @@ class Db {
|
|||||||
// KEEP_UPDATED is internal use by reOpen() only
|
// KEEP_UPDATED is internal use by reOpen() only
|
||||||
enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_KEEP_UPDATED = 8};
|
enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_KEEP_UPDATED = 8};
|
||||||
|
|
||||||
bool open(const string &dbdir, OpenMode mode, int qops = QO_NONE);
|
bool open(const string &dbdir, const string &stoplistfn,
|
||||||
|
OpenMode mode, int qops = QO_NONE);
|
||||||
bool close();
|
bool close();
|
||||||
bool isopen();
|
bool isopen();
|
||||||
|
|
||||||
@ -172,11 +174,13 @@ class Db {
|
|||||||
|
|
||||||
/** Filename wildcard expansion */
|
/** Filename wildcard expansion */
|
||||||
bool filenameWildExp(const string& exp, list<string>& names);
|
bool filenameWildExp(const string& exp, list<string>& names);
|
||||||
string getReason(){return m_reason;}
|
string getReason() const {return m_reason;}
|
||||||
|
|
||||||
/** Adjust flush threshold */
|
/** Adjust flush threshold */
|
||||||
void setFlushMb(int mb) {m_flushmb = mb;}
|
void setFlushMb(int mb) {m_flushmb = mb;}
|
||||||
|
|
||||||
|
const StopList& getStopList() const {return m_stops;}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
string m_filterTopDir; // Current query filter on subtree top directory
|
string m_filterTopDir; // Current query filter on subtree top directory
|
||||||
@ -217,6 +221,8 @@ private:
|
|||||||
|
|
||||||
vector<bool> updated;
|
vector<bool> updated;
|
||||||
|
|
||||||
|
StopList m_stops;
|
||||||
|
|
||||||
bool reOpen(); // Close/open, same mode/opts
|
bool reOpen(); // Close/open, same mode/opts
|
||||||
bool stemExpand(const string &lang, const string &s,
|
bool stemExpand(const string &lang, const string &s,
|
||||||
list<TermMatchEntry>& result, int max = -1);
|
list<TermMatchEntry>& result, int max = -1);
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.13 2007-02-13 10:58:31 dockes Exp $ (C) 2006 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.14 2007-06-02 08:30:42 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -32,6 +32,7 @@ static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.13 2007-02-13 10:58:31 dockes
|
|||||||
#include "textsplit.h"
|
#include "textsplit.h"
|
||||||
#include "unacpp.h"
|
#include "unacpp.h"
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
|
#include "stoplist.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using namespace std;
|
using namespace std;
|
||||||
@ -136,12 +137,23 @@ bool SearchData::getTerms(vector<string>& terms,
|
|||||||
// terms and phrases.
|
// terms and phrases.
|
||||||
class wsQData : public TextSplitCB {
|
class wsQData : public TextSplitCB {
|
||||||
public:
|
public:
|
||||||
|
wsQData(const StopList &_stops)
|
||||||
|
: stops(_stops), alltermcount(0)
|
||||||
|
{}
|
||||||
vector<string> terms;
|
vector<string> terms;
|
||||||
bool takeword(const std::string &term, int , int, int) {
|
bool takeword(const std::string &term, int , int, int) {
|
||||||
|
alltermcount++;
|
||||||
LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
|
LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
|
||||||
|
if (stops.hasStops() && stops.isStop(term)) {
|
||||||
|
LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
terms.push_back(term);
|
terms.push_back(term);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
const StopList &stops;
|
||||||
|
int alltermcount; // Count of terms including stopwords: this is
|
||||||
|
// for adjusting phrase/near slack
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -158,10 +170,11 @@ public:
|
|||||||
{ }
|
{ }
|
||||||
|
|
||||||
bool processUserString(const string &iq,
|
bool processUserString(const string &iq,
|
||||||
const string &prefix,
|
const string &prefix,
|
||||||
string &ermsg,
|
string &ermsg,
|
||||||
list<Xapian::Query> &pqueries,
|
list<Xapian::Query> &pqueries,
|
||||||
int slack = 0, bool useNear = false);
|
const StopList &stops,
|
||||||
|
int slack = 0, bool useNear = false);
|
||||||
|
|
||||||
bool getTerms(vector<string>& terms,
|
bool getTerms(vector<string>& terms,
|
||||||
vector<vector<string> >& groups)
|
vector<vector<string> >& groups)
|
||||||
@ -313,7 +326,9 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
const string &prefix,
|
const string &prefix,
|
||||||
string &ermsg,
|
string &ermsg,
|
||||||
list<Xapian::Query> &pqueries,
|
list<Xapian::Query> &pqueries,
|
||||||
int slack, bool useNear)
|
const StopList& stops,
|
||||||
|
int slack, bool useNear
|
||||||
|
)
|
||||||
{
|
{
|
||||||
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
||||||
ermsg.erase();
|
ermsg.erase();
|
||||||
@ -339,7 +354,7 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
// we need to use a word split, else a phrase query including
|
// we need to use a word split, else a phrase query including
|
||||||
// a span would fail if we didn't adjust the proximity to
|
// a span would fail if we didn't adjust the proximity to
|
||||||
// account for the additional span term which is complicated.
|
// account for the additional span term which is complicated.
|
||||||
wsQData splitDataS, splitDataW;
|
wsQData splitDataS(stops), splitDataW(stops);
|
||||||
TextSplit splitterS(&splitDataS, (TextSplit::Flags)
|
TextSplit splitterS(&splitDataS, (TextSplit::Flags)
|
||||||
(TextSplit::TXTS_ONLYSPANS |
|
(TextSplit::TXTS_ONLYSPANS |
|
||||||
TextSplit::TXTS_KEEPWILD));
|
TextSplit::TXTS_KEEPWILD));
|
||||||
@ -418,7 +433,8 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
pqueries.push_back(Xapian::Query(op,
|
pqueries.push_back(Xapian::Query(op,
|
||||||
orqueries.begin(),
|
orqueries.begin(),
|
||||||
orqueries.end(),
|
orqueries.end(),
|
||||||
splitData->terms.size() + slack));
|
splitData->alltermcount
|
||||||
|
+ slack));
|
||||||
// Add NEAR/PHRASE groups to the highlighting data. Must
|
// Add NEAR/PHRASE groups to the highlighting data. Must
|
||||||
// push all combinations
|
// push all combinations
|
||||||
vector<vector<string> > allcombs;
|
vector<vector<string> > allcombs;
|
||||||
@ -508,7 +524,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
(m_parentSearch == 0 && !m_haveWildCards);
|
(m_parentSearch == 0 && !m_haveWildCards);
|
||||||
|
|
||||||
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
|
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
|
||||||
if (!tr.processUserString(m_text, prefix, m_reason, pqueries))
|
if (!tr.processUserString(m_text, prefix, m_reason, pqueries,
|
||||||
|
db.getStopList()))
|
||||||
return false;
|
return false;
|
||||||
if (pqueries.empty()) {
|
if (pqueries.empty()) {
|
||||||
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||||
@ -570,7 +587,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
string s = string("\"") + m_text + string("\"");
|
string s = string("\"") + m_text + string("\"");
|
||||||
bool useNear = (m_tp == SCLT_NEAR);
|
bool useNear = (m_tp == SCLT_NEAR);
|
||||||
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
|
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
|
||||||
if (!tr.processUserString(s, prefix, m_reason, pqueries, m_slack, useNear))
|
if (!tr.processUserString(s, prefix, m_reason, pqueries, db.getStopList(),
|
||||||
|
m_slack, useNear))
|
||||||
return false;
|
return false;
|
||||||
if (pqueries.empty()) {
|
if (pqueries.empty()) {
|
||||||
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
||||||
|
|||||||
104
src/rcldb/stoplist.cpp
Normal file
104
src/rcldb/stoplist.cpp
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
#ifndef lint
|
||||||
|
static char rcsid[] = "@(#$Id: stoplist.cpp,v 1.1 2007-06-02 08:30:42 dockes Exp $ (C) 2007 J.F.Dockes";
|
||||||
|
#endif
|
||||||
|
#ifndef TEST_STOPLIST
|
||||||
|
#include "debuglog.h"
|
||||||
|
#include "readfile.h"
|
||||||
|
#include "unacpp.h"
|
||||||
|
#include "stoplist.h"
|
||||||
|
|
||||||
|
#ifndef NO_NAMESPACES
|
||||||
|
namespace Rcl
|
||||||
|
{
|
||||||
|
#endif
|
||||||
|
|
||||||
|
bool StopList::setFile(const string &filename)
|
||||||
|
{
|
||||||
|
m_hasStops = false;
|
||||||
|
m_stops.clear();
|
||||||
|
string stoptext, reason;
|
||||||
|
if (!file_to_string(filename, stoptext, &reason)) {
|
||||||
|
LOGDEB(("StopList::StopList: file_to_string(%s) failed: %s\n",
|
||||||
|
filename.c_str(), reason.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
TextSplit ts(this, TextSplit::TXTS_ONLYSPANS);
|
||||||
|
ts.text_to_words(stoptext);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool StopList::takeword(const string& term, int, int, int)
|
||||||
|
{
|
||||||
|
string dterm;
|
||||||
|
unacmaybefold(term, dterm, "UTF-8", true);
|
||||||
|
LOGDEB2(("StopList::takeword: inserting [%s]\n", dterm.c_str()));
|
||||||
|
m_hasStops = true;
|
||||||
|
m_stops.insert(dterm);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool StopList::isStop(const string &term) const
|
||||||
|
{
|
||||||
|
return m_hasStops ? m_stops.find(term) != m_stops.end() : false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef NO_NAMESPACES
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else // TEST_STOPLIST
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "stoplist.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Rcl;
|
||||||
|
|
||||||
|
static char *thisprog;
|
||||||
|
|
||||||
|
static char usage [] =
|
||||||
|
"trstoplist stopstermsfile\n\n"
|
||||||
|
;
|
||||||
|
static void
|
||||||
|
Usage(void)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Words which the test driver looks up in the stop list
const string tstwords[] = {
    "the",
    "is",
    "xweird"
};
// Number of entries in tstwords
const int tstsz = sizeof(tstwords) / sizeof(tstwords[0]);
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
int count = 10;
|
||||||
|
|
||||||
|
thisprog = argv[0];
|
||||||
|
argc--; argv++;
|
||||||
|
|
||||||
|
if (argc != 1)
|
||||||
|
Usage();
|
||||||
|
string filename = argv[0]; argc--;
|
||||||
|
|
||||||
|
StopList sl(filename);
|
||||||
|
|
||||||
|
for (int i = 0; i < tstsz; i++) {
|
||||||
|
const string &tst = tstwords[i];
|
||||||
|
cout << "[" << tst << "] " <<
|
||||||
|
(sl.isStop(tst) ? "in stop list" : "not in stop list") << endl;
|
||||||
|
}
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // TEST_STOPLIST
|
||||||
37
src/rcldb/stoplist.h
Normal file
37
src/rcldb/stoplist.h
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
#ifndef _STOPLIST_H_INCLUDED_
|
||||||
|
#define _STOPLIST_H_INCLUDED_
|
||||||
|
/* @(#$Id: stoplist.h,v 1.1 2007-06-02 08:30:42 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||||
|
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "textsplit.h"
|
||||||
|
|
||||||
|
#ifndef NO_NAMESPACES
|
||||||
|
using std::set;
|
||||||
|
using std::string;
|
||||||
|
namespace Rcl
|
||||||
|
{
|
||||||
|
#endif
|
||||||
|
|
||||||
|
class StopList : public TextSplitCB {
|
||||||
|
public:
|
||||||
|
StopList() : m_hasStops(false) {}
|
||||||
|
StopList(const string &filename) {setFile(filename);}
|
||||||
|
virtual ~StopList() {}
|
||||||
|
|
||||||
|
bool setFile(const string &filename);
|
||||||
|
bool isStop(const string &term) const;
|
||||||
|
bool hasStops() const {return m_hasStops;}
|
||||||
|
virtual bool takeword(const string& term, int pos, int bts, int bte);
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool m_hasStops;
|
||||||
|
set<string> m_stops;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifndef NO_NAMESPACES
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* _STOPLIST_H_INCLUDED_ */
|
||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: readfile.cpp,v 1.3 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: readfile.cpp,v 1.4 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -32,20 +32,28 @@ using std::string;
|
|||||||
|
|
||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
|
|
||||||
bool file_to_string(const string &fn, string &data)
|
bool file_to_string(const string &fn, string &data, string *reason)
|
||||||
{
|
{
|
||||||
|
#define ERRBUFSZ 200
|
||||||
|
char errbuf[ERRBUFSZ];
|
||||||
bool ret = false;
|
bool ret = false;
|
||||||
|
|
||||||
int fd = open(fn.c_str(), O_RDONLY|O_STREAMING);
|
int fd = open(fn.c_str(), O_RDONLY|O_STREAMING);
|
||||||
if (fd < 0) {
|
if (fd < 0) {
|
||||||
// perror("open");
|
if (reason) {
|
||||||
|
strerror_r(errno, errbuf, ERRBUFSZ);
|
||||||
|
*reason += string("file_to_string: open failed: ") + errbuf;
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
char buf[4096];
|
char buf[4096];
|
||||||
for (;;) {
|
for (;;) {
|
||||||
int n = read(fd, buf, 4096);
|
int n = read(fd, buf, 4096);
|
||||||
if (n < 0) {
|
if (n < 0) {
|
||||||
// perror("read");
|
if (reason) {
|
||||||
|
strerror_r(errno, errbuf, ERRBUFSZ);
|
||||||
|
*reason += string("file_to_string: read failed: ") + errbuf;
|
||||||
|
}
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
if (n == 0)
|
if (n == 0)
|
||||||
@ -54,7 +62,10 @@ bool file_to_string(const string &fn, string &data)
|
|||||||
try {
|
try {
|
||||||
data.append(buf, n);
|
data.append(buf, n);
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
// fprintf(stderr, "file_to_string: out of memory\n");
|
if (reason) {
|
||||||
|
strerror_r(errno, errbuf, ERRBUFSZ);
|
||||||
|
*reason += string("file_to_string: out of memory? : ") +errbuf;
|
||||||
|
}
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _READFILE_H_INCLUDED_
|
#ifndef _READFILE_H_INCLUDED_
|
||||||
#define _READFILE_H_INCLUDED_
|
#define _READFILE_H_INCLUDED_
|
||||||
/* @(#$Id: readfile.h,v 1.2 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: readfile.h,v 1.3 2007-06-02 08:30:42 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@ -24,6 +24,7 @@
|
|||||||
* Read whole file into string.
|
* Read whole file into string.
|
||||||
* @return true for ok, false else
|
* @return true for ok, false else
|
||||||
*/
|
*/
|
||||||
bool file_to_string(const std::string &filename, std::string &data);
|
bool file_to_string(const std::string &filename, std::string &data,
|
||||||
|
std::string *reason = 0);
|
||||||
|
|
||||||
#endif /* _READFILE_H_INCLUDED_ */
|
#endif /* _READFILE_H_INCLUDED_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user