Added field/prefixes for author and title, plus a command-line query language tool (rclqlang)
This commit is contained in:
parent
ee85be5c61
commit
1d683ad411
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.24 2007-01-15 13:06:38 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.25 2007-01-17 13:53:40 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -236,6 +236,7 @@ static inline bool getKeyValue(const map<string, string>& docdata,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static const string keyab("abstract");
|
static const string keyab("abstract");
|
||||||
|
static const string keyau("author");
|
||||||
static const string keycs("charset");
|
static const string keycs("charset");
|
||||||
static const string keyct("content");
|
static const string keyct("content");
|
||||||
static const string keyfn("filename");
|
static const string keyfn("filename");
|
||||||
@ -251,6 +252,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
|||||||
Dijon::Filter *df = m_handlers.back();
|
Dijon::Filter *df = m_handlers.back();
|
||||||
const std::map<std::string, std::string>& docdata = df->get_meta_data();
|
const std::map<std::string, std::string>& docdata = df->get_meta_data();
|
||||||
|
|
||||||
|
getKeyValue(docdata, keyau, doc.author);
|
||||||
getKeyValue(docdata, keyoc, doc.origcharset);
|
getKeyValue(docdata, keyoc, doc.origcharset);
|
||||||
getKeyValue(docdata, keyct, doc.text);
|
getKeyValue(docdata, keyct, doc.text);
|
||||||
getKeyValue(docdata, keytt, doc.title);
|
getKeyValue(docdata, keytt, doc.title);
|
||||||
|
|||||||
@ -122,6 +122,7 @@ bool MimeHandlerHtml::next_document()
|
|||||||
m_metaData["charset"] = "utf-8";
|
m_metaData["charset"] = "utf-8";
|
||||||
m_metaData["title"] = result.title;
|
m_metaData["title"] = result.title;
|
||||||
m_metaData["keywords"] = result.keywords;
|
m_metaData["keywords"] = result.keywords;
|
||||||
|
m_metaData["author"] = result.author;
|
||||||
m_metaData["modificationdate"] = result.dmtime;
|
m_metaData["modificationdate"] = result.dmtime;
|
||||||
m_metaData["sample"] = result.sample;
|
m_metaData["sample"] = result.sample;
|
||||||
m_metaData["mimetype"] = "text/plain";
|
m_metaData["mimetype"] = "text/plain";
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.28 2007-01-13 10:28:37 dockes Exp $ (C) 2005 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.29 2007-01-17 13:53:40 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -217,6 +217,9 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
|
|||||||
if (doc->h.getFirstHeader("From", hi)) {
|
if (doc->h.getFirstHeader("From", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), transcoded);
|
||||||
text += string("From: ") + transcoded + string("\n");
|
text += string("From: ") + transcoded + string("\n");
|
||||||
|
if (depth == 1) {
|
||||||
|
m_metaData["author"] = transcoded;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (doc->h.getFirstHeader("To", hi)) {
|
if (doc->h.getFirstHeader("To", hi)) {
|
||||||
rfc2047_decode(hi.getValue(), transcoded);
|
rfc2047_decode(hi.getValue(), transcoded);
|
||||||
@ -245,7 +248,7 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
|
|||||||
}
|
}
|
||||||
text += '\n';
|
text += '\n';
|
||||||
|
|
||||||
LOGDEB2(("MimeHandlerMail::rocessMsg:ismultipart %d mime subtype '%s'\n",
|
LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
|
||||||
doc->isMultipart(), doc->getSubType().c_str()));
|
doc->isMultipart(), doc->getSubType().c_str()));
|
||||||
walkmime(doc, depth);
|
walkmime(doc, depth);
|
||||||
|
|
||||||
|
|||||||
@ -154,6 +154,11 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
string tmp = i->second;
|
string tmp = i->second;
|
||||||
decode_entities(tmp);
|
decode_entities(tmp);
|
||||||
keywords += tmp;
|
keywords += tmp;
|
||||||
|
} else if (name == "author") {
|
||||||
|
if (!author.empty()) author += ' ';
|
||||||
|
string tmp = i->second;
|
||||||
|
decode_entities(tmp);
|
||||||
|
author += tmp;
|
||||||
} else if (name == "date") {
|
} else if (name == "date") {
|
||||||
// Yes, this doesn't exist. It's output by filters
|
// Yes, this doesn't exist. It's output by filters
|
||||||
// And the format isn't even standard http/html
|
// And the format isn't even standard http/html
|
||||||
@ -168,19 +173,6 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
|||||||
dmtime = ascuxtime;
|
dmtime = ascuxtime;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if 0 // We're not a robot, so we don't care about robots metainfo
|
|
||||||
else if (name == "robots") {
|
|
||||||
string val = i->second;
|
|
||||||
decode_entities(val);
|
|
||||||
lowercase_term(val);
|
|
||||||
if (val.find("none") != string::npos ||
|
|
||||||
val.find("noindex") != string::npos) {
|
|
||||||
indexing_allowed = false;
|
|
||||||
LOGDEB1(("myhtmlparse: robots/noindex\n"));
|
|
||||||
throw false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif // 0
|
|
||||||
} else if ((j = p.find("http-equiv")) != p.end()) {
|
} else if ((j = p.find("http-equiv")) != p.end()) {
|
||||||
string hequiv = j->second;
|
string hequiv = j->second;
|
||||||
lowercase_term(hequiv);
|
lowercase_term(hequiv);
|
||||||
|
|||||||
@ -37,7 +37,7 @@ class MyHtmlParser : public HtmlParser {
|
|||||||
bool in_body_tag;
|
bool in_body_tag;
|
||||||
bool in_pre_tag;
|
bool in_pre_tag;
|
||||||
bool pending_space;
|
bool pending_space;
|
||||||
string title, sample, keywords, dump, dmtime;
|
string title, sample, keywords, dump, dmtime, author;
|
||||||
string ocharset; // This is the charset our user thinks the doc was
|
string ocharset; // This is the charset our user thinks the doc was
|
||||||
string charset; // This is the charset it was supposedly converted to
|
string charset; // This is the charset it was supposedly converted to
|
||||||
string doccharset; // Set this to value of charset parameter in header
|
string doccharset; // Set this to value of charset parameter in header
|
||||||
|
|||||||
@ -8,8 +8,8 @@ LIBS = librcl.a
|
|||||||
|
|
||||||
all: $(LIBS)
|
all: $(LIBS)
|
||||||
|
|
||||||
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
|
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
|
||||||
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
|
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
|
||||||
|
|
||||||
librcl.a : $(DEPS) $(OBJS) unac.o
|
librcl.a : $(DEPS) $(OBJS) unac.o
|
||||||
ar ru librcl.a $(OBJS) unac.o
|
ar ru librcl.a $(OBJS) unac.o
|
||||||
@ -57,6 +57,10 @@ history.o : ../query/history.cpp
|
|||||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/history.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../query/history.cpp
|
||||||
sortseq.o : ../query/sortseq.cpp
|
sortseq.o : ../query/sortseq.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/sortseq.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../query/sortseq.cpp
|
||||||
|
wasastringtoquery.o : ../query/wasastringtoquery.cpp
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasastringtoquery.cpp
|
||||||
|
wasatorcl.o : ../query/wasatorcl.cpp
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasatorcl.cpp
|
||||||
pathhash.o : ../rcldb/pathhash.cpp
|
pathhash.o : ../rcldb/pathhash.cpp
|
||||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/pathhash.cpp
|
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/pathhash.cpp
|
||||||
rcldb.o : ../rcldb/rcldb.cpp
|
rcldb.o : ../rcldb/rcldb.cpp
|
||||||
@ -161,6 +165,12 @@ history.dep.stamp : ../query/history.cpp
|
|||||||
sortseq.dep.stamp : ../query/sortseq.cpp
|
sortseq.dep.stamp : ../query/sortseq.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../query/sortseq.cpp > sortseq.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../query/sortseq.cpp > sortseq.dep
|
||||||
touch sortseq.dep.stamp
|
touch sortseq.dep.stamp
|
||||||
|
wasastringtoquery.dep.stamp : ../query/wasastringtoquery.cpp
|
||||||
|
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasastringtoquery.cpp > wasastringtoquery.dep
|
||||||
|
touch wasastringtoquery.dep.stamp
|
||||||
|
wasatorcl.dep.stamp : ../query/wasatorcl.cpp
|
||||||
|
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasatorcl.cpp > wasatorcl.dep
|
||||||
|
touch wasatorcl.dep.stamp
|
||||||
pathhash.dep.stamp : ../rcldb/pathhash.cpp
|
pathhash.dep.stamp : ../rcldb/pathhash.cpp
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/pathhash.cpp > pathhash.dep
|
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/pathhash.cpp > pathhash.dep
|
||||||
touch pathhash.dep.stamp
|
touch pathhash.dep.stamp
|
||||||
@ -238,6 +248,8 @@ include mh_text.dep
|
|||||||
include docseq.dep
|
include docseq.dep
|
||||||
include history.dep
|
include history.dep
|
||||||
include sortseq.dep
|
include sortseq.dep
|
||||||
|
include wasastringtoquery.dep
|
||||||
|
include wasatorcl.dep
|
||||||
include pathhash.dep
|
include pathhash.dep
|
||||||
include rcldb.dep
|
include rcldb.dep
|
||||||
include searchdata.dep
|
include searchdata.dep
|
||||||
|
|||||||
@ -24,6 +24,8 @@ ${depth}/internfile/mh_text.cpp \
|
|||||||
${depth}/query/docseq.cpp \
|
${depth}/query/docseq.cpp \
|
||||||
${depth}/query/history.cpp \
|
${depth}/query/history.cpp \
|
||||||
${depth}/query/sortseq.cpp \
|
${depth}/query/sortseq.cpp \
|
||||||
|
${depth}/query/wasastringtoquery.cpp \
|
||||||
|
${depth}/query/wasatorcl.cpp \
|
||||||
${depth}/rcldb/pathhash.cpp \
|
${depth}/rcldb/pathhash.cpp \
|
||||||
${depth}/rcldb/rcldb.cpp \
|
${depth}/rcldb/rcldb.cpp \
|
||||||
${depth}/rcldb/searchdata.cpp \
|
${depth}/rcldb/searchdata.cpp \
|
||||||
|
|||||||
@ -1,11 +1,11 @@
|
|||||||
depth = ..
|
depth = ..
|
||||||
include $(depth)/mk/sysconf
|
include $(depth)/mk/sysconf
|
||||||
|
|
||||||
PROGS = xadump #trhist qtry qxtry
|
PROGS = xadump rclqlang #trhist qtry qxtry
|
||||||
|
|
||||||
all: $(PROGS)
|
all: $(PROGS)
|
||||||
|
|
||||||
SRCS = xadump.cpp
|
SRCS = xadump.cpp rclqlang.cpp
|
||||||
.cpp.o :
|
.cpp.o :
|
||||||
$(CXX) -c $(ALL_CXXFLAGS) -o $@ $<
|
$(CXX) -c $(ALL_CXXFLAGS) -o $@ $<
|
||||||
|
|
||||||
@ -14,6 +14,11 @@ xadump : $(XADUMP_OBJS)
|
|||||||
$(CXX) $(ALL_CXXFLAGS) -o xadump $(XADUMP_OBJS) \
|
$(CXX) $(ALL_CXXFLAGS) -o xadump $(XADUMP_OBJS) \
|
||||||
$(LIBICONV) $(LIBXAPIAN) $(LIBSYS)
|
$(LIBICONV) $(LIBXAPIAN) $(LIBSYS)
|
||||||
|
|
||||||
|
RCLQLANG_OBJS= rclqlang.o $(BIGLIB)
|
||||||
|
rclqlang : $(RCLQLANG_OBJS)
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -o rclqlang $(RCLQLANG_OBJS) \
|
||||||
|
$(LIBICONV) $(LIBXAPIAN) $(LIBSYS)
|
||||||
|
|
||||||
HISTORY_OBJS= trhist.o $(BIGLIB) $(MIMELIB)
|
HISTORY_OBJS= trhist.o $(BIGLIB) $(MIMELIB)
|
||||||
trhist : $(HISTORY_OBJS)
|
trhist : $(HISTORY_OBJS)
|
||||||
$(CXX) $(ALL_CXXFLAGS) -o trhist $(HISTORY_OBJS) \
|
$(CXX) $(ALL_CXXFLAGS) -o trhist $(HISTORY_OBJS) \
|
||||||
@ -21,9 +26,19 @@ trhist : $(HISTORY_OBJS)
|
|||||||
trhist.o : history.cpp history.h
|
trhist.o : history.cpp history.h
|
||||||
$(CXX) $(ALL_CXXFLAGS) -DTEST_HISTORY -c -o trhist.o history.cpp
|
$(CXX) $(ALL_CXXFLAGS) -DTEST_HISTORY -c -o trhist.o history.cpp
|
||||||
|
|
||||||
$(BIGLIB):
|
WASASTRINGTOQUERY_OBJS= trwasastrtoq.o $(BIGLIB) $(MIMELIB)
|
||||||
cd $(depth)/lib;make
|
trwasastrtoq : $(WASASTRINGTOQUERY_OBJS)
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -o trwasastrtoq $(WASASTRINGTOQUERY_OBJS) \
|
||||||
|
$(LIBICONV) $(LIBXAPIAN)
|
||||||
|
trwasastrtoq.o : wasastringtoquery.cpp wasastringtoquery.h
|
||||||
|
$(CXX) $(ALL_CXXFLAGS) -DTEST_WASASTRINGTOQUERY -c \
|
||||||
|
-o trwasastrtoq.o wasastringtoquery.cpp
|
||||||
|
|
||||||
|
$(BIGLIB): force
|
||||||
|
cd $(depth)/lib;$(MAKE)
|
||||||
|
force:
|
||||||
|
|
||||||
|
|
||||||
depend: alldeps.stamp
|
depend: alldeps.stamp
|
||||||
alldeps.stamp : $(SRCS)
|
alldeps.stamp : $(SRCS)
|
||||||
$(CXX) -M $(ALL_CXXFLAGS) $(SRCS) > alldeps
|
$(CXX) -M $(ALL_CXXFLAGS) $(SRCS) > alldeps
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.4 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -17,7 +17,7 @@ static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 d
|
|||||||
* Free Software Foundation, Inc.,
|
* Free Software Foundation, Inc.,
|
||||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
*/
|
*/
|
||||||
#ifndef TEST_STRINGTOQUERY
|
#ifndef TEST_WASASTRINGTOQUERY
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@ -25,6 +25,13 @@ static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 d
|
|||||||
|
|
||||||
#include "wasastringtoquery.h"
|
#include "wasastringtoquery.h"
|
||||||
|
|
||||||
|
//#define DEB_WASASTRINGTOQ 1
|
||||||
|
#ifdef DEB_WASASTRINGTOQ
|
||||||
|
#define DPRINT(X) fprintf X
|
||||||
|
#else
|
||||||
|
#define DPRINT(X)
|
||||||
|
#endif
|
||||||
|
|
||||||
WasaQuery::~WasaQuery()
|
WasaQuery::~WasaQuery()
|
||||||
{
|
{
|
||||||
for (vector<WasaQuery*>::iterator it = m_subs.begin();
|
for (vector<WasaQuery*>::iterator it = m_subs.begin();
|
||||||
@ -61,16 +68,16 @@ void WasaQuery::describe(string &desc) const
|
|||||||
desc += ")";
|
desc += ")";
|
||||||
}
|
}
|
||||||
desc += "(";
|
desc += "(";
|
||||||
|
string fieldspec = m_fieldspec.empty() ? "" : m_fieldspec + ": ";
|
||||||
switch (m_op) {
|
switch (m_op) {
|
||||||
case OP_NULL:
|
case OP_NULL:
|
||||||
desc += "NULL";
|
desc += "NULL";
|
||||||
break;
|
break;
|
||||||
case OP_LEAF:
|
case OP_LEAF:
|
||||||
desc += m_fieldspec.empty() ?
|
desc += fieldspec + m_value;
|
||||||
m_value : m_fieldspec + ":" + m_value;
|
|
||||||
break;
|
break;
|
||||||
case OP_EXCL:
|
case OP_EXCL:
|
||||||
desc += string("NOT (" ) + m_value + ") ";
|
desc += string("NOT (" ) + fieldspec + m_value + ") ";
|
||||||
break;
|
break;
|
||||||
case OP_OR:
|
case OP_OR:
|
||||||
case OP_AND:
|
case OP_AND:
|
||||||
@ -84,6 +91,8 @@ void WasaQuery::describe(string &desc) const
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if (desc[desc.length() - 1] == ' ')
|
||||||
|
desc.erase(desc.length() - 1);
|
||||||
desc += ") ";
|
desc += ") ";
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -111,7 +120,7 @@ void WasaQuery::describe(string &desc) const
|
|||||||
* parenthesis increases the index, but we're not interested in all
|
* parenthesis increases the index, but we're not interested in all
|
||||||
*/
|
*/
|
||||||
static const char * parserExpr =
|
static const char * parserExpr =
|
||||||
"([oO][rR])" //1 OR is a special word
|
"([oO][rR])[[:space:]]*" //1 OR is a special word
|
||||||
"|"
|
"|"
|
||||||
"(" //2
|
"(" //2
|
||||||
"([+-])?" //3 Force or exclude indicator
|
"([+-])?" //3 Force or exclude indicator
|
||||||
@ -125,7 +134,7 @@ static const char * parserExpr =
|
|||||||
"|"
|
"|"
|
||||||
"([^[:space:]]+)" //9 ANormalTerm
|
"([^[:space:]]+)" //9 ANormalTerm
|
||||||
")"
|
")"
|
||||||
")"
|
")[[:space:]]*"
|
||||||
;
|
;
|
||||||
|
|
||||||
// For debugging the parser. But see also NMATCH
|
// For debugging the parser. But see also NMATCH
|
||||||
@ -236,17 +245,18 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
|||||||
reason = "Internal regular expression handling error";
|
reason = "Internal regular expression handling error";
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#if 0
|
|
||||||
if (loop) printf("Next part:\n");
|
#ifdef DEB_WASASTRINGTOQ
|
||||||
for (i = 0; i < NMATCH; i++) {
|
if (loop) DPRINT((stderr, "Next part:\n"));
|
||||||
|
for (unsigned int i = 0; i < NMATCH; i++) {
|
||||||
if (m_pmatch[i].rm_so == -1) continue;
|
if (m_pmatch[i].rm_so == -1) continue;
|
||||||
char match[maxmatchlen+1];
|
char match[maxmatchlen+1];
|
||||||
memcpy(match, m_cp + m_pmatch[i].rm_so,
|
memcpy(match, m_cp + m_pmatch[i].rm_so,
|
||||||
m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
|
m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
|
||||||
match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0;
|
match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0;
|
||||||
if (matchNames[i][0])
|
if (matchNames[i][0])
|
||||||
printf("%10s: [%s] (%d->%d)\n", matchNames[i], match,
|
DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match,
|
||||||
(int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo);
|
(int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
char match[maxmatchlen+1];
|
char match[maxmatchlen+1];
|
||||||
@ -348,14 +358,17 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
|||||||
if (prev_or) {
|
if (prev_or) {
|
||||||
// We're in an OR subquery, add new subquery
|
// We're in an OR subquery, add new subquery
|
||||||
orClause->m_subs.push_back(nclause);
|
orClause->m_subs.push_back(nclause);
|
||||||
|
DPRINT((stderr, "Adding to OR chain\n"));
|
||||||
} else {
|
} else {
|
||||||
if (orClause) {
|
if (orClause) {
|
||||||
// Getting out of OR. Add the OR subquery to the main one
|
// Getting out of OR. Add the OR subquery to the main one
|
||||||
query->m_subs.push_back(orClause);
|
query->m_subs.push_back(orClause);
|
||||||
|
DPRINT((stderr, "Adding OR chain to main\n"));
|
||||||
orClause = 0;
|
orClause = 0;
|
||||||
}
|
}
|
||||||
// Add new subquery to main one.
|
// Add new subquery to main one.
|
||||||
query->m_subs.push_back(nclause);
|
query->m_subs.push_back(nclause);
|
||||||
|
DPRINT((stderr, "Adding to main chain\n"));
|
||||||
}
|
}
|
||||||
prev_or = false;
|
prev_or = false;
|
||||||
}
|
}
|
||||||
@ -369,6 +382,12 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (orClause) {
|
||||||
|
// Getting out of OR. Add the OR subquery to the main one
|
||||||
|
query->m_subs.push_back(orClause);
|
||||||
|
DPRINT((stderr, "Adding OR chain to main\n"));
|
||||||
|
}
|
||||||
|
|
||||||
regfree(&m_rx);
|
regfree(&m_rx);
|
||||||
m_rxneedsfree = false;
|
m_rxneedsfree = false;
|
||||||
return query;
|
return query;
|
||||||
@ -404,4 +423,4 @@ int main(int argc, char **argv)
|
|||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // TEST_STRINGTOQUERY
|
#endif // TEST_WASASTRINGTOQUERY
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
|
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
|
||||||
#define _WASASTRINGTOQUERY_H_INCLUDED_
|
#define _WASASTRINGTOQUERY_H_INCLUDED_
|
||||||
/* @(#$Id: wasastringtoquery.h,v 1.3 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes */
|
/* @(#$Id: wasastringtoquery.h,v 1.4 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License as published by
|
* it under the terms of the GNU General Public License as published by
|
||||||
@ -40,23 +40,28 @@ public:
|
|||||||
{}
|
{}
|
||||||
~WasaQuery();
|
~WasaQuery();
|
||||||
|
|
||||||
// Get string describing the query tree from this point
|
/** Get string describing the query tree from this point */
|
||||||
void describe(string &desc) const;
|
void describe(string &desc) const;
|
||||||
|
|
||||||
|
/** Op to be performed on either value or subqueries */
|
||||||
WasaQuery::Op m_op;
|
WasaQuery::Op m_op;
|
||||||
|
|
||||||
|
/** Field specification if any (ie: title, author ...) */
|
||||||
string m_fieldspec;
|
string m_fieldspec;
|
||||||
/* Valid for op == OP_LEAF */
|
|
||||||
|
/* String value. Valid for op == OP_LEAF */
|
||||||
string m_value;
|
string m_value;
|
||||||
/* Valid for conjunctions */
|
|
||||||
|
/** Subqueries. Valid for conjunctions */
|
||||||
vector<WasaQuery*> m_subs;
|
vector<WasaQuery*> m_subs;
|
||||||
|
|
||||||
/* Restrict results to some file type, defined by either mime, app group,
|
/** Restrict results to some file type, defined by either mime,
|
||||||
* or extension */
|
* app group, or extension */
|
||||||
enum TypeKind {WQTK_NONE, WQTK_MIME, WQTK_GROUP, WQTK_EXT};
|
enum TypeKind {WQTK_NONE, WQTK_MIME, WQTK_GROUP, WQTK_EXT};
|
||||||
TypeKind m_typeKind;
|
TypeKind m_typeKind;
|
||||||
vector<string> m_types;
|
vector<string> m_types;
|
||||||
|
|
||||||
/* Sort on relevance, date, name or group */
|
/** Sort on relevance, date, name or group */
|
||||||
enum SortKind {WQSK_REL, WQSK_DATE, WQSK_ALPHA, WQSK_GROUP};
|
enum SortKind {WQSK_REL, WQSK_DATE, WQSK_ALPHA, WQSK_GROUP};
|
||||||
vector<SortKind> m_sortSpec;
|
vector<SortKind> m_sortSpec;
|
||||||
};
|
};
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.2 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.3 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#ifndef TEST_WASATORCL
|
#ifndef TEST_WASATORCL
|
||||||
|
|
||||||
@ -27,11 +27,13 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
|
|||||||
if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
|
if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
|
||||||
sdata->addClause
|
sdata->addClause
|
||||||
(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
|
(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
|
||||||
(*it)->m_value, 0));
|
(*it)->m_value, 0,
|
||||||
|
(*it)->m_fieldspec));
|
||||||
} else {
|
} else {
|
||||||
sdata->addClause
|
sdata->addClause
|
||||||
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
|
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
|
||||||
(*it)->m_value));
|
(*it)->m_value,
|
||||||
|
(*it)->m_fieldspec));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case WasaQuery::OP_EXCL:
|
case WasaQuery::OP_EXCL:
|
||||||
@ -41,7 +43,8 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
|
|||||||
sdata->addClause
|
sdata->addClause
|
||||||
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_EXCL,
|
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_EXCL,
|
||||||
string("\"") +
|
string("\"") +
|
||||||
(*it)->m_value + "\""));
|
(*it)->m_value + "\"",
|
||||||
|
(*it)->m_fieldspec));
|
||||||
break;
|
break;
|
||||||
case WasaQuery::OP_OR:
|
case WasaQuery::OP_OR:
|
||||||
// Concatenate all OR values as phrases. Hope there are no
|
// Concatenate all OR values as phrases. Hope there are no
|
||||||
@ -55,7 +58,8 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
|
|||||||
}
|
}
|
||||||
sdata->addClause
|
sdata->addClause
|
||||||
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR,
|
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR,
|
||||||
orvalue));
|
orvalue,
|
||||||
|
(*it)->m_fieldspec));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -105,7 +109,7 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
if (argc != 1) {
|
if (argc != 1) {
|
||||||
fprintf(stderr, "need one arg\n");
|
fprintf(stderr, "need one arg\n");
|
||||||
exit(1);
|
return 1;
|
||||||
}
|
}
|
||||||
const string str = *argv++;argc--;
|
const string str = *argv++;argc--;
|
||||||
string reason;
|
string reason;
|
||||||
@ -113,14 +117,12 @@ int main(int argc, char *argv[])
|
|||||||
RclConfig *config = recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
RclConfig *config = recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
||||||
if (config == 0 || !config->ok()) {
|
if (config == 0 || !config->ok()) {
|
||||||
cerr << "Configuration problem: " << reason << endl;
|
cerr << "Configuration problem: " << reason << endl;
|
||||||
exit(1);
|
return 1;
|
||||||
}
|
}
|
||||||
string dbdir = config->getDbDir();
|
string dbdir = config->getDbDir();
|
||||||
if (dbdir.empty()) {
|
if (dbdir.empty()) {
|
||||||
// Note: this will have to be replaced by a call to a
|
|
||||||
// configuration buildin dialog for initial configuration
|
|
||||||
cerr << "Configuration problem: " << "No dbdir" << endl;
|
cerr << "Configuration problem: " << "No dbdir" << endl;
|
||||||
exit(1);
|
return 1;
|
||||||
}
|
}
|
||||||
Rcl::Db rcldb;
|
Rcl::Db rcldb;
|
||||||
if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) {
|
if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) {
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.101 2006-12-19 12:11:21 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.102 2007-01-17 13:53:41 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -80,6 +80,7 @@ namespace Rcl {
|
|||||||
// Synthetic abstract marker (to discriminate from abstract actually
|
// Synthetic abstract marker (to discriminate from abstract actually
|
||||||
// found in doc)
|
// found in doc)
|
||||||
const static string rclSyntAbs = "?!#@";
|
const static string rclSyntAbs = "?!#@";
|
||||||
|
const static string emptystring;
|
||||||
|
|
||||||
// A class for data and methods that would have to expose
|
// A class for data and methods that would have to expose
|
||||||
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
|
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
|
||||||
@ -703,15 +704,24 @@ bool Db::isopen()
|
|||||||
return m_ndb->m_isopen;
|
return m_ndb->m_isopen;
|
||||||
}
|
}
|
||||||
|
|
||||||
// A small class to hold state while splitting text
|
// The text splitter callback class which receives words from the
|
||||||
|
// splitter and adds postings to the Xapian document.
|
||||||
class mySplitterCB : public TextSplitCB {
|
class mySplitterCB : public TextSplitCB {
|
||||||
public:
|
public:
|
||||||
Xapian::Document &doc;
|
Xapian::Document &doc; // Xapian document
|
||||||
Xapian::termpos basepos; // Base for document section
|
Xapian::termpos basepos; // Base for document section
|
||||||
Xapian::termpos curpos; // Last position sent to callback
|
Xapian::termpos curpos; // Current position. Used to set basepos for the
|
||||||
mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
|
// following section
|
||||||
|
mySplitterCB(Xapian::Document &d)
|
||||||
|
: doc(d), basepos(1), curpos(0)
|
||||||
{}
|
{}
|
||||||
bool takeword(const std::string &term, int pos, int, int);
|
bool takeword(const std::string &term, int pos, int, int);
|
||||||
|
void setprefix(const string& pref) {prefix = pref;}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// If prefix is set, we also add a posting for the prefixed terms
|
||||||
|
// (ie: for titles, add postings for both "term" and "Sterm")
|
||||||
|
string prefix;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Callback for the document to word splitting class during indexation
|
// Callback for the document to word splitting class during indexation
|
||||||
@ -731,7 +741,11 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
|
|||||||
// be possible to assign different weights to doc parts (ie title)
|
// be possible to assign different weights to doc parts (ie title)
|
||||||
// by using a higher value
|
// by using a higher value
|
||||||
curpos = pos;
|
curpos = pos;
|
||||||
doc.add_posting(term, basepos + curpos, 1);
|
pos += basepos;
|
||||||
|
doc.add_posting(term, pos, 1);
|
||||||
|
if (!prefix.empty()) {
|
||||||
|
doc.add_posting(prefix + term, pos, 1);
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
ermsg = e.get_msg().c_str();
|
ermsg = e.get_msg().c_str();
|
||||||
@ -804,8 +818,9 @@ bool Db::add(const string &fn, const Doc &idoc,
|
|||||||
doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
|
doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
|
||||||
}
|
}
|
||||||
doc.abstract = neutchars(doc.abstract, "\n\r");
|
doc.abstract = neutchars(doc.abstract, "\n\r");
|
||||||
doc.title = truncate_to_word(doc.title, 100);
|
doc.title = neutchars(truncate_to_word(doc.title, 150), "\n\r");
|
||||||
doc.keywords = truncate_to_word(doc.keywords, 300);
|
doc.author = neutchars(truncate_to_word(doc.author, 150), "\n\r");
|
||||||
|
doc.keywords = neutchars(truncate_to_word(doc.keywords, 300), "\n\r");
|
||||||
|
|
||||||
Xapian::Document newdocument;
|
Xapian::Document newdocument;
|
||||||
|
|
||||||
@ -824,13 +839,30 @@ bool Db::add(const string &fn, const Doc &idoc,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Split and index title
|
// Split and index title
|
||||||
LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str()));
|
if (!doc.title.empty()) {
|
||||||
if (!dumb_string(doc.title, noacc)) {
|
LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str()));
|
||||||
LOGERR(("Db::add: dumb_string failed\n"));
|
if (!dumb_string(doc.title, noacc)) {
|
||||||
return false;
|
LOGERR(("Db::add: dumb_string failed\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
splitData.setprefix("S"); // Subject
|
||||||
|
splitter.text_to_words(noacc);
|
||||||
|
splitData.setprefix(emptystring);
|
||||||
|
splitData.basepos += splitData.curpos + 100;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Split and index author
|
||||||
|
if (!doc.author.empty()) {
|
||||||
|
LOGDEB2(("Db::add: split author [%s]\n", doc.author.c_str()));
|
||||||
|
if (!dumb_string(doc.author, noacc)) {
|
||||||
|
LOGERR(("Db::add: dumb_string failed\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
splitData.setprefix("A");
|
||||||
|
splitter.text_to_words(noacc);
|
||||||
|
splitData.setprefix(emptystring);
|
||||||
|
splitData.basepos += splitData.curpos + 100;
|
||||||
}
|
}
|
||||||
splitter.text_to_words(noacc);
|
|
||||||
splitData.basepos += splitData.curpos + 100;
|
|
||||||
|
|
||||||
// Split and index body
|
// Split and index body
|
||||||
LOGDEB2(("Db::add: split body\n"));
|
LOGDEB2(("Db::add: split body\n"));
|
||||||
@ -842,13 +874,17 @@ bool Db::add(const string &fn, const Doc &idoc,
|
|||||||
splitData.basepos += splitData.curpos + 100;
|
splitData.basepos += splitData.curpos + 100;
|
||||||
|
|
||||||
// Split and index keywords
|
// Split and index keywords
|
||||||
LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str()));
|
if (!doc.keywords.empty()) {
|
||||||
if (!dumb_string(doc.keywords, noacc)) {
|
LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str()));
|
||||||
LOGERR(("Db::add: dumb_string failed\n"));
|
if (!dumb_string(doc.keywords, noacc)) {
|
||||||
return false;
|
LOGERR(("Db::add: dumb_string failed\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
splitData.setprefix("K");
|
||||||
|
splitter.text_to_words(noacc);
|
||||||
|
splitData.setprefix(emptystring);
|
||||||
|
splitData.basepos += splitData.curpos + 100;
|
||||||
}
|
}
|
||||||
splitter.text_to_words(noacc);
|
|
||||||
splitData.basepos += splitData.curpos + 100;
|
|
||||||
|
|
||||||
// Split and index abstract. We don't do this if it is synthetic
|
// Split and index abstract. We don't do this if it is synthetic
|
||||||
// any more (this used to give a relevance boost to the beginning
|
// any more (this used to give a relevance boost to the beginning
|
||||||
@ -946,6 +982,9 @@ bool Db::add(const string &fn, const Doc &idoc,
|
|||||||
record += "\ncaption=" + doc.title;
|
record += "\ncaption=" + doc.title;
|
||||||
record += "\nkeywords=" + doc.keywords;
|
record += "\nkeywords=" + doc.keywords;
|
||||||
record += "\nabstract=" + doc.abstract;
|
record += "\nabstract=" + doc.abstract;
|
||||||
|
if (!doc.author.empty()) {
|
||||||
|
record += "\nauthor=" + doc.author;
|
||||||
|
}
|
||||||
record += "\n";
|
record += "\n";
|
||||||
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
||||||
newdocument.set_data(record);
|
newdocument.set_data(record);
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _RCLDOC_H_INCLUDED_
|
#ifndef _RCLDOC_H_INCLUDED_
|
||||||
#define _RCLDOC_H_INCLUDED_
|
#define _RCLDOC_H_INCLUDED_
|
||||||
/* @(#$Id: rcldoc.h,v 1.1 2006-12-14 14:54:13 dockes Exp $ (C) 2006 J.F.Dockes */
|
/* @(#$Id: rcldoc.h,v 1.2 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@ -48,6 +48,7 @@ class Doc {
|
|||||||
string origcharset; // Charset we transcoded from (in case we want back)
|
string origcharset; // Charset we transcoded from (in case we want back)
|
||||||
// Possibly set by handler
|
// Possibly set by handler
|
||||||
string title; // Possibly set by handler
|
string title; // Possibly set by handler
|
||||||
|
string author; // Possibly set by handler
|
||||||
string keywords; // Possibly set by handler
|
string keywords; // Possibly set by handler
|
||||||
string abstract; // Possibly set by handler
|
string abstract; // Possibly set by handler
|
||||||
bool syntabs; // true if abstract is just the top of doc, not an
|
bool syntabs; // true if abstract is just the top of doc, not an
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.7 2006-12-19 12:11:21 dockes Exp $ (C) 2006 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -154,6 +154,7 @@ public:
|
|||||||
|
|
||||||
|
|
||||||
bool translate(const string &iq,
|
bool translate(const string &iq,
|
||||||
|
const string &prefix,
|
||||||
string &ermsg,
|
string &ermsg,
|
||||||
list<Xapian::Query> &pqueries,
|
list<Xapian::Query> &pqueries,
|
||||||
int slack = 0, bool useNear = false);
|
int slack = 0, bool useNear = false);
|
||||||
@ -257,6 +258,14 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prepend the field prefix to every term of the list, modifying the
// list in place. An empty prefix means there is nothing to add, and the
// list is left untouched.
static void addPrefix(list<string>& terms, const string& prefix)
|
||||||
|
{
|
||||||
|
if (prefix.empty())
|
||||||
|
return;
|
||||||
|
for (list<string>::iterator it = terms.begin(); it != terms.end(); it++)
|
||||||
|
it->insert(0, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Turn string into list of xapian queries. There is little
|
* Turn string into list of xapian queries. There is little
|
||||||
* interpretation done on the string (no +term -term or filename:term
|
* interpretation done on the string (no +term -term or filename:term
|
||||||
@ -271,6 +280,7 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
|
|||||||
* count)
|
* count)
|
||||||
*/
|
*/
|
||||||
bool StringToXapianQ::translate(const string &iq,
|
bool StringToXapianQ::translate(const string &iq,
|
||||||
|
const string &prefix,
|
||||||
string &ermsg,
|
string &ermsg,
|
||||||
list<Xapian::Query> &pqueries,
|
list<Xapian::Query> &pqueries,
|
||||||
int slack, bool useNear)
|
int slack, bool useNear)
|
||||||
@ -301,24 +311,25 @@ bool StringToXapianQ::translate(const string &iq,
|
|||||||
splitterS.text_to_words(*it);
|
splitterS.text_to_words(*it);
|
||||||
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
|
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
|
||||||
splitterW.text_to_words(*it);
|
splitterW.text_to_words(*it);
|
||||||
wsQData& splitData = splitDataS;
|
wsQData *splitData = &splitDataS;
|
||||||
if (splitDataS.terms.size() > 1 && splitDataS.terms.size() !=
|
if (splitDataS.terms.size() > 1 &&
|
||||||
splitDataW.terms.size())
|
splitDataS.terms.size() != splitDataW.terms.size())
|
||||||
splitData = splitDataW;
|
splitData = &splitDataW;
|
||||||
|
|
||||||
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
|
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
|
||||||
splitData.terms.size()));
|
splitData->terms.size()));
|
||||||
switch(splitData.terms.size()) {
|
switch(splitData->terms.size()) {
|
||||||
case 0: continue;// ??
|
case 0: continue;// ??
|
||||||
case 1: // Not a real phrase: one term
|
case 1: // Not a real phrase: one term
|
||||||
{
|
{
|
||||||
string term = splitData.terms.front();
|
string term = splitData->terms.front();
|
||||||
list<string> exp;
|
list<string> exp;
|
||||||
maybeStemExp(false, term, exp);
|
maybeStemExp(false, term, exp);
|
||||||
|
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
||||||
// Push either term or OR of stem-expanded set
|
// Push either term or OR of stem-expanded set
|
||||||
|
addPrefix(exp, prefix);
|
||||||
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||||
exp.begin(), exp.end()));
|
exp.begin(), exp.end()));
|
||||||
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@ -329,8 +340,8 @@ bool StringToXapianQ::translate(const string &iq,
|
|||||||
list<Xapian::Query> orqueries;
|
list<Xapian::Query> orqueries;
|
||||||
bool hadmultiple = false;
|
bool hadmultiple = false;
|
||||||
vector<vector<string> >groups;
|
vector<vector<string> >groups;
|
||||||
for (vector<string>::iterator it = splitData.terms.begin();
|
for (vector<string>::iterator it = splitData->terms.begin();
|
||||||
it != splitData.terms.end(); it++) {
|
it != splitData->terms.end(); it++) {
|
||||||
// Some versions of Xapian will accept only one OR clause
|
// Some versions of Xapian will accept only one OR clause
|
||||||
// inside NEAR, all others must be leafs
|
// inside NEAR, all others must be leafs
|
||||||
bool nostemexp =
|
bool nostemexp =
|
||||||
@ -341,6 +352,7 @@ bool StringToXapianQ::translate(const string &iq,
|
|||||||
maybeStemExp(nostemexp, *it, exp);
|
maybeStemExp(nostemexp, *it, exp);
|
||||||
|
|
||||||
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
||||||
|
addPrefix(exp, prefix);
|
||||||
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||||
exp.begin(), exp.end()));
|
exp.begin(), exp.end()));
|
||||||
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
||||||
@ -352,7 +364,7 @@ bool StringToXapianQ::translate(const string &iq,
|
|||||||
pqueries.push_back(Xapian::Query(op,
|
pqueries.push_back(Xapian::Query(op,
|
||||||
orqueries.begin(),
|
orqueries.begin(),
|
||||||
orqueries.end(),
|
orqueries.end(),
|
||||||
splitData.terms.size() + slack));
|
splitData->terms.size() + slack));
|
||||||
// Add NEAR/PHRASE groups to the highlighting data. Must
|
// Add NEAR/PHRASE groups to the highlighting data. Must
|
||||||
// push all combinations
|
// push all combinations
|
||||||
vector<vector<string> > allcombs;
|
vector<vector<string> > allcombs;
|
||||||
@ -378,6 +390,28 @@ bool StringToXapianQ::translate(const string &iq,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Try to translate field specification into field prefix. This should
|
||||||
|
// probably be an Rcl::Db method and much more configurable (store
|
||||||
|
// prefix translation list in config ?)
|
||||||
|
// Map a field name to the term prefix used for that field:
//   title, caption, subject -> "S"
//   author, from            -> "A"
//   keyword                 -> "K"
// Returns an empty string when the field name is not in the table.
// The name is passed through stringtolower() before the lookup
// (presumably making the match case-insensitive -- confirm against
// stringtolower's definition).
static string fieldToPrefix(const string& i_field)
|
||||||
|
{
|
||||||
|
static map<string, string> fldToPrefs;
|
||||||
|
// The translation table is filled on the first call only
if (fldToPrefs.empty()) {
|
||||||
|
fldToPrefs["title"] = "S";
|
||||||
|
fldToPrefs["caption"] = "S";
|
||||||
|
fldToPrefs["subject"] = "S";
|
||||||
|
fldToPrefs["author"] = "A";
|
||||||
|
fldToPrefs["from"] = "A";
|
||||||
|
fldToPrefs["keyword"] = "K";
|
||||||
|
}
|
||||||
|
// Work on a copy: the caller's string is const
string fld(i_field);
|
||||||
|
stringtolower(fld);
|
||||||
|
map<string, string>::const_iterator it = fldToPrefs.find(fld);
|
||||||
|
if (it != fldToPrefs.end())
|
||||||
|
return it->second;
|
||||||
|
// Unknown field: no prefix
return "";
|
||||||
|
}
|
||||||
|
|
||||||
// Translate a simple OR, AND, or EXCL search clause.
|
// Translate a simple OR, AND, or EXCL search clause.
|
||||||
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||||
const string& stemlang)
|
const string& stemlang)
|
||||||
@ -397,9 +431,12 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
string prefix;
|
||||||
|
if (!m_field.empty())
|
||||||
|
prefix = fieldToPrefix(m_field);
|
||||||
list<Xapian::Query> pqueries;
|
list<Xapian::Query> pqueries;
|
||||||
StringToXapianQ tr(db, stemlang);
|
StringToXapianQ tr(db, stemlang);
|
||||||
if (!tr.translate(m_text, m_reason, pqueries))
|
if (!tr.translate(m_text, prefix, m_reason, pqueries))
|
||||||
return false;
|
return false;
|
||||||
if (pqueries.empty()) {
|
if (pqueries.empty()) {
|
||||||
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||||
@ -437,12 +474,16 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
list<Xapian::Query> pqueries;
|
list<Xapian::Query> pqueries;
|
||||||
Xapian::Query nq;
|
Xapian::Query nq;
|
||||||
|
|
||||||
|
string prefix;
|
||||||
|
if (!m_field.empty())
|
||||||
|
prefix = fieldToPrefix(m_field);
|
||||||
|
|
||||||
// Use stringToXapianQueries to lowercase and simplify the phrase
|
// Use stringToXapianQueries to lowercase and simplify the phrase
|
||||||
// terms etc. The result should be a single element list
|
// terms etc. The result should be a single element list
|
||||||
string s = string("\"") + m_text + string("\"");
|
string s = string("\"") + m_text + string("\"");
|
||||||
bool useNear = m_tp == SCLT_NEAR;
|
bool useNear = m_tp == SCLT_NEAR;
|
||||||
StringToXapianQ tr(db, stemlang);
|
StringToXapianQ tr(db, stemlang);
|
||||||
if (!tr.translate(s, m_reason, pqueries, m_slack, useNear))
|
if (!tr.translate(s, prefix, m_reason, pqueries, m_slack, useNear))
|
||||||
return false;
|
return false;
|
||||||
if (pqueries.empty()) {
|
if (pqueries.empty()) {
|
||||||
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _SEARCHDATA_H_INCLUDED_
|
#ifndef _SEARCHDATA_H_INCLUDED_
|
||||||
#define _SEARCHDATA_H_INCLUDED_
|
#define _SEARCHDATA_H_INCLUDED_
|
||||||
/* @(#$Id: searchdata.h,v 1.7 2006-12-05 15:17:13 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: searchdata.h,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Structures to hold data coming almost directly from the gui
|
* Structures to hold data coming almost directly from the gui
|
||||||
@ -46,11 +46,22 @@ enum SClType {
|
|||||||
class SearchDataClause;
|
class SearchDataClause;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Holder for a list of search clauses. Some of the clauses may be references
|
Data structure representing a Recoll query.
|
||||||
* to other subqueries in the future. For now, they just reflect user entry in
|
This is currently simply a list of search clauses.
|
||||||
* a query field: type, some text and possibly a distance. Each clause may
|
|
||||||
* hold several queries in the Xapian sense, for example several terms
|
For now, clauses in the list just reflect user entry in a query
|
||||||
* and phrases as would result from ["this is a phrase" term1 term2]
|
field: some text, a clause type (AND/OR/NEAR etc.) and possibly a
|
||||||
|
distance. Each clause may hold several queries in the Xapian sense,
|
||||||
|
for example several terms and phrases as would result from
|
||||||
|
["this is a phrase" term1 term2]
|
||||||
|
|
||||||
|
This means that SearchData will be translated into a Xapian
|
||||||
|
Query tree of depth 2.
|
||||||
|
|
||||||
|
The structure might be extended in the future so that some of the
|
||||||
|
clauses may be references to other subqueries (there doesn't seem to
|
||||||
|
be an urgent need for this)
|
||||||
|
|
||||||
*/
|
*/
|
||||||
class SearchData {
|
class SearchData {
|
||||||
public:
|
public:
|
||||||
@ -134,15 +145,19 @@ protected:
|
|||||||
*/
|
*/
|
||||||
class SearchDataClauseSimple : public SearchDataClause {
|
class SearchDataClauseSimple : public SearchDataClause {
|
||||||
public:
|
public:
|
||||||
SearchDataClauseSimple(SClType tp, string txt)
|
SearchDataClauseSimple(SClType tp, const string& txt,
|
||||||
: SearchDataClause(tp), m_text(txt), m_slack(0) {}
|
const string& fld = "")
|
||||||
|
: SearchDataClause(tp), m_text(txt), m_field(fld), m_slack(0) {}
|
||||||
|
|
||||||
virtual ~SearchDataClauseSimple() {}
|
virtual ~SearchDataClauseSimple() {}
|
||||||
|
|
||||||
|
/** Translate to Xapian query */
|
||||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||||
|
|
||||||
virtual bool getTerms(vector<string>& terms,
|
/** Retrieve query terms and term groups. This is used for highlighting */
|
||||||
vector<vector<string> >& groups,
|
virtual bool getTerms(vector<string>& terms, /* Single terms */
|
||||||
vector<int>& gslks) const
|
vector<vector<string> >& groups, /* Prox grps */
|
||||||
|
vector<int>& gslks) const /* Prox slacks */
|
||||||
{
|
{
|
||||||
terms.insert(terms.end(), m_terms.begin(), m_terms.end());
|
terms.insert(terms.end(), m_terms.begin(), m_terms.end());
|
||||||
groups.insert(groups.end(), m_groups.begin(), m_groups.end());
|
groups.insert(groups.end(), m_groups.begin(), m_groups.end());
|
||||||
@ -151,7 +166,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
string m_text;
|
string m_text; // Raw user entry text.
|
||||||
|
string m_field; // Field specification if any
|
||||||
// Single terms and phrases resulting from breaking up m_text;
|
// Single terms and phrases resulting from breaking up m_text;
|
||||||
// valid after toNativeQuery() call
|
// valid after toNativeQuery() call
|
||||||
vector<string> m_terms;
|
vector<string> m_terms;
|
||||||
@ -161,10 +177,10 @@ protected:
|
|||||||
int m_slack;
|
int m_slack;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Filename search. */
|
/** Filename search clause. */
|
||||||
class SearchDataClauseFilename : public SearchDataClauseSimple {
|
class SearchDataClauseFilename : public SearchDataClauseSimple {
|
||||||
public:
|
public:
|
||||||
SearchDataClauseFilename(string txt)
|
SearchDataClauseFilename(const string& txt)
|
||||||
: SearchDataClauseSimple(SCLT_FILENAME, txt) {}
|
: SearchDataClauseSimple(SCLT_FILENAME, txt) {}
|
||||||
virtual ~SearchDataClauseFilename() {}
|
virtual ~SearchDataClauseFilename() {}
|
||||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||||
@ -176,8 +192,9 @@ public:
|
|||||||
*/
|
*/
|
||||||
class SearchDataClauseDist : public SearchDataClauseSimple {
|
class SearchDataClauseDist : public SearchDataClauseSimple {
|
||||||
public:
|
public:
|
||||||
SearchDataClauseDist(SClType tp, string txt, int slack)
|
SearchDataClauseDist(SClType tp, const string& txt, int slack,
|
||||||
: SearchDataClauseSimple(tp, txt) {m_slack = slack;}
|
const string& fld = "")
|
||||||
|
: SearchDataClauseSimple(tp, txt, fld) {m_slack = slack;}
|
||||||
virtual ~SearchDataClauseDist() {}
|
virtual ~SearchDataClauseDist() {}
|
||||||
|
|
||||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user