added field/prefixes for author and title + command line query language
This commit is contained in:
parent
ee85be5c61
commit
1d683ad411
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.24 2007-01-15 13:06:38 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.25 2007-01-17 13:53:40 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -236,6 +236,7 @@ static inline bool getKeyValue(const map<string, string>& docdata,
|
||||
}
|
||||
|
||||
static const string keyab("abstract");
|
||||
static const string keyau("author");
|
||||
static const string keycs("charset");
|
||||
static const string keyct("content");
|
||||
static const string keyfn("filename");
|
||||
@ -251,6 +252,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
||||
Dijon::Filter *df = m_handlers.back();
|
||||
const std::map<std::string, std::string>& docdata = df->get_meta_data();
|
||||
|
||||
getKeyValue(docdata, keyau, doc.author);
|
||||
getKeyValue(docdata, keyoc, doc.origcharset);
|
||||
getKeyValue(docdata, keyct, doc.text);
|
||||
getKeyValue(docdata, keytt, doc.title);
|
||||
|
||||
@ -122,6 +122,7 @@ bool MimeHandlerHtml::next_document()
|
||||
m_metaData["charset"] = "utf-8";
|
||||
m_metaData["title"] = result.title;
|
||||
m_metaData["keywords"] = result.keywords;
|
||||
m_metaData["author"] = result.author;
|
||||
m_metaData["modificationdate"] = result.dmtime;
|
||||
m_metaData["sample"] = result.sample;
|
||||
m_metaData["mimetype"] = "text/plain";
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.28 2007-01-13 10:28:37 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.29 2007-01-17 13:53:40 dockes Exp $ (C) 2005 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -217,6 +217,9 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
|
||||
if (doc->h.getFirstHeader("From", hi)) {
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
text += string("From: ") + transcoded + string("\n");
|
||||
if (depth == 1) {
|
||||
m_metaData["author"] = transcoded;
|
||||
}
|
||||
}
|
||||
if (doc->h.getFirstHeader("To", hi)) {
|
||||
rfc2047_decode(hi.getValue(), transcoded);
|
||||
@ -245,7 +248,7 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
|
||||
}
|
||||
text += '\n';
|
||||
|
||||
LOGDEB2(("MimeHandlerMail::rocessMsg:ismultipart %d mime subtype '%s'\n",
|
||||
LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
|
||||
doc->isMultipart(), doc->getSubType().c_str()));
|
||||
walkmime(doc, depth);
|
||||
|
||||
|
||||
@ -154,6 +154,11 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
string tmp = i->second;
|
||||
decode_entities(tmp);
|
||||
keywords += tmp;
|
||||
} else if (name == "author") {
|
||||
if (!author.empty()) author += ' ';
|
||||
string tmp = i->second;
|
||||
decode_entities(tmp);
|
||||
author += tmp;
|
||||
} else if (name == "date") {
|
||||
// Yes, this doesn't exist. It's output by filters
|
||||
// And the format isn't even standard http/html
|
||||
@ -168,19 +173,6 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
||||
dmtime = ascuxtime;
|
||||
}
|
||||
}
|
||||
#if 0 // We're not a robot, so we don't care about robots metainfo
|
||||
else if (name == "robots") {
|
||||
string val = i->second;
|
||||
decode_entities(val);
|
||||
lowercase_term(val);
|
||||
if (val.find("none") != string::npos ||
|
||||
val.find("noindex") != string::npos) {
|
||||
indexing_allowed = false;
|
||||
LOGDEB1(("myhtmlparse: robots/noindex\n"));
|
||||
throw false;
|
||||
}
|
||||
}
|
||||
#endif // 0
|
||||
} else if ((j = p.find("http-equiv")) != p.end()) {
|
||||
string hequiv = j->second;
|
||||
lowercase_term(hequiv);
|
||||
|
||||
@ -37,7 +37,7 @@ class MyHtmlParser : public HtmlParser {
|
||||
bool in_body_tag;
|
||||
bool in_pre_tag;
|
||||
bool pending_space;
|
||||
string title, sample, keywords, dump, dmtime;
|
||||
string title, sample, keywords, dump, dmtime, author;
|
||||
string ocharset; // This is the charset our user thinks the doc was
|
||||
string charset; // This is the charset it was supposedly converted to
|
||||
string doccharset; // Set this to value of charset parameter in header
|
||||
|
||||
@ -8,8 +8,8 @@ LIBS = librcl.a
|
||||
|
||||
all: $(LIBS)
|
||||
|
||||
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
|
||||
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
|
||||
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
|
||||
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
|
||||
|
||||
librcl.a : $(DEPS) $(OBJS) unac.o
|
||||
ar ru librcl.a $(OBJS) unac.o
|
||||
@ -57,6 +57,10 @@ history.o : ../query/history.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/history.cpp
|
||||
sortseq.o : ../query/sortseq.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/sortseq.cpp
|
||||
wasastringtoquery.o : ../query/wasastringtoquery.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasastringtoquery.cpp
|
||||
wasatorcl.o : ../query/wasatorcl.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasatorcl.cpp
|
||||
pathhash.o : ../rcldb/pathhash.cpp
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/pathhash.cpp
|
||||
rcldb.o : ../rcldb/rcldb.cpp
|
||||
@ -161,6 +165,12 @@ history.dep.stamp : ../query/history.cpp
|
||||
sortseq.dep.stamp : ../query/sortseq.cpp
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../query/sortseq.cpp > sortseq.dep
|
||||
touch sortseq.dep.stamp
|
||||
wasastringtoquery.dep.stamp : ../query/wasastringtoquery.cpp
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasastringtoquery.cpp > wasastringtoquery.dep
|
||||
touch wasastringtoquery.dep.stamp
|
||||
wasatorcl.dep.stamp : ../query/wasatorcl.cpp
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasatorcl.cpp > wasatorcl.dep
|
||||
touch wasatorcl.dep.stamp
|
||||
pathhash.dep.stamp : ../rcldb/pathhash.cpp
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/pathhash.cpp > pathhash.dep
|
||||
touch pathhash.dep.stamp
|
||||
@ -238,6 +248,8 @@ include mh_text.dep
|
||||
include docseq.dep
|
||||
include history.dep
|
||||
include sortseq.dep
|
||||
include wasastringtoquery.dep
|
||||
include wasatorcl.dep
|
||||
include pathhash.dep
|
||||
include rcldb.dep
|
||||
include searchdata.dep
|
||||
|
||||
@ -24,6 +24,8 @@ ${depth}/internfile/mh_text.cpp \
|
||||
${depth}/query/docseq.cpp \
|
||||
${depth}/query/history.cpp \
|
||||
${depth}/query/sortseq.cpp \
|
||||
${depth}/query/wasastringtoquery.cpp \
|
||||
${depth}/query/wasatorcl.cpp \
|
||||
${depth}/rcldb/pathhash.cpp \
|
||||
${depth}/rcldb/rcldb.cpp \
|
||||
${depth}/rcldb/searchdata.cpp \
|
||||
|
||||
@ -1,11 +1,11 @@
|
||||
depth = ..
|
||||
include $(depth)/mk/sysconf
|
||||
|
||||
PROGS = xadump #trhist qtry qxtry
|
||||
PROGS = xadump rclqlang #trhist qtry qxtry
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
SRCS = xadump.cpp
|
||||
SRCS = xadump.cpp rclqlang.cpp
|
||||
.cpp.o :
|
||||
$(CXX) -c $(ALL_CXXFLAGS) -o $@ $<
|
||||
|
||||
@ -14,6 +14,11 @@ xadump : $(XADUMP_OBJS)
|
||||
$(CXX) $(ALL_CXXFLAGS) -o xadump $(XADUMP_OBJS) \
|
||||
$(LIBICONV) $(LIBXAPIAN) $(LIBSYS)
|
||||
|
||||
RCLQLANG_OBJS= rclqlang.o $(BIGLIB)
|
||||
rclqlang : $(RCLQLANG_OBJS)
|
||||
$(CXX) $(ALL_CXXFLAGS) -o rclqlang $(RCLQLANG_OBJS) \
|
||||
$(LIBICONV) $(LIBXAPIAN) $(LIBSYS)
|
||||
|
||||
HISTORY_OBJS= trhist.o $(BIGLIB) $(MIMELIB)
|
||||
trhist : $(HISTORY_OBJS)
|
||||
$(CXX) $(ALL_CXXFLAGS) -o trhist $(HISTORY_OBJS) \
|
||||
@ -21,9 +26,19 @@ trhist : $(HISTORY_OBJS)
|
||||
trhist.o : history.cpp history.h
|
||||
$(CXX) $(ALL_CXXFLAGS) -DTEST_HISTORY -c -o trhist.o history.cpp
|
||||
|
||||
$(BIGLIB):
|
||||
cd $(depth)/lib;make
|
||||
|
||||
WASASTRINGTOQUERY_OBJS= trwasastrtoq.o $(BIGLIB) $(MIMELIB)
|
||||
trwasastrtoq : $(WASASTRINGTOQUERY_OBJS)
|
||||
$(CXX) $(ALL_CXXFLAGS) -o trwasastrtoq $(WASASTRINGTOQUERY_OBJS) \
|
||||
$(LIBICONV) $(LIBXAPIAN)
|
||||
trwasastrtoq.o : wasastringtoquery.cpp wasastringtoquery.h
|
||||
$(CXX) $(ALL_CXXFLAGS) -DTEST_WASASTRINGTOQUERY -c \
|
||||
-o trwasastrtoq.o wasastringtoquery.cpp
|
||||
|
||||
$(BIGLIB): force
|
||||
cd $(depth)/lib;$(MAKE)
|
||||
force:
|
||||
|
||||
|
||||
depend: alldeps.stamp
|
||||
alldeps.stamp : $(SRCS)
|
||||
$(CXX) -M $(ALL_CXXFLAGS) $(SRCS) > alldeps
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.4 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -17,7 +17,7 @@ static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 d
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef TEST_STRINGTOQUERY
|
||||
#ifndef TEST_WASASTRINGTOQUERY
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@ -25,6 +25,13 @@ static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 d
|
||||
|
||||
#include "wasastringtoquery.h"
|
||||
|
||||
//#define DEB_WASASTRINGTOQ 1
|
||||
#ifdef DEB_WASASTRINGTOQ
|
||||
#define DPRINT(X) fprintf X
|
||||
#else
|
||||
#define DPRINT(X)
|
||||
#endif
|
||||
|
||||
WasaQuery::~WasaQuery()
|
||||
{
|
||||
for (vector<WasaQuery*>::iterator it = m_subs.begin();
|
||||
@ -61,16 +68,16 @@ void WasaQuery::describe(string &desc) const
|
||||
desc += ")";
|
||||
}
|
||||
desc += "(";
|
||||
string fieldspec = m_fieldspec.empty() ? "" : m_fieldspec + ": ";
|
||||
switch (m_op) {
|
||||
case OP_NULL:
|
||||
desc += "NULL";
|
||||
break;
|
||||
case OP_LEAF:
|
||||
desc += m_fieldspec.empty() ?
|
||||
m_value : m_fieldspec + ":" + m_value;
|
||||
desc += fieldspec + m_value;
|
||||
break;
|
||||
case OP_EXCL:
|
||||
desc += string("NOT (" ) + m_value + ") ";
|
||||
desc += string("NOT (" ) + fieldspec + m_value + ") ";
|
||||
break;
|
||||
case OP_OR:
|
||||
case OP_AND:
|
||||
@ -84,6 +91,8 @@ void WasaQuery::describe(string &desc) const
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (desc[desc.length() - 1] == ' ')
|
||||
desc.erase(desc.length() - 1);
|
||||
desc += ") ";
|
||||
}
|
||||
|
||||
@ -111,7 +120,7 @@ void WasaQuery::describe(string &desc) const
|
||||
* parenthesis increases the index, but we're not interested in all
|
||||
*/
|
||||
static const char * parserExpr =
|
||||
"([oO][rR])" //1 OR is a special word
|
||||
"([oO][rR])[[:space:]]*" //1 OR is a special word
|
||||
"|"
|
||||
"(" //2
|
||||
"([+-])?" //3 Force or exclude indicator
|
||||
@ -125,7 +134,7 @@ static const char * parserExpr =
|
||||
"|"
|
||||
"([^[:space:]]+)" //9 ANormalTerm
|
||||
")"
|
||||
")"
|
||||
")[[:space:]]*"
|
||||
;
|
||||
|
||||
// For debugging the parser. But see also NMATCH
|
||||
@ -236,17 +245,18 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
||||
reason = "Internal regular expression handling error";
|
||||
return 0;
|
||||
}
|
||||
#if 0
|
||||
if (loop) printf("Next part:\n");
|
||||
for (i = 0; i < NMATCH; i++) {
|
||||
|
||||
#ifdef DEB_WASASTRINGTOQ
|
||||
if (loop) DPRINT((stderr, "Next part:\n"));
|
||||
for (unsigned int i = 0; i < NMATCH; i++) {
|
||||
if (m_pmatch[i].rm_so == -1) continue;
|
||||
char match[maxmatchlen+1];
|
||||
memcpy(match, m_cp + m_pmatch[i].rm_so,
|
||||
m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
|
||||
match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0;
|
||||
if (matchNames[i][0])
|
||||
printf("%10s: [%s] (%d->%d)\n", matchNames[i], match,
|
||||
(int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo);
|
||||
DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match,
|
||||
(int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo));
|
||||
}
|
||||
#endif
|
||||
char match[maxmatchlen+1];
|
||||
@ -348,14 +358,17 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
||||
if (prev_or) {
|
||||
// We're in an OR subquery, add new subquery
|
||||
orClause->m_subs.push_back(nclause);
|
||||
DPRINT((stderr, "Adding to OR chain\n"));
|
||||
} else {
|
||||
if (orClause) {
|
||||
// Getting out of OR. Add the OR subquery to the main one
|
||||
query->m_subs.push_back(orClause);
|
||||
DPRINT((stderr, "Adding OR chain to main\n"));
|
||||
orClause = 0;
|
||||
}
|
||||
// Add new subquery to main one.
|
||||
query->m_subs.push_back(nclause);
|
||||
DPRINT((stderr, "Adding to main chain\n"));
|
||||
}
|
||||
prev_or = false;
|
||||
}
|
||||
@ -369,6 +382,12 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
||||
break;
|
||||
}
|
||||
|
||||
if (orClause) {
|
||||
// Getting out of OR. Add the OR subquery to the main one
|
||||
query->m_subs.push_back(orClause);
|
||||
DPRINT((stderr, "Adding OR chain to main\n"));
|
||||
}
|
||||
|
||||
regfree(&m_rx);
|
||||
m_rxneedsfree = false;
|
||||
return query;
|
||||
@ -404,4 +423,4 @@ int main(int argc, char **argv)
|
||||
exit(0);
|
||||
}
|
||||
|
||||
#endif // TEST_STRINGTOQUERY
|
||||
#endif // TEST_WASASTRINGTOQUERY
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
|
||||
#define _WASASTRINGTOQUERY_H_INCLUDED_
|
||||
/* @(#$Id: wasastringtoquery.h,v 1.3 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||
/* @(#$Id: wasastringtoquery.h,v 1.4 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@ -40,23 +40,28 @@ public:
|
||||
{}
|
||||
~WasaQuery();
|
||||
|
||||
// Get string describing the query tree from this point
|
||||
/** Get string describing the query tree from this point */
|
||||
void describe(string &desc) const;
|
||||
|
||||
/** Op to be performed on either value or subqueries */
|
||||
WasaQuery::Op m_op;
|
||||
|
||||
/** Field specification if any (ie: title, author ...) */
|
||||
string m_fieldspec;
|
||||
/* Valid for op == OP_LEAF */
|
||||
|
||||
/* String value. Valid for op == OP_LEAF */
|
||||
string m_value;
|
||||
/* Valid for conjunctions */
|
||||
|
||||
/** Subqueries. Valid for conjunctions */
|
||||
vector<WasaQuery*> m_subs;
|
||||
|
||||
/* Restrict results to some file type, defined by either mime, app group,
|
||||
* or extension */
|
||||
/** Restrict results to some file type, defined by either mime,
|
||||
* app group, or extension */
|
||||
enum TypeKind {WQTK_NONE, WQTK_MIME, WQTK_GROUP, WQTK_EXT};
|
||||
TypeKind m_typeKind;
|
||||
vector<string> m_types;
|
||||
|
||||
/* Sort on relevance, date, name or group */
|
||||
/** Sort on relevance, date, name or group */
|
||||
enum SortKind {WQSK_REL, WQSK_DATE, WQSK_ALPHA, WQSK_GROUP};
|
||||
vector<SortKind> m_sortSpec;
|
||||
};
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.2 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.3 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
#ifndef TEST_WASATORCL
|
||||
|
||||
@ -27,11 +27,13 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
|
||||
if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
|
||||
sdata->addClause
|
||||
(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
|
||||
(*it)->m_value, 0));
|
||||
(*it)->m_value, 0,
|
||||
(*it)->m_fieldspec));
|
||||
} else {
|
||||
sdata->addClause
|
||||
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
|
||||
(*it)->m_value));
|
||||
(*it)->m_value,
|
||||
(*it)->m_fieldspec));
|
||||
}
|
||||
break;
|
||||
case WasaQuery::OP_EXCL:
|
||||
@ -41,7 +43,8 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
|
||||
sdata->addClause
|
||||
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_EXCL,
|
||||
string("\"") +
|
||||
(*it)->m_value + "\""));
|
||||
(*it)->m_value + "\"",
|
||||
(*it)->m_fieldspec));
|
||||
break;
|
||||
case WasaQuery::OP_OR:
|
||||
// Concatenate all OR values as phrases. Hope there are no
|
||||
@ -55,7 +58,8 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
|
||||
}
|
||||
sdata->addClause
|
||||
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR,
|
||||
orvalue));
|
||||
orvalue,
|
||||
(*it)->m_fieldspec));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -105,7 +109,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
if (argc != 1) {
|
||||
fprintf(stderr, "need one arg\n");
|
||||
exit(1);
|
||||
return 1;
|
||||
}
|
||||
const string str = *argv++;argc--;
|
||||
string reason;
|
||||
@ -113,14 +117,12 @@ int main(int argc, char *argv[])
|
||||
RclConfig *config = recollinit(RCLINIT_NONE, 0, 0, reason, 0);
|
||||
if (config == 0 || !config->ok()) {
|
||||
cerr << "Configuration problem: " << reason << endl;
|
||||
exit(1);
|
||||
return 1;
|
||||
}
|
||||
string dbdir = config->getDbDir();
|
||||
if (dbdir.empty()) {
|
||||
// Note: this will have to be replaced by a call to a
|
||||
// configuration buildin dialog for initial configuration
|
||||
cerr << "Configuration problem: " << "No dbdir" << endl;
|
||||
exit(1);
|
||||
return 1;
|
||||
}
|
||||
Rcl::Db rcldb;
|
||||
if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.101 2006-12-19 12:11:21 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.102 2007-01-17 13:53:41 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -80,6 +80,7 @@ namespace Rcl {
|
||||
// Synthetic abstract marker (to discriminate from abstract actually
|
||||
// found in doc)
|
||||
const static string rclSyntAbs = "?!#@";
|
||||
const static string emptystring;
|
||||
|
||||
// A class for data and methods that would have to expose
|
||||
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
|
||||
@ -703,15 +704,24 @@ bool Db::isopen()
|
||||
return m_ndb->m_isopen;
|
||||
}
|
||||
|
||||
// A small class to hold state while splitting text
|
||||
// The text splitter callback class which receives words from the
|
||||
// splitter and adds postings to the Xapian document.
|
||||
class mySplitterCB : public TextSplitCB {
|
||||
public:
|
||||
Xapian::Document &doc;
|
||||
Xapian::Document &doc; // Xapian document
|
||||
Xapian::termpos basepos; // Base for document section
|
||||
Xapian::termpos curpos; // Last position sent to callback
|
||||
mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
|
||||
Xapian::termpos curpos; // Current position. Used to set basepos for the
|
||||
// following section
|
||||
mySplitterCB(Xapian::Document &d)
|
||||
: doc(d), basepos(1), curpos(0)
|
||||
{}
|
||||
bool takeword(const std::string &term, int pos, int, int);
|
||||
void setprefix(const string& pref) {prefix = pref;}
|
||||
|
||||
private:
|
||||
// If prefix is set, we also add a posting for the prefixed terms
|
||||
// (ie: for titles, add postings for both "term" and "Sterm")
|
||||
string prefix;
|
||||
};
|
||||
|
||||
// Callback for the document to word splitting class during indexation
|
||||
@ -731,7 +741,11 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
|
||||
// be possible to assign different weights to doc parts (ie title)
|
||||
// by using a higher value
|
||||
curpos = pos;
|
||||
doc.add_posting(term, basepos + curpos, 1);
|
||||
pos += basepos;
|
||||
doc.add_posting(term, pos, 1);
|
||||
if (!prefix.empty()) {
|
||||
doc.add_posting(prefix + term, pos, 1);
|
||||
}
|
||||
return true;
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg().c_str();
|
||||
@ -804,8 +818,9 @@ bool Db::add(const string &fn, const Doc &idoc,
|
||||
doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
|
||||
}
|
||||
doc.abstract = neutchars(doc.abstract, "\n\r");
|
||||
doc.title = truncate_to_word(doc.title, 100);
|
||||
doc.keywords = truncate_to_word(doc.keywords, 300);
|
||||
doc.title = neutchars(truncate_to_word(doc.title, 150), "\n\r");
|
||||
doc.author = neutchars(truncate_to_word(doc.author, 150), "\n\r");
|
||||
doc.keywords = neutchars(truncate_to_word(doc.keywords, 300), "\n\r");
|
||||
|
||||
Xapian::Document newdocument;
|
||||
|
||||
@ -824,13 +839,30 @@ bool Db::add(const string &fn, const Doc &idoc,
|
||||
}
|
||||
|
||||
// Split and index title
|
||||
LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str()));
|
||||
if (!dumb_string(doc.title, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
if (!doc.title.empty()) {
|
||||
LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str()));
|
||||
if (!dumb_string(doc.title, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitData.setprefix("S"); // Subject
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.setprefix(emptystring);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
}
|
||||
|
||||
// Split and index author
|
||||
if (!doc.author.empty()) {
|
||||
LOGDEB2(("Db::add: split author [%s]\n", doc.author.c_str()));
|
||||
if (!dumb_string(doc.author, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitData.setprefix("A");
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.setprefix(emptystring);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
|
||||
// Split and index body
|
||||
LOGDEB2(("Db::add: split body\n"));
|
||||
@ -842,13 +874,17 @@ bool Db::add(const string &fn, const Doc &idoc,
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
|
||||
// Split and index keywords
|
||||
LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str()));
|
||||
if (!dumb_string(doc.keywords, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
if (!doc.keywords.empty()) {
|
||||
LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str()));
|
||||
if (!dumb_string(doc.keywords, noacc)) {
|
||||
LOGERR(("Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
}
|
||||
splitData.setprefix("K");
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.setprefix(emptystring);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
}
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
|
||||
// Split and index abstract. We don't do this if it is synthetic
|
||||
// any more (this used to give a relevance boost to the beginning
|
||||
@ -946,6 +982,9 @@ bool Db::add(const string &fn, const Doc &idoc,
|
||||
record += "\ncaption=" + doc.title;
|
||||
record += "\nkeywords=" + doc.keywords;
|
||||
record += "\nabstract=" + doc.abstract;
|
||||
if (!doc.author.empty()) {
|
||||
record += "\nauthor=" + doc.author;
|
||||
}
|
||||
record += "\n";
|
||||
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _RCLDOC_H_INCLUDED_
|
||||
#define _RCLDOC_H_INCLUDED_
|
||||
/* @(#$Id: rcldoc.h,v 1.1 2006-12-14 14:54:13 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||
/* @(#$Id: rcldoc.h,v 1.2 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
|
||||
@ -48,6 +48,7 @@ class Doc {
|
||||
string origcharset; // Charset we transcoded from (in case we want back)
|
||||
// Possibly set by handler
|
||||
string title; // Possibly set by handler
|
||||
string author; // Possibly set by handler
|
||||
string keywords; // Possibly set by handler
|
||||
string abstract; // Possibly set by handler
|
||||
bool syntabs; // true if abstract is just the top of doc, not an
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.7 2006-12-19 12:11:21 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -154,6 +154,7 @@ public:
|
||||
|
||||
|
||||
bool translate(const string &iq,
|
||||
const string &prefix,
|
||||
string &ermsg,
|
||||
list<Xapian::Query> &pqueries,
|
||||
int slack = 0, bool useNear = false);
|
||||
@ -257,6 +258,14 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Prepend a field prefix to every term of a list, in place.
 *
 * @param terms  terms to modify; left untouched when prefix is empty.
 * @param prefix string inserted at the front of each term.
 */
static void addPrefix(list<string>& terms, const string& prefix)
{
    // Nothing to do for unprefixed (plain) terms
    if (prefix.empty())
        return;
    for (list<string>::iterator it = terms.begin(); it != terms.end(); ++it)
        it->insert(0, prefix);
}
|
||||
|
||||
/**
|
||||
* Turn string into list of xapian queries. There is little
|
||||
* interpretation done on the string (no +term -term or filename:term
|
||||
@ -271,6 +280,7 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
|
||||
* count)
|
||||
*/
|
||||
bool StringToXapianQ::translate(const string &iq,
|
||||
const string &prefix,
|
||||
string &ermsg,
|
||||
list<Xapian::Query> &pqueries,
|
||||
int slack, bool useNear)
|
||||
@ -301,24 +311,25 @@ bool StringToXapianQ::translate(const string &iq,
|
||||
splitterS.text_to_words(*it);
|
||||
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
|
||||
splitterW.text_to_words(*it);
|
||||
wsQData& splitData = splitDataS;
|
||||
if (splitDataS.terms.size() > 1 && splitDataS.terms.size() !=
|
||||
splitDataW.terms.size())
|
||||
splitData = splitDataW;
|
||||
wsQData *splitData = &splitDataS;
|
||||
if (splitDataS.terms.size() > 1 &&
|
||||
splitDataS.terms.size() != splitDataW.terms.size())
|
||||
splitData = &splitDataW;
|
||||
|
||||
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
|
||||
splitData.terms.size()));
|
||||
switch(splitData.terms.size()) {
|
||||
splitData->terms.size()));
|
||||
switch(splitData->terms.size()) {
|
||||
case 0: continue;// ??
|
||||
case 1: // Not a real phrase: one term
|
||||
{
|
||||
string term = splitData.terms.front();
|
||||
string term = splitData->terms.front();
|
||||
list<string> exp;
|
||||
maybeStemExp(false, term, exp);
|
||||
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
||||
// Push either term or OR of stem-expanded set
|
||||
addPrefix(exp, prefix);
|
||||
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||
exp.begin(), exp.end()));
|
||||
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
||||
}
|
||||
break;
|
||||
|
||||
@ -329,8 +340,8 @@ bool StringToXapianQ::translate(const string &iq,
|
||||
list<Xapian::Query> orqueries;
|
||||
bool hadmultiple = false;
|
||||
vector<vector<string> >groups;
|
||||
for (vector<string>::iterator it = splitData.terms.begin();
|
||||
it != splitData.terms.end(); it++) {
|
||||
for (vector<string>::iterator it = splitData->terms.begin();
|
||||
it != splitData->terms.end(); it++) {
|
||||
// Some version of xapian will accept only one OR clause
|
||||
// inside NEAR, all others must be leafs
|
||||
bool nostemexp =
|
||||
@ -341,6 +352,7 @@ bool StringToXapianQ::translate(const string &iq,
|
||||
maybeStemExp(nostemexp, *it, exp);
|
||||
|
||||
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
||||
addPrefix(exp, prefix);
|
||||
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||
exp.begin(), exp.end()));
|
||||
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
||||
@ -352,7 +364,7 @@ bool StringToXapianQ::translate(const string &iq,
|
||||
pqueries.push_back(Xapian::Query(op,
|
||||
orqueries.begin(),
|
||||
orqueries.end(),
|
||||
splitData.terms.size() + slack));
|
||||
splitData->terms.size() + slack));
|
||||
// Add NEAR/PHRASE groups to the highlighting data. Must
|
||||
// push all combinations
|
||||
vector<vector<string> > allcombs;
|
||||
@ -378,6 +390,28 @@ bool StringToXapianQ::translate(const string &iq,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Try to translate field specification into field prefix. This should
|
||||
// probably be an Rcl::Db method and much more configurable (store
|
||||
// prefix translation list in config ?)
|
||||
static string fieldToPrefix(const string& i_field)
|
||||
{
|
||||
static map<string, string> fldToPrefs;
|
||||
if (fldToPrefs.empty()) {
|
||||
fldToPrefs["title"] = "S";
|
||||
fldToPrefs["caption"] = "S";
|
||||
fldToPrefs["subject"] = "S";
|
||||
fldToPrefs["author"] = "A";
|
||||
fldToPrefs["from"] = "A";
|
||||
fldToPrefs["keyword"] = "K";
|
||||
}
|
||||
string fld(i_field);
|
||||
stringtolower(fld);
|
||||
map<string, string>::const_iterator it = fldToPrefs.find(fld);
|
||||
if (it != fldToPrefs.end())
|
||||
return it->second;
|
||||
return "";
|
||||
}
|
||||
|
||||
// Translate a simple OR, AND, or EXCL search clause.
|
||||
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
const string& stemlang)
|
||||
@ -397,9 +431,12 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
||||
return false;
|
||||
}
|
||||
string prefix;
|
||||
if (!m_field.empty())
|
||||
prefix = fieldToPrefix(m_field);
|
||||
list<Xapian::Query> pqueries;
|
||||
StringToXapianQ tr(db, stemlang);
|
||||
if (!tr.translate(m_text, m_reason, pqueries))
|
||||
if (!tr.translate(m_text, prefix, m_reason, pqueries))
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||
@ -437,12 +474,16 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
||||
list<Xapian::Query> pqueries;
|
||||
Xapian::Query nq;
|
||||
|
||||
string prefix;
|
||||
if (!m_field.empty())
|
||||
prefix = fieldToPrefix(m_field);
|
||||
|
||||
// Use stringToXapianQueries to lowercase and simplify the phrase
|
||||
// terms etc. The result should be a single element list
|
||||
string s = string("\"") + m_text + string("\"");
|
||||
bool useNear = m_tp == SCLT_NEAR;
|
||||
StringToXapianQ tr(db, stemlang);
|
||||
if (!tr.translate(s, m_reason, pqueries, m_slack, useNear))
|
||||
if (!tr.translate(s, prefix, m_reason, pqueries, m_slack, useNear))
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _SEARCHDATA_H_INCLUDED_
|
||||
#define _SEARCHDATA_H_INCLUDED_
|
||||
/* @(#$Id: searchdata.h,v 1.7 2006-12-05 15:17:13 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: searchdata.h,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
/**
|
||||
* Structures to hold data coming almost directly from the gui
|
||||
@ -46,11 +46,22 @@ enum SClType {
|
||||
class SearchDataClause;
|
||||
|
||||
/**
|
||||
* Holder for a list of search clauses. Some of the clauses may be references
|
||||
* to other subqueries in the future. For now, they just reflect user entry in
|
||||
* a query field: type, some text and possibly a distance. Each clause may
|
||||
* hold several queries in the Xapian sense, for example several terms
|
||||
* and phrases as would result from ["this is a phrase" term1 term2]
|
||||
Data structure representing a Recoll query.
|
||||
This is currently simply a list of search clauses.
|
||||
|
||||
For now, clauses in the list just reflect user entry in a query
|
||||
field: some text, a clause type (AND/OR/NEAR etc.) and possibly a
|
||||
distance. Each clause may hold several queries in the Xapian sense,
|
||||
for example several terms and phrases as would result from
|
||||
["this is a phrase" term1 term2]
|
||||
|
||||
This means that SearchData will be translated into a Xapian
|
||||
Query tree of depth 2.
|
||||
|
||||
The structure might be extended in the future so that some of the
|
||||
clauses may be references to other subqueries (there doesn't seem to
|
||||
be an urgent need for this)
|
||||
|
||||
*/
|
||||
class SearchData {
|
||||
public:
|
||||
@ -134,15 +145,19 @@ protected:
|
||||
*/
|
||||
class SearchDataClauseSimple : public SearchDataClause {
|
||||
public:
|
||||
SearchDataClauseSimple(SClType tp, string txt)
|
||||
: SearchDataClause(tp), m_text(txt), m_slack(0) {}
|
||||
SearchDataClauseSimple(SClType tp, const string& txt,
|
||||
const string& fld = "")
|
||||
: SearchDataClause(tp), m_text(txt), m_field(fld), m_slack(0) {}
|
||||
|
||||
virtual ~SearchDataClauseSimple() {}
|
||||
|
||||
/** Translate to Xapian query */
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||
|
||||
virtual bool getTerms(vector<string>& terms,
|
||||
vector<vector<string> >& groups,
|
||||
vector<int>& gslks) const
|
||||
/** Retrieve query terms and term groups. This is used for highlighting */
|
||||
virtual bool getTerms(vector<string>& terms, /* Single terms */
|
||||
vector<vector<string> >& groups, /* Prox grps */
|
||||
vector<int>& gslks) const /* Prox slacks */
|
||||
{
|
||||
terms.insert(terms.end(), m_terms.begin(), m_terms.end());
|
||||
groups.insert(groups.end(), m_groups.begin(), m_groups.end());
|
||||
@ -151,7 +166,8 @@ public:
|
||||
}
|
||||
|
||||
protected:
|
||||
string m_text;
|
||||
string m_text; // Raw user entry text.
|
||||
string m_field; // Field specification if any
|
||||
// Single terms and phrases resulting from breaking up m_text;
|
||||
// valid after toNativeQuery() call
|
||||
vector<string> m_terms;
|
||||
@ -161,10 +177,10 @@ protected:
|
||||
int m_slack;
|
||||
};
|
||||
|
||||
/** Filename search. */
|
||||
/** Filename search clause. */
|
||||
class SearchDataClauseFilename : public SearchDataClauseSimple {
|
||||
public:
|
||||
SearchDataClauseFilename(string txt)
|
||||
SearchDataClauseFilename(const string& txt)
|
||||
: SearchDataClauseSimple(SCLT_FILENAME, txt) {}
|
||||
virtual ~SearchDataClauseFilename() {}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||
@ -176,8 +192,9 @@ public:
|
||||
*/
|
||||
class SearchDataClauseDist : public SearchDataClauseSimple {
|
||||
public:
|
||||
SearchDataClauseDist(SClType tp, string txt, int slack)
|
||||
: SearchDataClauseSimple(tp, txt) {m_slack = slack;}
|
||||
SearchDataClauseDist(SClType tp, const string& txt, int slack,
|
||||
const string& fld = "")
|
||||
: SearchDataClauseSimple(tp, txt, fld) {m_slack = slack;}
|
||||
virtual ~SearchDataClauseDist() {}
|
||||
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user