added field/prefixes for author and title + command line query language

This commit is contained in:
dockes 2007-01-17 13:53:41 +00:00
parent ee85be5c61
commit 1d683ad411
15 changed files with 256 additions and 105 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.24 2007-01-15 13:06:38 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.25 2007-01-17 13:53:40 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -236,6 +236,7 @@ static inline bool getKeyValue(const map<string, string>& docdata,
}
static const string keyab("abstract");
static const string keyau("author");
static const string keycs("charset");
static const string keyct("content");
static const string keyfn("filename");
@ -251,6 +252,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
Dijon::Filter *df = m_handlers.back();
const std::map<std::string, std::string>& docdata = df->get_meta_data();
getKeyValue(docdata, keyau, doc.author);
getKeyValue(docdata, keyoc, doc.origcharset);
getKeyValue(docdata, keyct, doc.text);
getKeyValue(docdata, keytt, doc.title);

View File

@ -122,6 +122,7 @@ bool MimeHandlerHtml::next_document()
m_metaData["charset"] = "utf-8";
m_metaData["title"] = result.title;
m_metaData["keywords"] = result.keywords;
m_metaData["author"] = result.author;
m_metaData["modificationdate"] = result.dmtime;
m_metaData["sample"] = result.sample;
m_metaData["mimetype"] = "text/plain";

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.28 2007-01-13 10:28:37 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.29 2007-01-17 13:53:40 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -217,6 +217,9 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
if (doc->h.getFirstHeader("From", hi)) {
rfc2047_decode(hi.getValue(), transcoded);
text += string("From: ") + transcoded + string("\n");
if (depth == 1) {
m_metaData["author"] = transcoded;
}
}
if (doc->h.getFirstHeader("To", hi)) {
rfc2047_decode(hi.getValue(), transcoded);
@ -245,7 +248,7 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
}
text += '\n';
LOGDEB2(("MimeHandlerMail::rocessMsg:ismultipart %d mime subtype '%s'\n",
LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
doc->isMultipart(), doc->getSubType().c_str()));
walkmime(doc, depth);

View File

@ -154,6 +154,11 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
string tmp = i->second;
decode_entities(tmp);
keywords += tmp;
} else if (name == "author") {
if (!author.empty()) author += ' ';
string tmp = i->second;
decode_entities(tmp);
author += tmp;
} else if (name == "date") {
// Yes this doesn't exist. It's output by filters
// And the format isn't even standard http/html
@ -168,19 +173,6 @@ MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
dmtime = ascuxtime;
}
}
#if 0 // We're not a robot, so we don't care about robots metainfo
else if (name == "robots") {
string val = i->second;
decode_entities(val);
lowercase_term(val);
if (val.find("none") != string::npos ||
val.find("noindex") != string::npos) {
indexing_allowed = false;
LOGDEB1(("myhtmlparse: robots/noindex\n"));
throw false;
}
}
#endif // 0
} else if ((j = p.find("http-equiv")) != p.end()) {
string hequiv = j->second;
lowercase_term(hequiv);

View File

@ -37,7 +37,7 @@ class MyHtmlParser : public HtmlParser {
bool in_body_tag;
bool in_pre_tag;
bool pending_space;
string title, sample, keywords, dump, dmtime;
string title, sample, keywords, dump, dmtime, author;
string ocharset; // This is the charset our user thinks the doc was
string charset; // This is the charset it was supposedly converted to
string doccharset; // Set this to value of charset parameter in header

View File

@ -8,8 +8,8 @@ LIBS = librcl.a
all: $(LIBS)
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o
DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp
librcl.a : $(DEPS) $(OBJS) unac.o
ar ru librcl.a $(OBJS) unac.o
@ -57,6 +57,10 @@ history.o : ../query/history.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../query/history.cpp
sortseq.o : ../query/sortseq.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../query/sortseq.cpp
wasastringtoquery.o : ../query/wasastringtoquery.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasastringtoquery.cpp
wasatorcl.o : ../query/wasatorcl.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../query/wasatorcl.cpp
pathhash.o : ../rcldb/pathhash.cpp
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/pathhash.cpp
rcldb.o : ../rcldb/rcldb.cpp
@ -161,6 +165,12 @@ history.dep.stamp : ../query/history.cpp
sortseq.dep.stamp : ../query/sortseq.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../query/sortseq.cpp > sortseq.dep
touch sortseq.dep.stamp
wasastringtoquery.dep.stamp : ../query/wasastringtoquery.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasastringtoquery.cpp > wasastringtoquery.dep
touch wasastringtoquery.dep.stamp
wasatorcl.dep.stamp : ../query/wasatorcl.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../query/wasatorcl.cpp > wasatorcl.dep
touch wasatorcl.dep.stamp
pathhash.dep.stamp : ../rcldb/pathhash.cpp
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/pathhash.cpp > pathhash.dep
touch pathhash.dep.stamp
@ -238,6 +248,8 @@ include mh_text.dep
include docseq.dep
include history.dep
include sortseq.dep
include wasastringtoquery.dep
include wasatorcl.dep
include pathhash.dep
include rcldb.dep
include searchdata.dep

View File

@ -24,6 +24,8 @@ ${depth}/internfile/mh_text.cpp \
${depth}/query/docseq.cpp \
${depth}/query/history.cpp \
${depth}/query/sortseq.cpp \
${depth}/query/wasastringtoquery.cpp \
${depth}/query/wasatorcl.cpp \
${depth}/rcldb/pathhash.cpp \
${depth}/rcldb/rcldb.cpp \
${depth}/rcldb/searchdata.cpp \

View File

@ -1,11 +1,11 @@
depth = ..
include $(depth)/mk/sysconf
PROGS = xadump #trhist qtry qxtry
PROGS = xadump rclqlang #trhist qtry qxtry
all: $(PROGS)
SRCS = xadump.cpp
SRCS = xadump.cpp rclqlang.cpp
.cpp.o :
$(CXX) -c $(ALL_CXXFLAGS) -o $@ $<
@ -14,6 +14,11 @@ xadump : $(XADUMP_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o xadump $(XADUMP_OBJS) \
$(LIBICONV) $(LIBXAPIAN) $(LIBSYS)
RCLQLANG_OBJS= rclqlang.o $(BIGLIB)
rclqlang : $(RCLQLANG_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o rclqlang $(RCLQLANG_OBJS) \
$(LIBICONV) $(LIBXAPIAN) $(LIBSYS)
HISTORY_OBJS= trhist.o $(BIGLIB) $(MIMELIB)
trhist : $(HISTORY_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o trhist $(HISTORY_OBJS) \
@ -21,9 +26,19 @@ trhist : $(HISTORY_OBJS)
trhist.o : history.cpp history.h
$(CXX) $(ALL_CXXFLAGS) -DTEST_HISTORY -c -o trhist.o history.cpp
$(BIGLIB):
cd $(depth)/lib;make
WASASTRINGTOQUERY_OBJS= trwasastrtoq.o $(BIGLIB) $(MIMELIB)
trwasastrtoq : $(WASASTRINGTOQUERY_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o trwasastrtoq $(WASASTRINGTOQUERY_OBJS) \
$(LIBICONV) $(LIBXAPIAN)
trwasastrtoq.o : wasastringtoquery.cpp wasastringtoquery.h
$(CXX) $(ALL_CXXFLAGS) -DTEST_WASASTRINGTOQUERY -c \
-o trwasastrtoq.o wasastringtoquery.cpp
$(BIGLIB): force
cd $(depth)/lib;$(MAKE)
force:
depend: alldeps.stamp
alldeps.stamp : $(SRCS)
$(CXX) -M $(ALL_CXXFLAGS) $(SRCS) > alldeps

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.4 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -17,7 +17,7 @@ static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 d
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef TEST_STRINGTOQUERY
#ifndef TEST_WASASTRINGTOQUERY
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -25,6 +25,13 @@ static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 d
#include "wasastringtoquery.h"
//#define DEB_WASASTRINGTOQ 1
#ifdef DEB_WASASTRINGTOQ
#define DPRINT(X) fprintf X
#else
#define DPRINT(X)
#endif
WasaQuery::~WasaQuery()
{
for (vector<WasaQuery*>::iterator it = m_subs.begin();
@ -61,16 +68,16 @@ void WasaQuery::describe(string &desc) const
desc += ")";
}
desc += "(";
string fieldspec = m_fieldspec.empty() ? "" : m_fieldspec + ": ";
switch (m_op) {
case OP_NULL:
desc += "NULL";
break;
case OP_LEAF:
desc += m_fieldspec.empty() ?
m_value : m_fieldspec + ":" + m_value;
desc += fieldspec + m_value;
break;
case OP_EXCL:
desc += string("NOT (" ) + m_value + ") ";
desc += string("NOT (" ) + fieldspec + m_value + ") ";
break;
case OP_OR:
case OP_AND:
@ -84,6 +91,8 @@ void WasaQuery::describe(string &desc) const
}
break;
}
if (desc[desc.length() - 1] == ' ')
desc.erase(desc.length() - 1);
desc += ") ";
}
@ -111,7 +120,7 @@ void WasaQuery::describe(string &desc) const
* parenthesis increases the index, but we're not interested in all
*/
static const char * parserExpr =
"([oO][rR])" //1 OR is a special word
"([oO][rR])[[:space:]]*" //1 OR is a special word
"|"
"(" //2
"([+-])?" //3 Force or exclude indicator
@ -125,7 +134,7 @@ static const char * parserExpr =
"|"
"([^[:space:]]+)" //9 ANormalTerm
")"
")"
")[[:space:]]*"
;
// For debugging the parser. But see also NMATCH
@ -236,17 +245,18 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
reason = "Internal regular expression handling error";
return 0;
}
#if 0
if (loop) printf("Next part:\n");
for (i = 0; i < NMATCH; i++) {
#ifdef DEB_WASASTRINGTOQ
if (loop) DPRINT((stderr, "Next part:\n"));
for (unsigned int i = 0; i < NMATCH; i++) {
if (m_pmatch[i].rm_so == -1) continue;
char match[maxmatchlen+1];
memcpy(match, m_cp + m_pmatch[i].rm_so,
m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0;
if (matchNames[i][0])
printf("%10s: [%s] (%d->%d)\n", matchNames[i], match,
(int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo);
DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match,
(int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo));
}
#endif
char match[maxmatchlen+1];
@ -348,14 +358,17 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
if (prev_or) {
// We're in an OR subquery, add new subquery
orClause->m_subs.push_back(nclause);
DPRINT((stderr, "Adding to OR chain\n"));
} else {
if (orClause) {
// Getting out of OR. Add the OR subquery to the main one
query->m_subs.push_back(orClause);
DPRINT((stderr, "Adding OR chain to main\n"));
orClause = 0;
}
// Add new subquery to main one.
query->m_subs.push_back(nclause);
DPRINT((stderr, "Adding to main chain\n"));
}
prev_or = false;
}
@ -369,6 +382,12 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
break;
}
if (orClause) {
// Getting out of OR. Add the OR subquery to the main one
query->m_subs.push_back(orClause);
DPRINT((stderr, "Adding OR chain to main\n"));
}
regfree(&m_rx);
m_rxneedsfree = false;
return query;
@ -404,4 +423,4 @@ int main(int argc, char **argv)
exit(0);
}
#endif // TEST_STRINGTOQUERY
#endif // TEST_WASASTRINGTOQUERY

View File

@ -1,6 +1,6 @@
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
#define _WASASTRINGTOQUERY_H_INCLUDED_
/* @(#$Id: wasastringtoquery.h,v 1.3 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes */
/* @(#$Id: wasastringtoquery.h,v 1.4 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -40,23 +40,28 @@ public:
{}
~WasaQuery();
// Get string describing the query tree from this point
/** Get string describing the query tree from this point */
void describe(string &desc) const;
/** Op to be performed on either value or subqueries */
WasaQuery::Op m_op;
/** Field specification if any (ie: title, author ...) */
string m_fieldspec;
/* Valid for op == OP_LEAF */
/* String value. Valid for op == OP_LEAF and op == OP_EXCL */
string m_value;
/* Valid for conjunctions */
/** Subqueries. Valid for conjunctions */
vector<WasaQuery*> m_subs;
/* Restrict results to some file type, defined by either mime, app group,
* or extension */
/** Restrict results to some file type, defined by either mime,
* app group, or extension */
enum TypeKind {WQTK_NONE, WQTK_MIME, WQTK_GROUP, WQTK_EXT};
TypeKind m_typeKind;
vector<string> m_types;
/* Sort on relevance, date, name or group */
/** Sort on relevance, date, name or group */
enum SortKind {WQSK_REL, WQSK_DATE, WQSK_ALPHA, WQSK_GROUP};
vector<SortKind> m_sortSpec;
};

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.2 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.3 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
#ifndef TEST_WASATORCL
@ -27,11 +27,13 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
sdata->addClause
(new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
(*it)->m_value, 0));
(*it)->m_value, 0,
(*it)->m_fieldspec));
} else {
sdata->addClause
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
(*it)->m_value));
(*it)->m_value,
(*it)->m_fieldspec));
}
break;
case WasaQuery::OP_EXCL:
@ -41,7 +43,8 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
sdata->addClause
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_EXCL,
string("\"") +
(*it)->m_value + "\""));
(*it)->m_value + "\"",
(*it)->m_fieldspec));
break;
case WasaQuery::OP_OR:
// Concatenate all OR values as phrases. Hope there are no
@ -55,7 +58,8 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
}
sdata->addClause
(new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR,
orvalue));
orvalue,
(*it)->m_fieldspec));
}
}
}
@ -105,7 +109,7 @@ int main(int argc, char *argv[])
if (argc != 1) {
fprintf(stderr, "need one arg\n");
exit(1);
return 1;
}
const string str = *argv++;argc--;
string reason;
@ -113,14 +117,12 @@ int main(int argc, char *argv[])
RclConfig *config = recollinit(RCLINIT_NONE, 0, 0, reason, 0);
if (config == 0 || !config->ok()) {
cerr << "Configuration problem: " << reason << endl;
exit(1);
return 1;
}
string dbdir = config->getDbDir();
if (dbdir.empty()) {
// Note: this will have to be replaced by a call to a
// configuration buildin dialog for initial configuration
cerr << "Configuration problem: " << "No dbdir" << endl;
exit(1);
return 1;
}
Rcl::Db rcldb;
if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) {

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.101 2006-12-19 12:11:21 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.102 2007-01-17 13:53:41 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -80,6 +80,7 @@ namespace Rcl {
// Synthetic abstract marker (to discriminate from abstract actually
// found in doc)
const static string rclSyntAbs = "?!#@";
const static string emptystring;
// A class for data and methods that would have to expose
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
@ -703,15 +704,24 @@ bool Db::isopen()
return m_ndb->m_isopen;
}
// A small class to hold state while splitting text
// The text splitter callback class which receives words from the
// splitter and adds postings to the Xapian document.
class mySplitterCB : public TextSplitCB {
public:
Xapian::Document &doc;
Xapian::Document &doc; // Xapian document
Xapian::termpos basepos; // Base for document section
Xapian::termpos curpos; // Last position sent to callback
mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
Xapian::termpos curpos; // Current position. Used to set basepos for the
// following section
mySplitterCB(Xapian::Document &d)
: doc(d), basepos(1), curpos(0)
{}
bool takeword(const std::string &term, int pos, int, int);
void setprefix(const string& pref) {prefix = pref;}
private:
// If prefix is set, we also add a posting for the prefixed terms
// (ie: for titles, add postings for both "term" and "Sterm")
string prefix;
};
// Callback for the document to word splitting class during indexation
@ -731,7 +741,11 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
// be possible to assign different weights to doc parts (ie title)
// by using a higher value
curpos = pos;
doc.add_posting(term, basepos + curpos, 1);
pos += basepos;
doc.add_posting(term, pos, 1);
if (!prefix.empty()) {
doc.add_posting(prefix + term, pos, 1);
}
return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
@ -804,8 +818,9 @@ bool Db::add(const string &fn, const Doc &idoc,
doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
}
doc.abstract = neutchars(doc.abstract, "\n\r");
doc.title = truncate_to_word(doc.title, 100);
doc.keywords = truncate_to_word(doc.keywords, 300);
doc.title = neutchars(truncate_to_word(doc.title, 150), "\n\r");
doc.author = neutchars(truncate_to_word(doc.author, 150), "\n\r");
doc.keywords = neutchars(truncate_to_word(doc.keywords, 300), "\n\r");
Xapian::Document newdocument;
@ -824,13 +839,30 @@ bool Db::add(const string &fn, const Doc &idoc,
}
// Split and index title
LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str()));
if (!dumb_string(doc.title, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
if (!doc.title.empty()) {
LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str()));
if (!dumb_string(doc.title, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
}
splitData.setprefix("S"); // Subject
splitter.text_to_words(noacc);
splitData.setprefix(emptystring);
splitData.basepos += splitData.curpos + 100;
}
// Split and index author
if (!doc.author.empty()) {
LOGDEB2(("Db::add: split author [%s]\n", doc.author.c_str()));
if (!dumb_string(doc.author, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
}
splitData.setprefix("A");
splitter.text_to_words(noacc);
splitData.setprefix(emptystring);
splitData.basepos += splitData.curpos + 100;
}
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
// Split and index body
LOGDEB2(("Db::add: split body\n"));
@ -842,13 +874,17 @@ bool Db::add(const string &fn, const Doc &idoc,
splitData.basepos += splitData.curpos + 100;
// Split and index keywords
LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str()));
if (!dumb_string(doc.keywords, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
if (!doc.keywords.empty()) {
LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str()));
if (!dumb_string(doc.keywords, noacc)) {
LOGERR(("Db::add: dumb_string failed\n"));
return false;
}
splitData.setprefix("K");
splitter.text_to_words(noacc);
splitData.setprefix(emptystring);
splitData.basepos += splitData.curpos + 100;
}
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
// Split and index abstract. We don't do this if it is synthetic
// any more (this used to give a relevance boost to the beginning
@ -946,6 +982,9 @@ bool Db::add(const string &fn, const Doc &idoc,
record += "\ncaption=" + doc.title;
record += "\nkeywords=" + doc.keywords;
record += "\nabstract=" + doc.abstract;
if (!doc.author.empty()) {
record += "\nauthor=" + doc.author;
}
record += "\n";
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
newdocument.set_data(record);

View File

@ -16,7 +16,7 @@
*/
#ifndef _RCLDOC_H_INCLUDED_
#define _RCLDOC_H_INCLUDED_
/* @(#$Id: rcldoc.h,v 1.1 2006-12-14 14:54:13 dockes Exp $ (C) 2006 J.F.Dockes */
/* @(#$Id: rcldoc.h,v 1.2 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */
#include <string>
@ -48,6 +48,7 @@ class Doc {
string origcharset; // Charset we transcoded from (in case we want back)
// Possibly set by handler
string title; // Possibly set by handler
string author; // Possibly set by handler
string keywords; // Possibly set by handler
string abstract; // Possibly set by handler
bool syntabs; // true if abstract is just the top of doc, not an

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.7 2006-12-19 12:11:21 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -154,6 +154,7 @@ public:
bool translate(const string &iq,
const string &prefix,
string &ermsg,
list<Xapian::Query> &pqueries,
int slack = 0, bool useNear = false);
@ -257,6 +258,14 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
}
}
// Prepend 'prefix' to every string held in 'terms', modifying the
// list in place. An empty prefix leaves the list untouched.
static void addPrefix(list<string>& terms, const string& prefix)
{
    if (!prefix.empty()) {
        list<string>::iterator cur = terms.begin();
        const list<string>::iterator last = terms.end();
        while (cur != last) {
            cur->insert(0, prefix);
            ++cur;
        }
    }
}
/**
* Turn string into list of xapian queries. There is little
* interpretation done on the string (no +term -term or filename:term
@ -271,6 +280,7 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
* count)
*/
bool StringToXapianQ::translate(const string &iq,
const string &prefix,
string &ermsg,
list<Xapian::Query> &pqueries,
int slack, bool useNear)
@ -301,24 +311,25 @@ bool StringToXapianQ::translate(const string &iq,
splitterS.text_to_words(*it);
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
splitterW.text_to_words(*it);
wsQData& splitData = splitDataS;
if (splitDataS.terms.size() > 1 && splitDataS.terms.size() !=
splitDataW.terms.size())
splitData = splitDataW;
wsQData *splitData = &splitDataS;
if (splitDataS.terms.size() > 1 &&
splitDataS.terms.size() != splitDataW.terms.size())
splitData = &splitDataW;
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
splitData.terms.size()));
switch(splitData.terms.size()) {
splitData->terms.size()));
switch(splitData->terms.size()) {
case 0: continue;// ??
case 1: // Not a real phrase: one term
{
string term = splitData.terms.front();
string term = splitData->terms.front();
list<string> exp;
maybeStemExp(false, term, exp);
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
// Push either term or OR of stem-expanded set
addPrefix(exp, prefix);
pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end()));
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
}
break;
@ -329,8 +340,8 @@ bool StringToXapianQ::translate(const string &iq,
list<Xapian::Query> orqueries;
bool hadmultiple = false;
vector<vector<string> >groups;
for (vector<string>::iterator it = splitData.terms.begin();
it != splitData.terms.end(); it++) {
for (vector<string>::iterator it = splitData->terms.begin();
it != splitData->terms.end(); it++) {
// Some version of xapian will accept only one OR clause
// inside NEAR, all others must be leafs
bool nostemexp =
@ -341,6 +352,7 @@ bool StringToXapianQ::translate(const string &iq,
maybeStemExp(nostemexp, *it, exp);
groups.push_back(vector<string>(exp.begin(), exp.end()));
addPrefix(exp, prefix);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end()));
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
@ -352,7 +364,7 @@ bool StringToXapianQ::translate(const string &iq,
pqueries.push_back(Xapian::Query(op,
orqueries.begin(),
orqueries.end(),
splitData.terms.size() + slack));
splitData->terms.size() + slack));
// Add NEAR/PHRASE groups to the highlighting data. Must
// push all combinations
vector<vector<string> > allcombs;
@ -378,6 +390,28 @@ bool StringToXapianQ::translate(const string &iq,
return true;
}
// Map a user-supplied field name to the index term prefix used for
// that field. The field name is lowercased before the lookup; a name
// that is not in the table yields an empty prefix. The table is hard
// coded for now: it would probably be better placed in Rcl::Db and
// made configurable (prefix translation list stored in the config ?)
static string fieldToPrefix(const string& i_field)
{
    // Known field names (lowercase) and the prefix each one maps to
    static const char *const fieldPrefixes[][2] = {
        {"title", "S"},
        {"caption", "S"},
        {"subject", "S"},
        {"author", "A"},
        {"from", "A"},
        {"keyword", "K"},
    };
    static const unsigned int nprefixes =
        sizeof(fieldPrefixes) / sizeof(fieldPrefixes[0]);

    string lowered(i_field);
    stringtolower(lowered);

    for (unsigned int idx = 0; idx < nprefixes; idx++) {
        if (lowered == fieldPrefixes[idx][0])
            return fieldPrefixes[idx][1];
    }
    return string();
}
// Translate a simple OR, AND, or EXCL search clause.
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
const string& stemlang)
@ -397,9 +431,12 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
return false;
}
string prefix;
if (!m_field.empty())
prefix = fieldToPrefix(m_field);
list<Xapian::Query> pqueries;
StringToXapianQ tr(db, stemlang);
if (!tr.translate(m_text, m_reason, pqueries))
if (!tr.translate(m_text, prefix, m_reason, pqueries))
return false;
if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
@ -437,12 +474,16 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
list<Xapian::Query> pqueries;
Xapian::Query nq;
string prefix;
if (!m_field.empty())
prefix = fieldToPrefix(m_field);
// Use stringToXapianQueries to lowercase and simplify the phrase
// terms etc. The result should be a single element list
string s = string("\"") + m_text + string("\"");
bool useNear = m_tp == SCLT_NEAR;
StringToXapianQ tr(db, stemlang);
if (!tr.translate(s, m_reason, pqueries, m_slack, useNear))
if (!tr.translate(s, prefix, m_reason, pqueries, m_slack, useNear))
return false;
if (pqueries.empty()) {
LOGERR(("SearchDataClauseDist: resolved to null query\n"));

View File

@ -16,7 +16,7 @@
*/
#ifndef _SEARCHDATA_H_INCLUDED_
#define _SEARCHDATA_H_INCLUDED_
/* @(#$Id: searchdata.h,v 1.7 2006-12-05 15:17:13 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: searchdata.h,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2004 J.F.Dockes */
/**
* Structures to hold data coming almost directly from the gui
@ -46,11 +46,22 @@ enum SClType {
class SearchDataClause;
/**
* Holder for a list of search clauses. Some of the clauses may be be reference
* to other subqueries in the future. For now, they just reflect user entry in
* a query field: type, some text and possibly a distance. Each clause may
* hold several queries in the Xapian sense, for exemple several terms
* and phrases as would result from ["this is a phrase" term1 term2]
Data structure representing a Recoll query.
This is currently simply a list of search clauses.
For now, clauses in the list just reflect user entry in a query
field: some text, a clause type (AND/OR/NEAR etc.) and possibly a
distance. Each clause may hold several queries in the Xapian sense,
for example several terms and phrases as would result from
["this is a phrase" term1 term2]
This means that SearchData will be translated into a Xapian
Query tree of depth 2.
The structure might be extended in the future so that some of the
clauses may be references to other subqueries (there doesn't seem to
be an urgent need for this).
*/
class SearchData {
public:
@ -134,15 +145,19 @@ protected:
*/
class SearchDataClauseSimple : public SearchDataClause {
public:
SearchDataClauseSimple(SClType tp, string txt)
: SearchDataClause(tp), m_text(txt), m_slack(0) {}
SearchDataClauseSimple(SClType tp, const string& txt,
const string& fld = "")
: SearchDataClause(tp), m_text(txt), m_field(fld), m_slack(0) {}
virtual ~SearchDataClauseSimple() {}
/** Translate to Xapian query */
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
virtual bool getTerms(vector<string>& terms,
vector<vector<string> >& groups,
vector<int>& gslks) const
/** Retrieve query terms and term groups. This is used for highlighting */
virtual bool getTerms(vector<string>& terms, /* Single terms */
vector<vector<string> >& groups, /* Prox grps */
vector<int>& gslks) const /* Prox slacks */
{
terms.insert(terms.end(), m_terms.begin(), m_terms.end());
groups.insert(groups.end(), m_groups.begin(), m_groups.end());
@ -151,7 +166,8 @@ public:
}
protected:
string m_text;
string m_text; // Raw user entry text.
string m_field; // Field specification if any
// Single terms and phrases resulting from breaking up m_text;
// valid after toNativeQuery() call
vector<string> m_terms;
@ -161,10 +177,10 @@ protected:
int m_slack;
};
/** Filename search. */
/** Filename search clause. */
class SearchDataClauseFilename : public SearchDataClauseSimple {
public:
SearchDataClauseFilename(string txt)
SearchDataClauseFilename(const string& txt)
: SearchDataClauseSimple(SCLT_FILENAME, txt) {}
virtual ~SearchDataClauseFilename() {}
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
@ -176,8 +192,9 @@ public:
*/
class SearchDataClauseDist : public SearchDataClauseSimple {
public:
SearchDataClauseDist(SClType tp, string txt, int slack)
: SearchDataClauseSimple(tp, txt) {m_slack = slack;}
SearchDataClauseDist(SClType tp, const string& txt, int slack,
const string& fld = "")
: SearchDataClauseSimple(tp, txt, fld) {m_slack = slack;}
virtual ~SearchDataClauseDist() {}
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);