diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 4318ac08..3d93c64b 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: internfile.cpp,v 1.24 2007-01-15 13:06:38 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: internfile.cpp,v 1.25 2007-01-17 13:53:40 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -236,6 +236,7 @@ static inline bool getKeyValue(const map& docdata, } static const string keyab("abstract"); +static const string keyau("author"); static const string keycs("charset"); static const string keyct("content"); static const string keyfn("filename"); @@ -251,6 +252,7 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc) Dijon::Filter *df = m_handlers.back(); const std::map& docdata = df->get_meta_data(); + getKeyValue(docdata, keyau, doc.author); getKeyValue(docdata, keyoc, doc.origcharset); getKeyValue(docdata, keyct, doc.text); getKeyValue(docdata, keytt, doc.title); diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp index bba522f6..446e1934 100644 --- a/src/internfile/mh_html.cpp +++ b/src/internfile/mh_html.cpp @@ -122,6 +122,7 @@ bool MimeHandlerHtml::next_document() m_metaData["charset"] = "utf-8"; m_metaData["title"] = result.title; m_metaData["keywords"] = result.keywords; + m_metaData["author"] = result.author; m_metaData["modificationdate"] = result.dmtime; m_metaData["sample"] = result.sample; m_metaData["mimetype"] = "text/plain"; diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index 67e80add..f8785def 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.28 2007-01-13 10:28:37 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.29 2007-01-17 13:53:40 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -217,6 +217,9 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth) if (doc->h.getFirstHeader("From", hi)) { rfc2047_decode(hi.getValue(), transcoded); text += string("From: ") + transcoded + string("\n"); + if (depth == 1) { + m_metaData["author"] = transcoded; + } } if (doc->h.getFirstHeader("To", hi)) { rfc2047_decode(hi.getValue(), transcoded); @@ -245,7 +248,7 @@ bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth) } text += '\n'; - LOGDEB2(("MimeHandlerMail::rocessMsg:ismultipart %d mime subtype '%s'\n", + LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n", doc->isMultipart(), doc->getSubType().c_str())); walkmime(doc, depth); diff --git a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp index e6a6c9dc..e5343033 100644 --- a/src/internfile/myhtmlparse.cpp +++ b/src/internfile/myhtmlparse.cpp @@ -154,6 +154,11 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) string tmp = i->second; decode_entities(tmp); keywords += tmp; + } else if (name == "author") { + if (!author.empty()) author += ' '; + string tmp = i->second; + decode_entities(tmp); + author += tmp; } else if (name == "date") { // Yes this doesnt exist. It's output by filters // And the format isn't even standard http/html @@ -168,19 +173,6 @@ MyHtmlParser::opening_tag(const string &tag, const map &p) dmtime = ascuxtime; } } -#if 0 // We're not a robot, so we don't care about robots metainfo - else if (name == "robots") { - string val = i->second; - decode_entities(val); - lowercase_term(val); - if (val.find("none") != string::npos || - val.find("noindex") != string::npos) { - indexing_allowed = false; - LOGDEB1(("myhtmlparse: robots/noindex\n")); - throw false; - } - } -#endif // 0 } else if ((j = p.find("http-equiv")) != p.end()) { string hequiv = j->second; lowercase_term(hequiv); diff --git a/src/internfile/myhtmlparse.h b/src/internfile/myhtmlparse.h index 3c855d68..233a5c0c 100644 --- a/src/internfile/myhtmlparse.h +++ b/src/internfile/myhtmlparse.h @@ -37,7 +37,7 @@ class MyHtmlParser : public HtmlParser { bool in_body_tag; bool in_pre_tag; bool pending_space; - string title, sample, keywords, dump, dmtime; + string title, sample, keywords, dump, dmtime, author; string ocharset; // This is the charset our user thinks the doc was string charset; // This is the charset it was supposedly converted to string doccharset; // Set this to value of charset parameter in header diff --git a/src/lib/Makefile b/src/lib/Makefile index 96f04875..212b301d 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -8,8 +8,8 @@ LIBS = librcl.a all: $(LIBS) -OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o -DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp +OBJS = rclaspell.o rclconfig.o rclinit.o textsplit.o unacpp.o csguess.o indexer.o mimetype.o htmlparse.o myhtmlparse.o mimehandler.o internfile.o mh_exec.o mh_html.o mh_mail.o mh_mbox.o mh_text.o docseq.o history.o sortseq.o wasastringtoquery.o wasatorcl.o pathhash.o rcldb.o searchdata.o stemdb.o base64.o conftree.o copyfile.o debuglog.o execmd.o fstreewalk.o idfile.o md5.o mimeparse.o pathut.o readfile.o smallut.o transcode.o wipedir.o x11mon.o +DEPS = rclaspell.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp csguess.dep.stamp indexer.dep.stamp mimetype.dep.stamp htmlparse.dep.stamp myhtmlparse.dep.stamp mimehandler.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp docseq.dep.stamp history.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp pathhash.dep.stamp rcldb.dep.stamp searchdata.dep.stamp stemdb.dep.stamp base64.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp md5.dep.stamp mimeparse.dep.stamp pathut.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp librcl.a : $(DEPS) $(OBJS) unac.o ar ru librcl.a $(OBJS) unac.o @@ -57,6 +57,10 @@ history.o : ../query/history.cpp $(CXX) $(ALL_CXXFLAGS) -c ../query/history.cpp sortseq.o : ../query/sortseq.cpp $(CXX) $(ALL_CXXFLAGS) -c ../query/sortseq.cpp +wasastringtoquery.o : ../query/wasastringtoquery.cpp + $(CXX) $(ALL_CXXFLAGS) -c ../query/wasastringtoquery.cpp +wasatorcl.o : ../query/wasatorcl.cpp + $(CXX) $(ALL_CXXFLAGS) -c ../query/wasatorcl.cpp pathhash.o : ../rcldb/pathhash.cpp $(CXX) $(ALL_CXXFLAGS) -c ../rcldb/pathhash.cpp rcldb.o : ../rcldb/rcldb.cpp @@ -161,6 +165,12 @@ history.dep.stamp : ../query/history.cpp sortseq.dep.stamp : ../query/sortseq.cpp $(CXX) -M $(ALL_CXXFLAGS) ../query/sortseq.cpp > sortseq.dep touch sortseq.dep.stamp +wasastringtoquery.dep.stamp : ../query/wasastringtoquery.cpp + $(CXX) -M $(ALL_CXXFLAGS) ../query/wasastringtoquery.cpp > wasastringtoquery.dep + touch wasastringtoquery.dep.stamp +wasatorcl.dep.stamp : ../query/wasatorcl.cpp + $(CXX) -M $(ALL_CXXFLAGS) ../query/wasatorcl.cpp > wasatorcl.dep + touch wasatorcl.dep.stamp pathhash.dep.stamp : ../rcldb/pathhash.cpp $(CXX) -M $(ALL_CXXFLAGS) ../rcldb/pathhash.cpp > pathhash.dep touch pathhash.dep.stamp @@ -238,6 +248,8 @@ include mh_text.dep include docseq.dep include history.dep include sortseq.dep +include wasastringtoquery.dep +include wasatorcl.dep include pathhash.dep include rcldb.dep include searchdata.dep diff --git a/src/lib/mkMake b/src/lib/mkMake index fa715aac..51e989d4 100755 --- a/src/lib/mkMake +++ b/src/lib/mkMake @@ -24,6 +24,8 @@ ${depth}/internfile/mh_text.cpp \ ${depth}/query/docseq.cpp \ ${depth}/query/history.cpp \ ${depth}/query/sortseq.cpp \ +${depth}/query/wasastringtoquery.cpp \ +${depth}/query/wasatorcl.cpp \ ${depth}/rcldb/pathhash.cpp \ ${depth}/rcldb/rcldb.cpp \ ${depth}/rcldb/searchdata.cpp \ diff --git a/src/query/Makefile b/src/query/Makefile index 2dbbae12..76cf3b56 100644 --- a/src/query/Makefile +++ b/src/query/Makefile @@ -1,11 +1,11 @@ depth = .. include $(depth)/mk/sysconf -PROGS = xadump #trhist qtry qxtry +PROGS = xadump rclqlang #trhist qtry qxtry all: $(PROGS) -SRCS = xadump.cpp +SRCS = xadump.cpp rclqlang.cpp .cpp.o : $(CXX) -c $(ALL_CXXFLAGS) -o $@ $< @@ -14,6 +14,11 @@ xadump : $(XADUMP_OBJS) $(CXX) $(ALL_CXXFLAGS) -o xadump $(XADUMP_OBJS) \ $(LIBICONV) $(LIBXAPIAN) $(LIBSYS) +RCLQLANG_OBJS= rclqlang.o $(BIGLIB) +rclqlang : $(RCLQLANG_OBJS) + $(CXX) $(ALL_CXXFLAGS) -o rclqlang $(RCLQLANG_OBJS) \ + $(LIBICONV) $(LIBXAPIAN) $(LIBSYS) + HISTORY_OBJS= trhist.o $(BIGLIB) $(MIMELIB) trhist : $(HISTORY_OBJS) $(CXX) $(ALL_CXXFLAGS) -o trhist $(HISTORY_OBJS) \ @@ -21,9 +26,19 @@ trhist : $(HISTORY_OBJS) trhist.o : history.cpp history.h $(CXX) $(ALL_CXXFLAGS) -DTEST_HISTORY -c -o trhist.o history.cpp -$(BIGLIB): - cd $(depth)/lib;make - +WASASTRINGTOQUERY_OBJS= trwasastrtoq.o $(BIGLIB) $(MIMELIB) +trwasastrtoq : $(WASASTRINGTOQUERY_OBJS) + $(CXX) $(ALL_CXXFLAGS) -o trwasastrtoq $(WASASTRINGTOQUERY_OBJS) \ + $(LIBICONV) $(LIBXAPIAN) +trwasastrtoq.o : wasastringtoquery.cpp wasastringtoquery.h + $(CXX) $(ALL_CXXFLAGS) -DTEST_WASASTRINGTOQUERY -c \ + -o trwasastrtoq.o wasastringtoquery.cpp + +$(BIGLIB): force + cd $(depth)/lib;$(MAKE) +force: + + depend: alldeps.stamp alldeps.stamp : $(SRCS) $(CXX) -M $(ALL_CXXFLAGS) $(SRCS) > alldeps diff --git a/src/query/wasastringtoquery.cpp b/src/query/wasastringtoquery.cpp index b47e64da..b8fc0adf 100644 --- a/src/query/wasastringtoquery.cpp +++ b/src/query/wasastringtoquery.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.4 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -17,7 +17,7 @@ static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 d * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#ifndef TEST_STRINGTOQUERY +#ifndef TEST_WASASTRINGTOQUERY #include #include #include @@ -25,6 +25,13 @@ static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.3 2006-12-10 17:03:08 d #include "wasastringtoquery.h" +//#define DEB_WASASTRINGTOQ 1 +#ifdef DEB_WASASTRINGTOQ +#define DPRINT(X) fprintf X +#else +#define DPRINT(X) +#endif + WasaQuery::~WasaQuery() { for (vector::iterator it = m_subs.begin(); @@ -61,16 +68,16 @@ void WasaQuery::describe(string &desc) const desc += ")"; } desc += "("; + string fieldspec = m_fieldspec.empty() ? "" : m_fieldspec + ": "; switch (m_op) { case OP_NULL: desc += "NULL"; break; case OP_LEAF: - desc += m_fieldspec.empty() ? - m_value : m_fieldspec + ":" + m_value; + desc += fieldspec + m_value; break; case OP_EXCL: - desc += string("NOT (" ) + m_value + ") "; + desc += string("NOT (" ) + fieldspec + m_value + ") "; break; case OP_OR: case OP_AND: @@ -84,6 +91,8 @@ void WasaQuery::describe(string &desc) const } break; } + if (desc[desc.length() - 1] == ' ') + desc.erase(desc.length() - 1); desc += ") "; } @@ -111,7 +120,7 @@ void WasaQuery::describe(string &desc) const * parenthesis increases the index, but we're not interested in all */ static const char * parserExpr = - "([oO][rR])" //1 OR is a special word + "([oO][rR])[[:space:]]*" //1 OR is a special word "|" "(" //2 "([+-])?" //3 Force or exclude indicator @@ -125,7 +134,7 @@ static const char * parserExpr = "|" "([^[:space:]]+)" //9 ANormalTerm ")" - ")" + ")[[:space:]]*" ; // For debugging the parser. But see also NMATCH @@ -236,17 +245,18 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) reason = "Internal regular expression handling error"; return 0; } -#if 0 - if (loop) printf("Next part:\n"); - for (i = 0; i < NMATCH; i++) { + +#ifdef DEB_WASASTRINGTOQ + if (loop) DPRINT((stderr, "Next part:\n")); + for (unsigned int i = 0; i < NMATCH; i++) { if (m_pmatch[i].rm_so == -1) continue; char match[maxmatchlen+1]; memcpy(match, m_cp + m_pmatch[i].rm_so, m_pmatch[i].rm_eo - m_pmatch[i].rm_so); match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0; if (matchNames[i][0]) - printf("%10s: [%s] (%d->%d)\n", matchNames[i], match, - (int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo); + DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match, + (int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo)); } #endif char match[maxmatchlen+1]; @@ -348,14 +358,17 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) if (prev_or) { // We're in an OR subquery, add new subquery orClause->m_subs.push_back(nclause); + DPRINT((stderr, "Adding to OR chain\n")); } else { if (orClause) { // Getting out of OR. Add the OR subquery to the main one query->m_subs.push_back(orClause); + DPRINT((stderr, "Adding OR chain to main\n")); orClause = 0; } // Add new subquery to main one. query->m_subs.push_back(nclause); + DPRINT((stderr, "Adding to main chain\n")); } prev_or = false; } @@ -369,6 +382,12 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) break; } + if (orClause) { + // Getting out of OR. Add the OR subquery to the main one + query->m_subs.push_back(orClause); + DPRINT((stderr, "Adding OR chain to main\n")); + } + regfree(&m_rx); m_rxneedsfree = false; return query; @@ -404,4 +423,4 @@ int main(int argc, char **argv) exit(0); } -#endif // TEST_STRINGTOQUERY +#endif // TEST_WASASTRINGTOQUERY diff --git a/src/query/wasastringtoquery.h b/src/query/wasastringtoquery.h index 54ac950b..a96be508 100644 --- a/src/query/wasastringtoquery.h +++ b/src/query/wasastringtoquery.h @@ -1,6 +1,6 @@ #ifndef _WASASTRINGTOQUERY_H_INCLUDED_ #define _WASASTRINGTOQUERY_H_INCLUDED_ -/* @(#$Id: wasastringtoquery.h,v 1.3 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes */ +/* @(#$Id: wasastringtoquery.h,v 1.4 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */ /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,23 +40,28 @@ public: {} ~WasaQuery(); - // Get string describing the query tree from this point + /** Get string describing the query tree from this point */ void describe(string &desc) const; + /** Op to be performed on either value or subqueries */ WasaQuery::Op m_op; + + /** Field specification if any (ie: title, author ...) */ string m_fieldspec; - /* Valid for op == OP_LEAF */ + + /* String value. Valid for op == OP_LEAF */ string m_value; - /* Valid for conjunctions */ + + /** Subqueries. Valid for conjunctions */ vector m_subs; - /* Restrict results to some file type, defined by either mime, app group, - * or extension */ + /** Restrict results to some file type, defined by either mime, + * app group, or extension */ enum TypeKind {WQTK_NONE, WQTK_MIME, WQTK_GROUP, WQTK_EXT}; TypeKind m_typeKind; vector m_types; - /* Sort on relevance, date, name or group */ + /** Sort on relevance, date, name or group */ enum SortKind {WQSK_REL, WQSK_DATE, WQSK_ALPHA, WQSK_GROUP}; vector m_sortSpec; }; diff --git a/src/query/wasatorcl.cpp b/src/query/wasatorcl.cpp index 58a336b5..f520597c 100644 --- a/src/query/wasatorcl.cpp +++ b/src/query/wasatorcl.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.2 2006-12-10 17:03:08 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.3 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes"; #endif #ifndef TEST_WASATORCL @@ -27,11 +27,13 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa) if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) { sdata->addClause (new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, - (*it)->m_value, 0)); + (*it)->m_value, 0, + (*it)->m_fieldspec)); } else { sdata->addClause (new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, - (*it)->m_value)); + (*it)->m_value, + (*it)->m_fieldspec)); } break; case WasaQuery::OP_EXCL: @@ -41,7 +43,8 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa) sdata->addClause (new Rcl::SearchDataClauseSimple(Rcl::SCLT_EXCL, string("\"") + - (*it)->m_value + "\"")); + (*it)->m_value + "\"", + (*it)->m_fieldspec)); break; case WasaQuery::OP_OR: // Concatenate all OR values as phrases. Hope there are no @@ -55,7 +58,8 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa) } sdata->addClause (new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR, - orvalue)); + orvalue, + (*it)->m_fieldspec)); } } } @@ -105,7 +109,7 @@ int main(int argc, char *argv[]) if (argc != 1) { fprintf(stderr, "need one arg\n"); - exit(1); + return 1; } const string str = *argv++;argc--; string reason; @@ -113,14 +117,12 @@ int main(int argc, char *argv[]) RclConfig *config = recollinit(RCLINIT_NONE, 0, 0, reason, 0); if (config == 0 || !config->ok()) { cerr << "Configuration problem: " << reason << endl; - exit(1); + return 1; } string dbdir = config->getDbDir(); if (dbdir.empty()) { - // Note: this will have to be replaced by a call to a - // configuration buildin dialog for initial configuration cerr << "Configuration problem: " << "No dbdir" << endl; - exit(1); + return 1; } Rcl::Db rcldb; if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) { diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 37eb9a95..6f0bbd2a 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.101 2006-12-19 12:11:21 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.102 2007-01-17 13:53:41 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -80,6 +80,7 @@ namespace Rcl { // Synthetic abstract marker (to discriminate from abstract actually // found in doc) const static string rclSyntAbs = "?!#@"; +const static string emptystring; // A class for data and methods that would have to expose // Xapian-specific stuff if they were in Rcl::Db. There could actually be @@ -703,15 +704,24 @@ bool Db::isopen() return m_ndb->m_isopen; } -// A small class to hold state while splitting text +// The text splitter callback class which receives words from the +// splitter and adds postings to the Xapian document. class mySplitterCB : public TextSplitCB { public: - Xapian::Document &doc; + Xapian::Document &doc; // Xapian document Xapian::termpos basepos; // Base for document section - Xapian::termpos curpos; // Last position sent to callback - mySplitterCB(Xapian::Document &d) : doc(d), basepos(1), curpos(0) + Xapian::termpos curpos; // Current position. Used to set basepos for the + // following section + mySplitterCB(Xapian::Document &d) + : doc(d), basepos(1), curpos(0) {} bool takeword(const std::string &term, int pos, int, int); + void setprefix(const string& pref) {prefix = pref;} + +private: + // If prefix is set, we also add a posting for the prefixed terms + // (ie: for titles, add postings for both "term" and "Sterm") + string prefix; }; // Callback for the document to word splitting class during indexation @@ -731,7 +741,11 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int) // be possible to assign different weigths to doc parts (ie title) // by using a higher value curpos = pos; - doc.add_posting(term, basepos + curpos, 1); + pos += basepos; + doc.add_posting(term, pos, 1); + if (!prefix.empty()) { + doc.add_posting(prefix + term, pos, 1); + } return true; } catch (const Xapian::Error &e) { ermsg = e.get_msg().c_str(); @@ -804,8 +818,9 @@ bool Db::add(const string &fn, const Doc &idoc, doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen); } doc.abstract = neutchars(doc.abstract, "\n\r"); - doc.title = truncate_to_word(doc.title, 100); - doc.keywords = truncate_to_word(doc.keywords, 300); + doc.title = neutchars(truncate_to_word(doc.title, 150), "\n\r"); + doc.author = neutchars(truncate_to_word(doc.author, 150), "\n\r"); + doc.keywords = neutchars(truncate_to_word(doc.keywords, 300), "\n\r"); Xapian::Document newdocument; @@ -824,13 +839,30 @@ bool Db::add(const string &fn, const Doc &idoc, } // Split and index title - LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str())); - if (!dumb_string(doc.title, noacc)) { - LOGERR(("Db::add: dumb_string failed\n")); - return false; + if (!doc.title.empty()) { + LOGDEB2(("Db::add: split title [%s]\n", doc.title.c_str())); + if (!dumb_string(doc.title, noacc)) { + LOGERR(("Db::add: dumb_string failed\n")); + return false; + } + splitData.setprefix("S"); // Subject + splitter.text_to_words(noacc); + splitData.setprefix(emptystring); + splitData.basepos += splitData.curpos + 100; + } + + // Split and index author + if (!doc.author.empty()) { + LOGDEB2(("Db::add: split author [%s]\n", doc.author.c_str())); + if (!dumb_string(doc.author, noacc)) { + LOGERR(("Db::add: dumb_string failed\n")); + return false; + } + splitData.setprefix("A"); + splitter.text_to_words(noacc); + splitData.setprefix(emptystring); + splitData.basepos += splitData.curpos + 100; } - splitter.text_to_words(noacc); - splitData.basepos += splitData.curpos + 100; // Split and index body LOGDEB2(("Db::add: split body\n")); @@ -842,13 +874,17 @@ bool Db::add(const string &fn, const Doc &idoc, splitData.basepos += splitData.curpos + 100; // Split and index keywords - LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str())); - if (!dumb_string(doc.keywords, noacc)) { - LOGERR(("Db::add: dumb_string failed\n")); - return false; + if (!doc.keywords.empty()) { + LOGDEB2(("Db::add: split kw [%s]\n", doc.keywords.c_str())); + if (!dumb_string(doc.keywords, noacc)) { + LOGERR(("Db::add: dumb_string failed\n")); + return false; + } + splitData.setprefix("K"); + splitter.text_to_words(noacc); + splitData.setprefix(emptystring); + splitData.basepos += splitData.curpos + 100; } - splitter.text_to_words(noacc); - splitData.basepos += splitData.curpos + 100; // Split and index abstract. We don't do this if it is synthetic // any more (this used to give a relevance boost to the beginning @@ -946,6 +982,9 @@ bool Db::add(const string &fn, const Doc &idoc, record += "\ncaption=" + doc.title; record += "\nkeywords=" + doc.keywords; record += "\nabstract=" + doc.abstract; + if (!doc.author.empty()) { + record += "\nauthor=" + doc.author; + } record += "\n"; LOGDEB1(("Newdocument data: %s\n", record.c_str())); newdocument.set_data(record); diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index ad6ce2aa..1dea0189 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -16,7 +16,7 @@ */ #ifndef _RCLDOC_H_INCLUDED_ #define _RCLDOC_H_INCLUDED_ -/* @(#$Id: rcldoc.h,v 1.1 2006-12-14 14:54:13 dockes Exp $ (C) 2006 J.F.Dockes */ +/* @(#$Id: rcldoc.h,v 1.2 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */ #include @@ -48,6 +48,7 @@ class Doc { string origcharset; // Charset we transcoded from (in case we want back) // Possibly set by handler string title; // Possibly set by handler + string author; // Possibly set by handler string keywords; // Possibly set by handler string abstract; // Possibly set by handler bool syntabs; // true if abstract is just the top of doc, not an diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index da7c033c..d51bce3d 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.7 2006-12-19 12:11:21 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -154,6 +154,7 @@ public: bool translate(const string &iq, + const string &prefix, string &ermsg, list &pqueries, int slack = 0, bool useNear = false); @@ -257,6 +258,14 @@ void multiply_groups(vector >::const_iterator vvit, } } +static void addPrefix(list& terms, const string& prefix) +{ + if (prefix.empty()) + return; + for (list::iterator it = terms.begin(); it != terms.end(); it++) + it->insert(0, prefix); +} + /** * Turn string into list of xapian queries. There is little * interpretation done on the string (no +term -term or filename:term @@ -271,6 +280,7 @@ void multiply_groups(vector >::const_iterator vvit, * count) */ bool StringToXapianQ::translate(const string &iq, + const string &prefix, string &ermsg, list &pqueries, int slack, bool useNear) @@ -301,24 +311,25 @@ bool StringToXapianQ::translate(const string &iq, splitterS.text_to_words(*it); TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS); splitterW.text_to_words(*it); - wsQData& splitData = splitDataS; - if (splitDataS.terms.size() > 1 && splitDataS.terms.size() != - splitDataW.terms.size()) - splitData = splitDataW; + wsQData *splitData = &splitDataS; + if (splitDataS.terms.size() > 1 && + splitDataS.terms.size() != splitDataW.terms.size()) + splitData = &splitDataW; LOGDEB1(("strToXapianQ: splitter term count: %d\n", - splitData.terms.size())); - switch(splitData.terms.size()) { + splitData->terms.size())); + switch(splitData->terms.size()) { case 0: continue;// ?? case 1: // Not a real phrase: one term { - string term = splitData.terms.front(); + string term = splitData->terms.front(); list exp; maybeStemExp(false, term, exp); + m_terms.insert(m_terms.end(), exp.begin(), exp.end()); // Push either term or OR of stem-expanded set + addPrefix(exp, prefix); pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, exp.begin(), exp.end())); - m_terms.insert(m_terms.end(), exp.begin(), exp.end()); } break; @@ -329,8 +340,8 @@ bool StringToXapianQ::translate(const string &iq, list orqueries; bool hadmultiple = false; vector >groups; - for (vector::iterator it = splitData.terms.begin(); - it != splitData.terms.end(); it++) { + for (vector::iterator it = splitData->terms.begin(); + it != splitData->terms.end(); it++) { // Some version of xapian will accept only one OR clause // inside NEAR, all others must be leafs bool nostemexp = @@ -341,6 +352,7 @@ bool StringToXapianQ::translate(const string &iq, maybeStemExp(nostemexp, *it, exp); groups.push_back(vector(exp.begin(), exp.end())); + addPrefix(exp, prefix); orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, exp.begin(), exp.end())); #ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF @@ -352,7 +364,7 @@ bool StringToXapianQ::translate(const string &iq, pqueries.push_back(Xapian::Query(op, orqueries.begin(), orqueries.end(), - splitData.terms.size() + slack)); + splitData->terms.size() + slack)); // Add NEAR/PHRASE groups to the highlighting data. Must // push all combinations vector > allcombs; @@ -378,6 +390,28 @@ bool StringToXapianQ::translate(const string &iq, return true; } +// Try to translate field specification into field prefix. This should +// probably be an Rcl::Db method and much more configurable (store +// prefix translation list in config ?) +static string fieldToPrefix(const string& i_field) +{ + static map fldToPrefs; + if (fldToPrefs.empty()) { + fldToPrefs["title"] = "S"; + fldToPrefs["caption"] = "S"; + fldToPrefs["subject"] = "S"; + fldToPrefs["author"] = "A"; + fldToPrefs["from"] = "A"; + fldToPrefs["keyword"] = "K"; + } + string fld(i_field); + stringtolower(fld); + map::const_iterator it = fldToPrefs.find(fld); + if (it != fldToPrefs.end()) + return it->second; + return ""; +} + // Translate a simple OR, AND, or EXCL search clause. bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, const string& stemlang) @@ -397,9 +431,12 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp)); return false; } + string prefix; + if (!m_field.empty()) + prefix = fieldToPrefix(m_field); list pqueries; StringToXapianQ tr(db, stemlang); - if (!tr.translate(m_text, m_reason, pqueries)) + if (!tr.translate(m_text, prefix, m_reason, pqueries)) return false; if (pqueries.empty()) { LOGERR(("SearchDataClauseSimple: resolved to null query\n")); @@ -437,12 +474,16 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, list pqueries; Xapian::Query nq; + string prefix; + if (!m_field.empty()) + prefix = fieldToPrefix(m_field); + // Use stringToXapianQueries to lowercase and simplify the phrase // terms etc. The result should be a single element list string s = string("\"") + m_text + string("\""); bool useNear = m_tp == SCLT_NEAR; StringToXapianQ tr(db, stemlang); - if (!tr.translate(s, m_reason, pqueries, m_slack, useNear)) + if (!tr.translate(s, prefix, m_reason, pqueries, m_slack, useNear)) return false; if (pqueries.empty()) { LOGERR(("SearchDataClauseDist: resolved to null query\n")); diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index bad37103..d783df18 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -16,7 +16,7 @@ */ #ifndef _SEARCHDATA_H_INCLUDED_ #define _SEARCHDATA_H_INCLUDED_ -/* @(#$Id: searchdata.h,v 1.7 2006-12-05 15:17:13 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: searchdata.h,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2004 J.F.Dockes */ /** * Structures to hold data coming almost directly from the gui @@ -46,11 +46,22 @@ enum SClType { class SearchDataClause; /** - * Holder for a list of search clauses. Some of the clauses may be be reference - * to other subqueries in the future. For now, they just reflect user entry in - * a query field: type, some text and possibly a distance. Each clause may - * hold several queries in the Xapian sense, for exemple several terms - * and phrases as would result from ["this is a phrase" term1 term2] + Data structure representing A Recoll query. + This is currently simply a list of search clauses. + + For now, clauses in the list just reflect user entry in a query + field: some text, a clause type (AND/OR/NEAR etc.) and possibly a + distance. Each clause may hold several queries in the Xapian sense, + for exemple several terms and phrases as would result from + ["this is a phrase" term1 term2] + + This means that SearchData will be translated into a Xapian + Query tree of depth 2. + + The structure might be extended in the future so that some of the + clauses may be references to other subqueries (there doesn't seem to + be an urgent need for this) + */ class SearchData { public: @@ -134,15 +145,19 @@ protected: */ class SearchDataClauseSimple : public SearchDataClause { public: - SearchDataClauseSimple(SClType tp, string txt) - : SearchDataClause(tp), m_text(txt), m_slack(0) {} + SearchDataClauseSimple(SClType tp, const string& txt, + const string& fld = "") + : SearchDataClause(tp), m_text(txt), m_field(fld), m_slack(0) {} + virtual ~SearchDataClauseSimple() {} + /** Translate to Xapian query */ virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang); - virtual bool getTerms(vector& terms, - vector >& groups, - vector& gslks) const + /** Retrieve query terms and term groups. This is used for highlighting */ + virtual bool getTerms(vector& terms, /* Single terms */ + vector >& groups, /* Prox grps */ + vector& gslks) const /* Prox slacks */ { terms.insert(terms.end(), m_terms.begin(), m_terms.end()); groups.insert(groups.end(), m_groups.begin(), m_groups.end()); @@ -151,7 +166,8 @@ public: } protected: - string m_text; + string m_text; // Raw user entry text. + string m_field; // Field specification if any // Single terms and phrases resulting from breaking up m_text; // valid after toNativeQuery() call vector m_terms; @@ -161,10 +177,10 @@ protected: int m_slack; }; -/** Filename search. */ +/** Filename search clause. */ class SearchDataClauseFilename : public SearchDataClauseSimple { public: - SearchDataClauseFilename(string txt) + SearchDataClauseFilename(const string& txt) : SearchDataClauseSimple(SCLT_FILENAME, txt) {} virtual ~SearchDataClauseFilename() {} virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang); @@ -176,8 +192,9 @@ public: */ class SearchDataClauseDist : public SearchDataClauseSimple { public: - SearchDataClauseDist(SClType tp, string txt, int slack) - : SearchDataClauseSimple(tp, txt) {m_slack = slack;} + SearchDataClauseDist(SClType tp, const string& txt, int slack, + const string& fld = "") + : SearchDataClauseSimple(tp, txt, fld) {m_slack = slack;} virtual ~SearchDataClauseDist() {} virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);