From c8d34dc8ba605ee9f832d426ec5e06d8f09ed72b Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 8 Aug 2015 21:56:45 +0200 Subject: [PATCH] Prevent highlighting of bogus terms in results (prevent path elts, negative queries or internal stuff) --- src/common/rclconfig.cpp | 2 ++ src/common/rclconfig.h | 4 +-- src/rcldb/rcldb.cpp | 4 +-- src/rcldb/rcldb.h | 2 -- src/rcldb/searchdata.cpp | 49 +++++++++++++++++++++---------------- src/rcldb/searchdata.h | 18 ++++++-------- src/rcldb/searchdatatox.cpp | 15 +++++++++--- src/sampleconf/fields | 10 ++++---- 8 files changed, 58 insertions(+), 46 deletions(-) diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index 3f915452..a4770d57 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -867,6 +867,8 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc) ft.boost = atof(tval.c_str()); if (attrs.get("pfxonly", tval)) ft.pfxonly = stringToBool(tval); + if (attrs.get("noterms", tval)) + ft.noterms = stringToBool(tval); m_fldtotraits[stringtolower(*it)] = ft; LOGDEB2(("readFieldsConfig: [%s] -> [%s] %d %.1f\n", it->c_str(), ft.pfx.c_str(), ft.wdfinc, ft.boost)); diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index f305b0ec..7be82b08 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -66,9 +66,9 @@ struct FieldTraits { int wdfinc; // Index time term frequency increment (default 1) double boost; // Query time boost (default 1.0) bool pfxonly; // Suppress prefix-less indexing - + bool noterms; // Don't add term to highlight data (e.g.: rclbes) FieldTraits() - : wdfinc(1), boost(1.0), pfxonly(false) + : wdfinc(1), boost(1.0), pfxonly(false), noterms(false) {} }; diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 31754635..b4780ca1 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -71,8 +71,8 @@ static const string xapday_prefix = "D"; static const string xapmonth_prefix = "M"; static const string xapyear_prefix = "Y"; const string 
pathelt_prefix = "XP"; -const string udi_prefix("Q"); -const string parent_prefix("F"); +static const string udi_prefix("Q"); +static const string parent_prefix("F"); // Special terms to mark begin/end of field (for anchored searches), and // page breaks diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index ab1ab978..d4f9f773 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -533,8 +533,6 @@ private: string version_string(); extern const string pathelt_prefix; -extern const string udi_prefix; -extern const string parent_prefix; extern const string mimetype_prefix; extern const string unsplitFilenameFieldName; extern string start_of_field_term; diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 3fda7988..70d3f243 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -181,7 +181,8 @@ bool SearchData::addClause(SearchDataClause* cl) return true; } -// Am I a file name only search ? This is to turn off term highlighting +// Am I a file name only search ? This is to turn off term highlighting. +// There can't be a subclause in a filename search: no possible need to recurse bool SearchData::fileNameOnly() { for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) @@ -190,6 +191,7 @@ bool SearchData::fileNameOnly() return true; } +// The query language creates a lot of subqueries. See if we can merge them. 
void SearchData::simplify() { for (unsigned int i = 0; i < m_query.size(); i++) { @@ -249,30 +251,35 @@ void SearchData::simplify() } } -bool SearchData::singleSimple() -{ - if (m_query.size() != 1 || !m_filetypes.empty() || !m_nfiletypes.empty() || - m_haveDates || m_maxSize != size_t(-1) || m_minSize != size_t(-1) || - m_haveWildCards) - return false; - SearchDataClause *clp = *m_query.begin(); - if (clp->getTp() != SCLT_AND && clp->getTp() != SCLT_OR) { - return false; - } - return true; -} - -// Extract all term data +// Extract terms and groups for highlighting void SearchData::getTerms(HighlightData &hld) const { - for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++) - (*it)->getTerms(hld); + for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++) { + if (!((*it)->getmodifiers() & SearchDataClause::SDCM_NOTERMS) && + !(*it)->getexclude()) { + (*it)->getTerms(hld); + } + } return; } +static const char * tpToString(SClType t) +{ + switch (t) { + case SCLT_AND: return "AND"; + case SCLT_OR: return "OR"; + case SCLT_FILENAME: return "FILENAME"; + case SCLT_PHRASE: return "PHRASE"; + case SCLT_NEAR: return "NEAR"; + case SCLT_PATH: return "PATH"; + case SCLT_SUB: return "SUB"; + default: return "UNKNOWN"; + } +} + void SearchData::dump(ostream& o) const { - o << "SearchData: " << " qs " << int(m_query.size()) << + o << "SearchData: " << tpToString(m_tp) << " qs " << int(m_query.size()) << " ft " << m_filetypes.size() << " nft " << m_nfiletypes.size() << " hd " << m_haveDates << " maxs " << int(m_maxSize) << " mins " << int(m_minSize) << " wc " << m_haveWildCards << "\n"; @@ -291,7 +298,7 @@ void SearchDataClause::dump(ostream& o) const void SearchDataClauseSimple::dump(ostream& o) const { - o << "ClauseSimple: "; + o << "ClauseSimple: " << tpToString(m_tp) << " "; if (m_exclude) o << "- "; o << "[" ; @@ -319,9 +326,9 @@ void SearchDataClausePath::dump(ostream& o) const void SearchDataClauseDist::dump(ostream& o) const { if (m_tp == 
SCLT_NEAR) - o << "ClauseDist: NEAR: "; + o << "ClauseDist: NEAR "; else - o << "ClauseDist: PHRA: "; + o << "ClauseDist: PHRA "; if (m_exclude) o << " - "; diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index 4cdc3758..1b952eef 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -96,9 +96,6 @@ public: /** Is there anything but a file name search in here ? */ bool fileNameOnly(); - /** Are we a simple query with one clause? */ - bool singleSimple(); - /** Do we have wildcards anywhere apart from filename searches ? */ bool haveWildCards() {return m_haveWildCards;} @@ -228,7 +225,9 @@ private: class SearchDataClause { public: enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2, - SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16}; + SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16, + SDCM_NOTERMS=32 // Don't include terms for highlighting + }; enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE}; SearchDataClause(SClType tp) @@ -278,13 +277,12 @@ public: { return m_parentSearch ? 
m_parentSearch->getSoftMaxExp() : -1; } - virtual void setModifiers(Modifier mod) - { - m_modifiers = mod; - } virtual void addModifier(Modifier mod) { - m_modifiers = Modifier(m_modifiers | mod); + m_modifiers = m_modifiers | mod; + } + virtual unsigned int getmodifiers() { + return m_modifiers; } virtual void setWeight(float w) { @@ -312,7 +310,7 @@ protected: SClType m_tp; SearchData *m_parentSearch; bool m_haveWildCards; - Modifier m_modifiers; + unsigned int m_modifiers; float m_weight; bool m_exclude; Relation m_rel; diff --git a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp index 0dff6c8b..d127d5dd 100644 --- a/src/rcldb/searchdatatox.cpp +++ b/src/rcldb/searchdatatox.cpp @@ -25,6 +25,7 @@ #include #include #include +#include using namespace std; #include "xapian.h" @@ -53,9 +54,10 @@ typedef vector::iterator qlist_it_t; static const int original_term_wqf_booster = 10; -// Expand categories and mime type wild card exps Categories are -// expanded against the configuration, mimetypes against the index -// (for wildcards). +// Expand doc categories and mime type wild card expressions +// +// Categories are expanded against the configuration, mimetypes +// against the index. bool SearchData::expandFileTypes(Db &db, vector& tps) { const RclConfig *cfg = db.getConf(); @@ -101,6 +103,8 @@ static const char *maxXapClauseCaseDiacMsg = "wildcards ?" 
; + +// Walk the clauses list, translate each and add to top Xapian Query bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, vector& query, string& reason, void *d) @@ -484,7 +488,8 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, if (noexpansion) { oexp.push_back(prefix + term); m_hldata.terms[term] = term; - LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str())); + LOGDEB(("ExpandTerm: noexpansion: final: %s\n", + stringsToString(oexp).c_str())); return true; } @@ -568,6 +573,8 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg, string prefix; const FieldTraits *ftp; if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { + if (ftp->noterms) + addModifier(SDCM_NOTERMS); prefix = wrap_prefix(ftp->pfx); } diff --git a/src/sampleconf/fields b/src/sampleconf/fields index ace96c23..76a98985 100644 --- a/src/sampleconf/fields +++ b/src/sampleconf/fields @@ -43,12 +43,12 @@ keywords= K xapyearmon = M title = S ; wdfinc = 10 mtype = T -ext = XE +ext = XE; noterms = 1 rclmd5 = XM -dir = XP +dir = XP ; noterms = 1 abstract = XS -filename = XSFN -containerfilename = XCFN ; pfxonly = 1 +filename = XSFN ; noterms = 1 +containerfilename = XCFN ; pfxonly = 1 ; noterms = 1 rclUnsplitFN = XSFS xapyear = Y recipient = XTO @@ -58,7 +58,7 @@ recipient = XTO # by default. # Some values are internally reserved by recoll: # XE (file ext), XP (for path elements), XSFN, XSFS, XXST, XXND, XXPG -rclbes = XB +rclbes = XB ; noterms = 1 # Using XX was not a good idea. # # I hereby commit to not using XY for Recoll: