diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index 48d77ff2..371bc7a4 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -4089,8 +4089,7 @@ alink="#0000FF"> set.

  • -

    %t. Title or Filename if not - set.

    +

    %t. Title or empty.

  • %U. Url

    diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index 30631b24..5530d1fd 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -3109,8 +3109,8 @@ %TTitle or Filename if not set. - %tTitle or Filename if - not set. + %tTitle or empty. + %UUrl diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index dc124c2f..cc127c12 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -377,7 +377,7 @@ class Db { * in the TermMatchResult header */ enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3, - ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32}; + ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32, ET_PATHELT=64}; int matchTypeTp(int tp) { return tp & 7; diff --git a/src/rcldb/rclterms.cpp b/src/rcldb/rclterms.cpp index d83fe2cc..d5cb3d48 100644 --- a/src/rcldb/rclterms.cpp +++ b/src/rcldb/rclterms.cpp @@ -164,8 +164,17 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term, bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0; bool case_sensitive = (typ_sens & ET_CASESENS) != 0; - - LOGDEB0("Db::TermMatch: typ " << (tmtptostr(matchtyp)) << " diacsens " << (diac_sensitive) << " casesens " << (case_sensitive) << " lang [" << (lang) << "] term [" << (_term) << "] max " << (max) << " field [" << (field) << "] stripped " << (o_index_stripchars) << " init res.size " << (res.entries.size()) << "\n" ); + // Path elements (used for dir: filtering) are special because + // they are not unaccented or lowercased even if the index is + // otherwise stripped. 
+ bool pathelt = (typ_sens & ET_PATHELT) != 0; + + LOGDEB0("Db::TermMatch: typ " << tmtptostr(matchtyp) << " diacsens " << + diac_sensitive << " casesens " << case_sensitive << " pathelt " << + pathelt << " lang [" << + lang << "] term [" << _term << "] max " << max << " field [" << + field << "] stripped " << o_index_stripchars << " init res.size " + << res.entries.size() << "\n"); // If index is stripped, no case or diac expansion can be needed: // for the processing inside this routine, everything looks like @@ -174,8 +183,8 @@ bool Db::termMatch(int typ_sens, const string &lang, const string &_term, string term = _term; if (o_index_stripchars) { diac_sensitive = case_sensitive = true; - if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { - LOGERR("Db::termMatch: unac failed for [" << (_term) << "]\n" ); + if (!pathelt && !unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { + LOGERR("Db::termMatch: unac failed for [" << _term << "]\n"); return false; } } diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index cbabdb44..5f7d9cbf 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -232,6 +232,9 @@ public: SDCM_ANCHOREND=0x4, SDCM_CASESENS=0x8, SDCM_DIACSENS=0x10, SDCM_NOTERMS=0x20, // Don't include terms for highlighting SDCM_NOSYNS = 0x40, // Don't perform synonym expansion + // Aargh special case. 
pathelts are case/diac-sensitive + // even in a stripped index + SDCM_PATHELT = 0x80, }; enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE}; diff --git a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp index 40cfef2c..940eeb83 100644 --- a/src/rcldb/searchdatatox.cpp +++ b/src/rcldb/searchdatatox.cpp @@ -374,6 +374,16 @@ private: map m_nste; }; +static const vector expandModStrings{ + {SearchDataClause::SDCM_NOSTEMMING, "nostemming"}, + {SearchDataClause::SDCM_ANCHORSTART, "anchorstart"}, + {SearchDataClause::SDCM_ANCHOREND, "anchorend"}, + {SearchDataClause::SDCM_CASESENS, "casesens"}, + {SearchDataClause::SDCM_DIACSENS, "diacsens"}, + {SearchDataClause::SDCM_NOTERMS, "noterms"}, + {SearchDataClause::SDCM_NOSYNS, "nosyns"}, + {SearchDataClause::SDCM_PATHELT, "pathelt"}, + }; /** Expand term into term list, using appropriate mode: stem, wildcards, * diacritics... @@ -396,12 +406,20 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, vector* multiwords ) { - LOGDEB0("expandTerm: mods 0x" << (mods) << " fld [" << (m_field) << "] trm [" << (term) << "] lang [" << (getStemLang()) << "]\n" ); + LOGDEB0("expandTerm: mods: [" << flagsToString(expandModStrings, mods) << + "] fld [" << m_field << "] trm [" << term << "] lang [" << + getStemLang() << "]\n"); sterm.clear(); oexp.clear(); if (term.empty()) return true; + if (mods & SDCM_PATHELT) { + // Path elements are special: no stemming or synonym expansion, + // and they are case- and diacritics-sensitive. 
+ mods |= SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS|SDCM_NOSYNS; + } + bool maxexpissoft = false; int maxexpand = getSoftMaxExp(); if (maxexpand != -1) { @@ -420,14 +438,15 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, // No stem expansion if there are wildcards or if prevented by caller bool nostemexp = (mods & SDCM_NOSTEMMING) != 0; if (haswild || getStemLang().empty()) { - LOGDEB2("expandTerm: found wildcards or stemlang empty: no exp\n" ); + LOGDEB2("expandTerm: found wildcards or stemlang empty: no exp\n"); nostemexp = true; } bool diac_sensitive = (mods & SDCM_DIACSENS) != 0; bool case_sensitive = (mods & SDCM_CASESENS) != 0; bool synonyms = (mods & SDCM_NOSYNS) == 0; - + bool pathelt = (mods & SDCM_PATHELT) != 0; + // noexpansion can be modified further down by possible case/diac expansion bool noexpansion = nostemexp && !haswild && !synonyms; @@ -442,7 +461,7 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, // performed (conversion+comparison) will automatically ignore // accented characters which are actually a separate letter if (getAutoDiac() && unachasaccents(term)) { - LOGDEB0("expandTerm: term has accents -> diac-sensitive\n" ); + LOGDEB0("expandTerm: term has accents -> diac-sensitive\n"); diac_sensitive = true; } @@ -453,13 +472,14 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, Utf8Iter it(term); it++; if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) { - LOGDEB0("expandTerm: term has uppercase -> case-sensitive\n" ); + LOGDEB0("expandTerm: term has uppercase -> case-sensitive\n"); case_sensitive = true; } // If we are sensitive to case or diacritics turn stemming off if (diac_sensitive || case_sensitive) { - LOGDEB0("expandTerm: diac or case sens set -> stemexpand and synonyms off\n" ); + LOGDEB0("expandTerm: diac or case sens set -> stemexpand and " + "synonyms off\n"); nostemexp = true; synonyms = false; } @@ -472,7 +492,7 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, if (noexpansion) { 
oexp.push_back(prefix + term); m_hldata.terms[term] = term; - LOGDEB("ExpandTerm: noexpansion: final: " << (stringsToString(oexp)) << "\n" ); + LOGDEB("ExpandTerm: noexpansion: final: "<::const_iterator it = res.entries.begin(); - it != res.entries.end(); it++) { - oexp.push_back(it->term); + for (const auto& entry : res.entries) { + oexp.push_back(entry.term); } // If the term does not exist at all in the db, the return from // termMatch() is going to be empty, which is not what we want (we @@ -509,11 +529,10 @@ bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, oexp.push_back(prefix + term); // Remember the uterm-to-expansion links - for (vector::const_iterator it = oexp.begin(); - it != oexp.end(); it++) { - m_hldata.terms[strip_prefix(*it)] = term; + for (const auto& entry : oexp) { + m_hldata.terms[strip_prefix(entry)] = term; } - LOGDEB("ExpandTerm: final: " << (stringsToString(oexp)) << "\n" ); + LOGDEB("ExpandTerm: final: " << stringsToString(oexp) << "\n"); return true; } @@ -951,7 +970,7 @@ bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p) #endif if (ltext.empty()) { - LOGERR("SearchDataClausePath: empty path??\n" ); + LOGERR("SearchDataClausePath: empty path??\n"); m_reason = "Empty path ?"; return false; } @@ -971,8 +990,7 @@ bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p) string sterm; vector exp; - if (!expandTerm(db, m_reason, - SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS, + if (!expandTerm(db, m_reason, SDCM_PATHELT, *pit, exp, sterm, wrap_prefix(pathelt_prefix))) { return false; }