From b4493ed9e1a608e1c70842f61e2fad805a9e780f Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 30 Dec 2017 08:43:14 +0100 Subject: [PATCH] Snippets generation: add method for generating from doc stored text. Still needs refining, esp. for phrase/near --- src/Makefile.am | 1 + src/common/rclconfig.cpp | 11 + src/common/rclconfig.h | 10 + src/rcldb/rclabsfromtext.cpp | 298 ++++++++++++++++ src/rcldb/rclabstract.cpp | 638 +++++++++++++++++++---------------- src/rcldb/rcldb.cpp | 20 +- src/rcldb/rcldb.h | 10 +- src/rcldb/rcldb_p.h | 7 + src/rcldb/rclquery_p.h | 53 +++ src/sampleconf/recoll.conf | 17 +- 10 files changed, 762 insertions(+), 303 deletions(-) create mode 100644 src/rcldb/rclabsfromtext.cpp diff --git a/src/Makefile.am b/src/Makefile.am index f4294f41..75128525 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -154,6 +154,7 @@ rcldb/daterange.h \ rcldb/expansiondbs.cpp \ rcldb/expansiondbs.h \ rcldb/rclabstract.cpp \ +rcldb/rclabsfromtext.cpp \ rcldb/rcldb.cpp \ rcldb/rcldb.h \ rcldb/rcldb_p.h \ diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index a2bf8d70..e7a0234c 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -60,6 +60,16 @@ using namespace std; // We default to a case- and diacritics-less index for now bool o_index_stripchars = true; +// Store document text in index. Allows extracting snippets from text +// instead of building them from index position data. Has become +// necessary for versions of Xapian 1.6, which have dropped support +// for the chert index format, and adopted a setup which renders our +// use of positions list unacceptably slow in cases. 'raw' text here +// means that the text is not stripped of upper-case, diacritics, or +// punctuation signs. It is still translated from its original format +// to UTF-8 plain text. 
+bool o_index_storerawtext = false; + bool o_uptodate_test_use_mtime = false; string RclConfig::o_localecharset; @@ -391,6 +401,7 @@ bool RclConfig::updateMainConfig() static int m_index_stripchars_init = 0; if (!m_index_stripchars_init) { getConfParam("indexStripChars", &o_index_stripchars); + getConfParam("indexStoreRawText", &o_index_storerawtext); getConfParam("testmodifusemtime", &o_uptodate_test_use_mtime); m_index_stripchars_init = 1; } diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index a4539468..19a9381a 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -438,6 +438,16 @@ class RclConfig { // reset. When using multiple indexes, all must have the same value extern bool o_index_stripchars; +// Store document text in index. Allows extracting snippets from text +// instead of building them from index position data. Has become +// necessary for versions of Xapian 1.6, which have dropped support +// for the chert index format, and adopted a setup which renders our +// use of positions list unacceptably slow in cases. 'raw' text here +// means that the text is not stripped of upper-case, diacritics, or +// punctuation signs. It is still translated from its original format +// to UTF-8 plain text. +extern bool o_index_storerawtext; + // This global variable defines if we use mtime instead of ctime for // up-to-date tests. This is mostly incompatible with xattr indexing, // in addition to other issues. See recoll.conf comments. diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp new file mode 100644 index 00000000..81c1cf9c --- /dev/null +++ b/src/rcldb/rclabsfromtext.cpp @@ -0,0 +1,298 @@ +/* Copyright (C) 2004-2017 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include "autoconfig.h" + +#include + +#include +#include +#include +#include + +#include "log.h" +#include "rcldb.h" +#include "rcldb_p.h" +#include "rclquery.h" +#include "rclquery_p.h" +#include "textsplit.h" +#include "hldata.h" +#include "chrono.h" +#include "unacpp.h" +#include "zlibut.h" + +using namespace std; + + +namespace Rcl { + +#warning NEAR and PHRASE + +// Text splitter for finding the match terms in the doc text. +class TextSplitABS : public TextSplit { +public: + + struct MatchEntry { + // Start/End byte offsets of fragment in the document text + int start; + int stop; + double coef; + // Position of the first matched term. + unsigned int hitpos; + // "best term" for this match + string term; + // Hilight areas (each is one or several contiguous match terms). + vector> hlzones; + + MatchEntry(int sta, int sto, double c, vector>& hl, + unsigned int pos, string& trm) + : start(sta), stop(sto), coef(c), hitpos(pos) { + hlzones.swap(hl); + term.swap(trm); + } + }; + + + TextSplitABS(const vector& matchTerms, + unordered_map& wordcoefs, + unsigned int ctxwords, + Flags flags = TXTS_NONE) + : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()), + m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) { + LOGDEB("TextSPlitABS: ctxwords " << ctxwords << endl); + } + + // Accept a word and its position. If the word is a matched term, + // add/update fragment definition. 
+ virtual bool takeword(const std::string& term, int pos, int bts, int bte) { + LOGDEB2("takeword: " << term << endl); + + // Recent past + m_prevterms.push_back(pair(bts,bte)); + if (m_prevterms.size() > m_ctxwords+1) { + m_prevterms.pop_front(); + } + + string dumb; + if (o_index_stripchars) { + if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) { + LOGINFO("abstract: unac failed for [" << term << "]\n"); + return true; + } + } else { + dumb = term; + } + + if (m_terms.find(dumb) != m_terms.end()) { + // This word is a search term. Extend or create fragment + LOGDEB2("match: [" << dumb << "] current: " << m_curfrag.first << + ", " << m_curfrag.second << " remain " << + m_remainingWords << endl); + double coef = m_wordcoefs[dumb]; + if (!m_remainingWords) { + // No current fragment + m_curhitpos = baseTextPosition + pos; + m_curfrag.first = m_prevterms.front().first; + m_curfrag.second = m_prevterms.back().second; + m_curhlzones.push_back(pair(bts, bte)); + m_curterm = term; + m_curtermcoef = coef; + } else { + LOGDEB2("Extending current fragment: " << m_remainingWords << + " -> " << m_ctxwords << endl); + m_extcount++; + if (m_prevwordhit) { + m_curhlzones.back().second = bte; + } else { + m_curhlzones.push_back(pair(bts, bte)); + } + if (coef > m_curtermcoef) { + m_curterm = term; + m_curtermcoef = coef; + } + } + m_prevwordhit = true; + m_curfragcoef += coef; + m_remainingWords = m_ctxwords + 1; + if (m_extcount > 3) { + // Limit expansion of contiguous fragments (this is to + // avoid common terms in search causing long + // heavyweight meaningless fragments. Also, limit length). + m_remainingWords = 1; + m_extcount = 0; + } + } else { + m_prevwordhit = false; + } + + + if (m_remainingWords) { + // Fragment currently open. Time to close ? 
+ m_remainingWords--; + m_curfrag.second = bte; + if (m_remainingWords == 0) { + if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) { + // Don't push bad fragments if we have a lot already + m_fragments.push_back(MatchEntry(m_curfrag.first, + m_curfrag.second, + m_curfragcoef, + m_curhlzones, + m_curhitpos, + m_curterm + )); + } + m_totalcoef += m_curfragcoef; + m_curfragcoef = 0.0; + m_curtermcoef = 0.0; + } + } + return true; + } + const vector& getFragments() { + return m_fragments; + } + +private: + // Past terms because we need to go back for context before a hit + deque> m_prevterms; + // Data about the fragment we are building + pair m_curfrag{0,0}; + double m_curfragcoef{0.0}; + unsigned int m_remainingWords{0}; + unsigned int m_extcount{0}; + vector> m_curhlzones; + bool m_prevwordhit{false}; + // Current sum of fragment weights + double m_totalcoef{0.0}; + // Position of 1st term match (for page number computations) + unsigned int m_curhitpos{0}; + // "best" term + string m_curterm; + double m_curtermcoef{0.0}; + + // Input + set m_terms; + unordered_map& m_wordcoefs; + unsigned int m_ctxwords; + + // Result: begin and end byte positions of query terms/groups in text + vector m_fragments; +}; + +int Query::Native::abstractFromText( + Rcl::Db::Native *ndb, + Xapian::docid docid, + const vector& matchTerms, + const multimap> byQ, + double totalweight, + int ctxwords, + unsigned int maxtotaloccs, + vector& vabs, + Chrono& + ) +{ + Xapian::Database& xrdb(ndb->xrdb); + Xapian::Document xdoc; + + string reason; + XAPTRY(xdoc = xrdb.get_document(docid), xrdb, reason); + if (!reason.empty()) { + LOGERR("abstractFromText: could not get doc: " << reason << endl); + return ABSRES_ERROR; + } + + string rawtext, data; +#ifdef RAWTEXT_IN_DATA + XAPTRY(data = xdoc.get_data(), xrdb, reason); + if (!reason.empty()) { + LOGERR("abstractFromText: could not get data: " << reason << endl); + return ABSRES_ERROR; + } + Doc doc; + if (ndb->dbDataToRclDoc(docid, data, doc)) { + 
rawtext = doc.meta["RAWTEXT"]; + } +#endif +#ifdef RAWTEXT_IN_VALUE + XAPTRY(rawtext = xdoc.get_value(VALUE_RAWTEXT), xrdb, reason); + if (!reason.empty()) { + LOGERR("abstractFromText: could not get value: " << reason << endl); + return ABSRES_ERROR; + } + ZLibUtBuf cbuf; + inflateToBuf(rawtext.c_str(), rawtext.size(), cbuf); + rawtext.assign(cbuf.getBuf(), cbuf.getCnt()); +#endif + + if (rawtext.empty()) { + LOGDEB0("abstractFromText: no text\n"); + return ABSRES_ERROR; + } + + // tryout the xapian internal method. +#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \ + (defined(RAWTEXT_IN_DATA) || defined(RAWTEXT_IN_VALUE)) + string snippet = xmset.snippet(rawtext); + LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n"); +#endif + + // We need the q coefs for individual terms + unordered_map wordcoefs; + for (const auto& mment : byQ) { + for (const auto& word : mment.second) { + wordcoefs[word] = mment.first; + } + } + TextSplitABS splitter(matchTerms, wordcoefs, ctxwords, + TextSplit::TXTS_ONLYSPANS); + splitter.text_to_words(rawtext); + const vector& res1 = splitter.getFragments(); + vector result(res1.begin(), res1.end()); + std::sort(result.begin(), result.end(), + [](const TextSplitABS::MatchEntry& a, + const TextSplitABS::MatchEntry& b) -> bool { + return a.coef > b.coef; + } + ); + + static const string cstr_nc("\n\r\x0c\\"); + vector vpbreaks; + ndb->getPagePositions(docid, vpbreaks); + unsigned int count = 0; + for (const auto& entry : result) { + string frag = neutchars( + rawtext.substr(entry.start, entry.stop - entry.start), cstr_nc); +#if 0 + static const string starthit(""); + static const string endhit(""); + size_t inslen = 0; + for (const auto& hlzone: entry.hlzones) { + frag.replace(hlzone.first - entry.start + inslen, 0, starthit); + inslen += starthit.size(); + frag.replace(hlzone.second - entry.start + inslen, 0, endhit); + inslen += endhit.size(); + } +#endif + LOGDEB("=== FRAGMENT: Coef: " << entry.coef << ": " << frag 
<< endl); + int page = ndb->getPageNumberForPosition(vpbreaks, entry.hitpos); + vabs.push_back(Snippet(page, frag).setTerm(entry.term)); + if (count++ >= maxtotaloccs) + break; + } + return ABSRES_OK; +} + +} diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp index 41836d8a..a24eb5b9 100644 --- a/src/rcldb/rclabstract.cpp +++ b/src/rcldb/rclabstract.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2004 J.F.Dockes +/* Copyright (C) 2004-2017 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -19,6 +19,9 @@ #include #include +#include +#include +#include #include "log.h" #include "rcldb.h" @@ -33,30 +36,22 @@ using namespace std; + namespace Rcl { + // This is used as a marker inside the abstract frag lists, but // normally doesn't remain in final output (which is built with a // custom sep. by our caller). static const string cstr_ellipsis("..."); +static const string emptys; // This is used to mark positions overlapped by a multi-word match term static const string occupiedmarker("?"); -#undef DEBUGABSTRACT +#define DEBUGABSTRACT #ifdef DEBUGABSTRACT #define LOGABS LOGDEB -static void listList(const string& what, const vector&l) -{ - string a; - for (vector::const_iterator it = l.begin(); it != l.end(); it++) { - a = a + *it + " "; - } - LOGDEB("" << what << ": " << a << "\n"); -} #else #define LOGABS LOGDEB2 -static void listList(const string&, const vector&) -{ -} #endif // Unprefix terms. 
Actually it's not completely clear if we should @@ -66,13 +61,12 @@ static void listList(const string&, const vector&) static const bool prune_prefixed_terms = true; static void noPrefixList(const vector& in, vector& out) { - for (vector::const_iterator qit = in.begin(); - qit != in.end(); qit++) { + for (const auto& term : in) { if (prune_prefixed_terms) { - if (has_prefix(*qit)) + if (has_prefix(term)) continue; } - out.push_back(strip_prefix(*qit)); + out.push_back(strip_prefix(term)); } sort(out.begin(), out.end()); vector::iterator it = unique(out.begin(), out.end()); @@ -117,18 +111,17 @@ void Query::Native::setDbWideQTermsFreqs() m_q->getQueryTerms(iqterms); noPrefixList(iqterms, qterms); } - // listList("Query terms: ", qterms); + LOGDEB("Query terms: " << stringsToString(qterms) << endl); Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb; double doccnt = xrdb.get_doccount(); if (doccnt == 0) doccnt = 1; - for (vector::const_iterator qit = qterms.begin(); - qit != qterms.end(); qit++) { - termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt; - LOGABS("setDbWideQTermFreqs: [" << *qit << "] db freq " << - termfreqs[*qit] << "\n"); + for (const auto& term : qterms) { + termfreqs[term] = xrdb.get_termfreq(term) / doccnt; + LOGABS("setDbWideQTermFreqs: [" << term << "] db freq " << + termfreqs[term] << "\n"); } } @@ -162,36 +155,29 @@ double Query::Native::qualityTerms(Xapian::docid docid, m_q->m_sd->getTerms(hld); } -#ifdef DEBUGABSTRACT - { - string deb; - hld.toString(deb); - LOGABS("qualityTerms: hld: " << deb << "\n"); - } -#endif - - // Group the input terms by the user term they were possibly expanded from + // Group the input terms by the user term they were possibly + // expanded from (by stemming) map > byRoot; - for (vector::const_iterator qit = terms.begin(); - qit != terms.end(); qit++) { - map::const_iterator eit = hld.terms.find(*qit); + for (const auto& term: terms) { + map::const_iterator eit = hld.terms.find(term); if (eit != hld.terms.end()) { - 
byRoot[eit->second].push_back(*qit); + byRoot[eit->second].push_back(term); } else { - LOGDEB0("qualityTerms: [" << *qit << "] not found in hld\n"); - byRoot[*qit].push_back(*qit); + LOGDEB0("qualityTerms: [" << term << "] not found in hld\n"); + byRoot[term].push_back(term); } } #ifdef DEBUGABSTRACT { + string deb; + hld.toString(deb); + LOGABS("qualityTerms: hld: " << deb << "\n"); string byRootstr; - for (map >::const_iterator debit = - byRoot.begin(); debit != byRoot.end(); debit++) { - byRootstr.append("[").append(debit->first).append("]->"); - for (vector::const_iterator it = debit->second.begin(); - it != debit->second.end(); it++) { - byRootstr.append("[").append(*it).append("] "); + for (const auto& entry : byRoot) { + byRootstr.append("[").append(entry.first).append("]->"); + for (const auto& term : entry.second) { + byRootstr.append("[").append(term).append("] "); } byRootstr.append("\n"); } @@ -202,28 +188,25 @@ double Query::Native::qualityTerms(Xapian::docid docid, // Compute in-document and global frequencies for the groups. 
map grpwdfs; map grptfreqs; - for (map >::const_iterator git = byRoot.begin(); - git != byRoot.end(); git++) { - for (vector::const_iterator qit = git->second.begin(); - qit != git->second.end(); qit++) { - Xapian::TermIterator term = xrdb.termlist_begin(docid); - term.skip_to(*qit); - if (term != xrdb.termlist_end(docid) && *term == *qit) { - if (grpwdfs.find(git->first) != grpwdfs.end()) { - grpwdfs[git->first] = term.get_wdf() / doclen; - grptfreqs[git->first] = termfreqs[*qit]; + for (const auto& group : byRoot) { + for (const auto& term : group.second) { + Xapian::TermIterator xtermit = xrdb.termlist_begin(docid); + xtermit.skip_to(term); + if (xtermit != xrdb.termlist_end(docid) && *xtermit == term) { + if (grpwdfs.find(group.first) != grpwdfs.end()) { + grpwdfs[group.first] = xtermit.get_wdf() / doclen; + grptfreqs[group.first] = termfreqs[term]; } else { - grpwdfs[git->first] += term.get_wdf() / doclen; - grptfreqs[git->first] += termfreqs[*qit]; + grpwdfs[group.first] += xtermit.get_wdf() / doclen; + grptfreqs[group.first] += termfreqs[term]; } } } } // Build a sorted by quality container for the groups - for (map >::const_iterator git = byRoot.begin(); - git != byRoot.end(); git++) { - double q = (grpwdfs[git->first]) * grptfreqs[git->first]; + for (const auto& group : byRoot) { + double q = (grpwdfs[group.first]) * grptfreqs[group.first]; q = -log10(q); if (q < 3) { q = 0.05; @@ -237,22 +220,19 @@ double Query::Native::qualityTerms(Xapian::docid docid, q = 1; } totalweight += q; - byQ.insert(pair >(q, git->second)); + byQ.insert(pair >(q, group.second)); } #ifdef DEBUGABSTRACT - for (multimap >::reverse_iterator mit= byQ.rbegin(); - mit != byQ.rend(); mit++) { - LOGABS("qualityTerms: group\n"); - for (vector::const_iterator qit = mit->second.begin(); - qit != mit->second.end(); qit++) { - LOGABS("" << mit->first << "->[" << *qit << "]\n"); - } + for (auto mit= byQ.rbegin(); mit != byQ.rend(); mit++) { + LOGABS("qualityTerms: coef: " << mit->first << " 
group: " << + stringsToString(mit->second) << endl); } #endif return totalweight; } + // Return page number for first match of "significant" term. int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term) { @@ -283,8 +263,7 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term) multimap > byQ; qualityTerms(docid, terms, byQ); - for (multimap >::reverse_iterator mit = byQ.rbegin(); - mit != byQ.rend(); mit++) { + for (auto mit = byQ.rbegin(); mit != byQ.rend(); mit++) { for (vector::const_iterator qit = mit->second.begin(); qit != mit->second.end(); qit++) { string qterm = *qit; @@ -307,55 +286,211 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term) return -1; } -// Build a document abstract by extracting text chunks around the query terms -// This uses the db termlists, not the original document. -// -// DatabaseModified and other general exceptions are catched and -// possibly retried by our caller -int Query::Native::makeAbstract(Xapian::docid docid, - vector& vabs, - int imaxoccs, int ictxwords) +// Creating the abstract from index position data: populate the sparse +// array with the positions for a given query term, and mark the +// neighboring positions. 
+void Query::Native::abstractPopulateQTerm( + Xapian::Database& xrdb, + Xapian::docid docid, + const string& qterm, + int qtrmwrdcnt, + int ctxwords, + unsigned int maxgrpoccs, + unsigned int maxtotaloccs, + map& sparseDoc, + unordered_set& searchTermPositions, + unsigned int& maxpos, + unsigned int& totaloccs, + unsigned int& grpoccs, + int& ret + ) { - Chrono chron; - LOGABS("makeAbstract: docid " << docid << " imaxoccs " << - imaxoccs << " ictxwords " << ictxwords << "\n"); + Xapian::PositionIterator pos; - // The (unprefixed) terms matched by this document - vector matchedTerms; - getMatchTerms(docid, matchedTerms); - if (matchedTerms.empty()) { - LOGDEB("makeAbstract:"<m_db->getAbsCtxLen(); + for (unsigned int ii = sta; ii <= sto; ii++) { + if (ii == (unsigned int)ipos) { + sparseDoc[ii] = qterm; + searchTermPositions.insert(ii); + if (ii > maxpos) + maxpos = ii; + } else if (ii > (unsigned int)ipos && + ii < (unsigned int)ipos + qtrmwrdcnt) { + // Position for another word of the multi-word term + sparseDoc[ii] = occupiedmarker; + } else if (!sparseDoc[ii].compare(cstr_ellipsis)) { + // For an empty slot, the test above has a side + // effect of inserting an empty string which + // is what we want. Do it also if it was an ellipsis + sparseDoc[ii] = emptys; + } + } + // Add ellipsis at the end. This may be replaced later by + // an overlapping extract. Take care not to replace an + // empty string here, we really want an empty slot, + // use find() + if (sparseDoc.find(sto+1) == sparseDoc.end()) { + sparseDoc[sto+1] = cstr_ellipsis; + } + + // Group done ? + if (grpoccs >= maxgrpoccs) { + ret |= ABSRES_TRUNC; + LOGABS("Db::makeAbstract: max group occs cutoff\n"); + break; + } + // Global done ? 
+ if (totaloccs >= maxtotaloccs) { + ret |= ABSRES_TRUNC; + LOGABS("Db::makeAbstract: max occurrences cutoff\n"); + break; + } } +} - listList("Match terms: ", matchedTerms); +// Creating the abstract from index position data: after the query +// terms have been inserted at their place in the sparse array, and +// the neighboring positions marked, populate the neighbours: for each +// term in the document, walk its position list and populate slots +// around the query terms. We arbitrarily truncate the list to avoid +// taking forever. If we do cutoff, the abstract may be inconsistant +// (missing words, potentially altering meaning), which is bad. +void Query::Native::abstractPopulateContextTerms( + Xapian::Database& xrdb, + Xapian::docid docid, + unsigned int maxpos, + map& sparseDoc, + int& ret + ) +{ + Xapian::TermIterator term; + int cutoff = m_q->m_snipMaxPosWalk; + for (term = xrdb.termlist_begin(docid); + term != xrdb.termlist_end(docid); term++) { + // Ignore prefixed terms + if (has_prefix(*term)) + continue; + if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) { + ret |= ABSRES_TERMMISS; + LOGDEB0("makeAbstract: max term count cutoff " << + m_q->m_snipMaxPosWalk << "\n"); + break; + } - // Retrieve the term frequencies for the query terms. This is - // actually computed only once for a query, and for all terms in - // the query (not only the matches for this doc) - setDbWideQTermsFreqs(); - - // Build a sorted by quality container for the match terms We are - // going to try and show text around the less common search terms. - // Terms issued from an original one by stem expansion are - // aggregated by the qualityTerms() routine. 
- multimap > byQ; - double totalweight = qualityTerms(docid, matchedTerms, byQ); - LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n"); - // This can't happen, but would crash us - if (totalweight == 0.0) { - LOGERR("makeAbstract:"<::iterator vit; + Xapian::PositionIterator pos; + for (pos = xrdb.positionlist_begin(docid, *term); + pos != xrdb.positionlist_end(docid, *term); pos++) { + if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) { + ret |= ABSRES_TERMMISS; + LOGDEB0("makeAbstract: max term count cutoff " << + m_q->m_snipMaxPosWalk << "\n"); + break; + } + // If we are beyond the max possible position, stop + // for this term + if (*pos > maxpos) { + break; + } + if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) { + // Don't replace a term: the terms list is in + // alphabetic order, and we may have several terms + // at the same position, we want to keep only the + // first one (ie: dockes and dockes@wanadoo.fr) + if (vit->second.empty()) { + LOGDEB2("makeAbstract: populating: [" << *term << + "] at " << *pos << "\n"); + sparseDoc[*pos] = *term; + } + } + } } +} - Rcl::Db::Native *ndb(m_q->m_db->m_ndb); +// Creating the abstract from position data: final phase: extract the +// snippets from the sparse array. 
+void Query::Native::abstractCreateSnippetsVector( + Rcl::Db::Native *ndb, + map& sparseDoc, + unordered_set& searchTermPositions, + vector& vpbreaks, + vector& vabs) +{ + vabs.clear(); + string chunk; + bool incjk = false; + int page = 0; + string term; + + for (const auto& ent : sparseDoc) { + LOGDEB2("Abtract:output "<< ent.first <<" -> [" <getPageNumberForPosition(vpbreaks, ent.first); + if (page < 0) + page = 0; + term.clear(); + } + Utf8Iter uit(ent.second); + bool newcjk = false; + if (TextSplit::isCJK(*uit)) + newcjk = true; + if (!incjk || (incjk && !newcjk)) + chunk += " "; + incjk = newcjk; + if (searchTermPositions.find(ent.first) != searchTermPositions.end()) + term = ent.second; + if (ent.second == cstr_ellipsis) { + vabs.push_back(Snippet(page, chunk).setTerm(term)); + chunk.clear(); + } else { + if (ent.second.compare(end_of_field_term) && + ent.second.compare(start_of_field_term)) + chunk += ent.second; + } + } + if (!chunk.empty()) + vabs.push_back(Snippet(page, chunk).setTerm(term)); +} + +// Creating the abstract from index position data: top level routine +int Query::Native::abstractFromIndex( + Rcl::Db::Native *ndb, + Xapian::docid docid, + const vector& matchTerms, + const multimap> byQ, + double totalweight, + int ctxwords, + unsigned int maxtotaloccs, + vector& vabs, + Chrono& chron + ) +{ Xapian::Database& xrdb(ndb->xrdb); - - /////////////////// - // For each of the query terms, ask xapian for its positions list - // in the document. For each position entry, insert it and its - // neighbours in the set of 'interesting' positions - + int ret = ABSRES_OK; // The terms 'array' that we partially populate with the document // terms, at their positions around the search terms positions: map sparseDoc; @@ -370,22 +505,12 @@ int Query::Native::makeAbstract(Xapian::docid docid, // Total number of occurences for all terms. We stop when we have too much unsigned int totaloccs = 0; - // Total number of slots we populate. 
The 7 is taken as - // average word size. It was a mistake to have the user max - // abstract size parameter in characters, we basically only deal - // with words. We used to limit the character size at the end, but - // this damaged our careful selection of terms - const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs : - m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1)); - int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords; - LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " << - maxtotaloccs << " ctxwords " << ctxwords << "\n"); - - int ret = ABSRES_OK; - - // Let's go populate - for (multimap >::reverse_iterator mit = byQ.rbegin(); - mit != byQ.rend(); mit++) { + // First pass to populate the sparse document: we walk the term + // groups, beginning with the better ones, and insert each term at + // its position. We also insert empty strings at the surrounding + // positions. These are markers showing where we should insert + // data during the next pass. + for (auto mit = byQ.rbegin(); mit != byQ.rend(); mit++) { unsigned int maxgrpoccs; double q; if (byQ.size() == 1) { @@ -398,87 +523,30 @@ int Query::Native::makeAbstract(Xapian::docid docid, } unsigned int grpoccs = 0; - for (vector::const_iterator qit = mit->second.begin(); - qit != mit->second.end(); qit++) { - - // Group done ? + // For each term in user term expansion group + for (const auto& qterm : mit->second) { + // Enough for this group ? if (grpoccs >= maxgrpoccs) break; - string qterm = *qit; - LOGABS("makeAbstract: [" << qterm << "] " << maxgrpoccs << " max grp occs (coef " << q << ")\n"); - // The match term may span several words + // The match term may span several words (more than one position) int qtrmwrdcnt = TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS); - Xapian::PositionIterator pos; + // Populate positions for this query term. // There may be query terms not in this doc. 
This raises an // exception when requesting the position list, we catch it ?? // Not clear how this can happen because we are walking the // match list returned by Xapian. Maybe something with the // fields? - string emptys; try { - for (pos = xrdb.positionlist_begin(docid, qterm); - pos != xrdb.positionlist_end(docid, qterm); pos++) { - int ipos = *pos; - if (ipos < int(baseTextPosition)) // Not in text body - continue; - LOGABS("makeAbstract: [" << qterm << "] at pos " << - ipos << " grpoccs " << grpoccs << " maxgrpoccs " << - maxgrpoccs << "\n"); - - totaloccs++; - grpoccs++; - - // Add adjacent slots to the set to populate at next - // step by inserting empty strings. Special provisions - // for adding ellipsis and for positions overlapped by - // the match term. - unsigned int sta = MAX(int(baseTextPosition), - ipos - ctxwords); - unsigned int sto = ipos + qtrmwrdcnt-1 + - m_q->m_db->getAbsCtxLen(); - for (unsigned int ii = sta; ii <= sto; ii++) { - if (ii == (unsigned int)ipos) { - sparseDoc[ii] = qterm; - searchTermPositions.insert(ii); - if (ii > maxpos) - maxpos = ii; - } else if (ii > (unsigned int)ipos && - ii < (unsigned int)ipos + qtrmwrdcnt) { - sparseDoc[ii] = occupiedmarker; - } else if (!sparseDoc[ii].compare(cstr_ellipsis)) { - // For an empty slot, the test has a side - // effect of inserting an empty string which - // is what we want. - sparseDoc[ii] = emptys; - } - } - // Add ellipsis at the end. This may be replaced later by - // an overlapping extract. Take care not to replace an - // empty string here, we really want an empty slot, - // use find() - if (sparseDoc.find(sto+1) == sparseDoc.end()) { - sparseDoc[sto+1] = cstr_ellipsis; - } - - // Group done ? - if (grpoccs >= maxgrpoccs) { - ret |= ABSRES_TRUNC; - LOGABS("Db::makeAbstract: max group occs cutoff\n"); - break; - } - // Global done ? 
- if (totaloccs >= maxtotaloccs) { - ret |= ABSRES_TRUNC; - LOGABS("Db::makeAbstract: max occurrences cutoff\n"); - break; - } - } + abstractPopulateQTerm(xrdb, docid, qterm, qtrmwrdcnt, ctxwords, + maxgrpoccs,maxtotaloccs, sparseDoc, + searchTermPositions, maxpos, totaloccs, + grpoccs, ret); } catch (...) { // Term does not occur. No problem. } @@ -494,6 +562,7 @@ int Query::Native::makeAbstract(Xapian::docid docid, LOGABS("makeAbstract:" << chron.millis() << "mS:chosen number of positions " << totaloccs << "\n"); + // This can happen if there are term occurences in the keywords // etc. but not elsewhere ? if (totaloccs == 0) { @@ -501,124 +570,95 @@ int Query::Native::makeAbstract(Xapian::docid docid, return ABSRES_OK; } - // Walk all document's terms position lists and populate slots - // around the query terms. We arbitrarily truncate the list to - // avoid taking forever. If we do cutoff, the abstract may be - // inconsistant (missing words, potentially altering meaning), - // which is bad. 
- { - Xapian::TermIterator term; - int cutoff = m_q->m_snipMaxPosWalk; - for (term = xrdb.termlist_begin(docid); - term != xrdb.termlist_end(docid); term++) { - // Ignore prefixed terms - if (has_prefix(*term)) - continue; - if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) { - ret |= ABSRES_TERMMISS; - LOGDEB0("makeAbstract: max term count cutoff " << - m_q->m_snipMaxPosWalk << "\n"); - break; - } - - map::iterator vit; - Xapian::PositionIterator pos; - for (pos = xrdb.positionlist_begin(docid, *term); - pos != xrdb.positionlist_end(docid, *term); pos++) { - if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) { - ret |= ABSRES_TERMMISS; - LOGDEB0("makeAbstract: max term count cutoff " << - m_q->m_snipMaxPosWalk << "\n"); - break; - } - // If we are beyond the max possible position, stop - // for this term - if (*pos > maxpos) { - break; - } - if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) { - // Don't replace a term: the terms list is in - // alphabetic order, and we may have several terms - // at the same position, we want to keep only the - // first one (ie: dockes and dockes@wanadoo.fr) - if (vit->second.empty()) { - LOGDEB2("makeAbstract: populating: [" << *term << - "] at " << *pos << "\n"); - sparseDoc[*pos] = *term; - } - } - } - } - } + abstractPopulateContextTerms(xrdb, docid, maxpos, sparseDoc, ret); + LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n"); -#if 0 - // Debug only: output the full term[position] vector - bool epty = false; - int ipos = 0; - for (map::iterator it = sparseDoc.begin(); - it != sparseDoc.end(); - it++, ipos++) { - if (it->empty()) { - if (!epty) - LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n"); - epty=true; - } else { - epty = false; - LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n"); - } - } -#endif - vector vpbreaks; ndb->getPagePositions(docid, vpbreaks); LOGABS("makeAbstract:" << chron.millis() << "mS: extracting. 
Got " << vpbreaks.size() << " pages\n"); - // Finally build the abstract by walking the map (in order of position) - vabs.clear(); - string chunk; - bool incjk = false; - int page = 0; - string term; - for (map::const_iterator it = sparseDoc.begin(); - it != sparseDoc.end(); it++) { - LOGDEB2("Abtract:output " << it->first << " -> [" << it->second << - "]\n"); - if (!occupiedmarker.compare(it->second)) { - LOGDEB("Abstract: qtrm position not filled ??\n"); - continue; - } - if (chunk.empty() && !vpbreaks.empty()) { - page = ndb->getPageNumberForPosition(vpbreaks, it->first); - if (page < 0) - page = 0; - term.clear(); - } - Utf8Iter uit(it->second); - bool newcjk = false; - if (TextSplit::isCJK(*uit)) - newcjk = true; - if (!incjk || (incjk && !newcjk)) - chunk += " "; - incjk = newcjk; - if (searchTermPositions.find(it->first) != searchTermPositions.end()) - term = it->second; - if (it->second == cstr_ellipsis) { - vabs.push_back(Snippet(page, chunk).setTerm(term)); - chunk.clear(); - } else { - if (it->second.compare(end_of_field_term) && - it->second.compare(start_of_field_term)) - chunk += it->second; - } - } - if (!chunk.empty()) - vabs.push_back(Snippet(page, chunk).setTerm(term)); + // Finally build the abstract by walking the map (in order of position) + abstractCreateSnippetsVector(ndb, sparseDoc, searchTermPositions, + vpbreaks, vabs); + LOGABS("makeAbtract: done in " << chron.millis() << " mS\n"); return ret; } +// Build a document abstract by extracting text chunks around the +// query terms. This can either uses the index position lists, or the +// stored document text, with very different implementations. +// +// DatabaseModified and other general exceptions are catched and +// possibly retried by our caller. +// +// @param[out] vabs the abstract is returned as a vector of snippets. 
+int Query::Native::makeAbstract(Xapian::docid docid, + vector& vabs, + int imaxoccs, int ictxwords) +{ + Chrono chron; + LOGABS("makeAbstract: docid " << docid << " imaxoccs " << + imaxoccs << " ictxwords " << ictxwords << "\n"); + + // The (unprefixed) terms matched by this document + vector matchedTerms; + getMatchTerms(docid, matchedTerms); + if (matchedTerms.empty()) { + LOGDEB("makeAbstract:" << chron.millis() << "mS:Empty term list\n"); + return ABSRES_ERROR; + } + + LOGDEB("Match terms: " << stringsToString(matchedTerms) << endl); + + // Retrieve the term frequencies for the query terms. This is + // actually computed only once for a query, and for all terms in + // the query (not only the matches for this doc) + setDbWideQTermsFreqs(); + + // Build a sorted by quality container for the match terms We are + // going to try and show text around the less common search terms. + // Terms issued from an original one by stem expansion are + // aggregated by the qualityTerms() routine (this is what we call + // 'term groups' in the following: index terms expanded from the + // same user term). + multimap> byQ; + double totalweight = qualityTerms(docid, matchedTerms, byQ); + LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n"); + // This can't happen, but would crash us + if (totalweight == 0.0) { + LOGERR("makeAbstract:"<m_db->m_ndb); + Xapian::Database& xrdb(ndb->xrdb); + + // Total number of slots we populate. The 7 is taken as + // average word size. It was a mistake to have the user max + // abstract size parameter in characters, we basically only deal + // with words. We used to limit the character size at the end, but + // this damaged our careful selection of terms + const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs : + m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1)); + int ctxwords = ictxwords == -1 ? 
m_q->m_db->getAbsCtxLen() : ictxwords; + LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " << + maxtotaloccs << " ctxwords " << ctxwords << "\n"); + + if (o_index_storerawtext) { + return abstractFromText(ndb, docid, matchedTerms, byQ, + totalweight, ctxwords, maxtotaloccs, vabs, + chron); + } else { + return abstractFromIndex(ndb, docid, matchedTerms, byQ, + totalweight, ctxwords, maxtotaloccs, vabs, + chron); + } +} + + } diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 4f35516c..8f278447 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -61,6 +61,7 @@ using namespace std; #ifdef RCL_USE_ASPELL #include "rclaspell.h" #endif +#include "zlibut.h" // Recoll index format version is stored in user metadata. When this change, // we can't open the db and will have to reindex. @@ -1458,8 +1459,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) #ifdef TEXTSPLIT_STATS splitter.resetStats(); #endif - if (!splitter.text_to_words(doc.text)) + if (!splitter.text_to_words(doc.text)) { LOGDEB("Db::addOrUpdate: split failed for main text\n"); + } else { +#ifdef RAWTEXT_IN_VALUE + if (o_index_storerawtext) { + ZLibUtBuf buf; + deflateToBuf(doc.text.c_str(), doc.text.size(), buf); + string tt; + tt.assign(buf.getBuf(), buf.getCnt()); + newdocument.add_value(VALUE_RAWTEXT, tt); + } +#endif + } #ifdef TEXTSPLIT_STATS // Reject bad data. 
unrecognized base64 text is characterized by @@ -1670,6 +1682,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) newdocument.add_boolean_term(wrap_prefix("XM") + *md5); } +#ifdef RAWTEXT_IN_DATA + if (o_index_storerawtext) { + RECORD_APPEND(record, string("RAWTEXT"), + neutchars(doc.text, cstr_nc)); + } +#endif LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n"); newdocument.set_data(record); } diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index e7485ad1..0c8cca61 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -67,8 +67,14 @@ enum value_slot { VALUE_MD5 = 1, // 16 byte MD5 checksum of original document. VALUE_SIZE = 2, // sortable_serialise() - // Recoll only: - VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size + ////////// Recoll only: + // Doc sig as chosen by app (ex: mtime+size + VALUE_SIG = 10, + // Doc extracted text, with punctuation: splitter input. Used for + // generating snippets. This is only used if RAWTEXT_IN_VALUE is + // defined (else the text goes to the data record), but reserve + // the value in any case. + VALUE_RAWTEXT= 11, }; class SearchData; diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h index f895d172..12302869 100644 --- a/src/rcldb/rcldb_p.h +++ b/src/rcldb/rcldb_p.h @@ -177,5 +177,12 @@ class Db::Native { // (abstract, keywords, etc.. are stored before this) static const unsigned int baseTextPosition = 100000; +// Store raw doc text in data record or value slot ? 
+#if 0 +#define RAWTEXT_IN_DATA 1 +#elif 1 +#define RAWTEXT_IN_VALUE 1 +#endif + } #endif /* _rcldb_p_h_included_ */ diff --git a/src/rcldb/rclquery_p.h b/src/rcldb/rclquery_p.h index 516e73cf..1508540a 100644 --- a/src/rcldb/rclquery_p.h +++ b/src/rcldb/rclquery_p.h @@ -20,10 +20,13 @@ #include #include #include +#include #include #include "rclquery.h" +class Chrono; + namespace Rcl { class Query::Native { @@ -58,6 +61,56 @@ public: double qualityTerms(Xapian::docid docid, const std::vector& terms, std::multimap >& byQ); + void abstractPopulateQTerm( + Xapian::Database& xrdb, + Xapian::docid docid, + const string& qterm, + int qtrmwrdcnt, + int ctxwords, + unsigned int maxgrpoccs, + unsigned int maxtotaloccs, + std::map& sparseDoc, + std::unordered_set& searchTermPositions, + unsigned int& maxpos, + unsigned int& totaloccs, + unsigned int& grpoccs, + int& ret + ); + void abstractPopulateContextTerms( + Xapian::Database& xrdb, + Xapian::docid docid, + unsigned int maxpos, + std::map& sparseDoc, + int& ret + ); + void abstractCreateSnippetsVector( + Db::Native *ndb, + std::map& sparseDoc, + std::unordered_set& searchTermPositions, + std::vector& vpbreaks, + std::vector& vabs); + int abstractFromIndex( + Rcl::Db::Native *ndb, + Xapian::docid docid, + const std::vector& matchTerms, + const std::multimap> byQ, + double totalweight, + int ctxwords, + unsigned int maxtotaloccs, + std::vector& vabs, + Chrono& chron + ); + int abstractFromText( + Rcl::Db::Native *ndb, + Xapian::docid docid, + const std::vector& matchTerms, + const std::multimap> byQ, + double totalweight, + int ctxwords, + unsigned int maxtotaloccs, + vector& vabs, + Chrono& chron + ); }; } diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index d3428178..a14424e0 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -214,7 +214,7 @@ membermaxkbs = 50000 # Parameters affecting how we generate -# terms +# terms and organize the index # Changing some of these 
parameters will imply a full
# reindex. Also, when using multiple indexes, it may not make sense

@@ -231,6 +231,21 @@ membermaxkbs = 50000
 # implies an index reset.
 indexStripChars = 1
 
+# Decide if we store the
+# documents' text content in the index. Storing the text
+# allows extracting snippets from it at query time,
+# instead of building them from index position data. This has become
+# necessary for versions of Xapian 1.6, which have dropped support
+# for the chert index format, and adopted a setup which renders our
+# use of positions list unacceptably slow in cases. 'raw' text here
+# means that the text is not stripped of upper-case, diacritics, or
+# punctuation signs. It is still translated from its original format
+# to UTF-8 plain text. This increases the index size by 10-20% typically,
+# but also allows for nicer snippets, so it may be worth enabling it even
+# if not strictly needed for performance if you can afford the space.
+#
+indexStoreRawText = 0
+
 # Decides if terms will be
 # generated for numbers.For example "123", "1.5e6",
 # 192.168.1.4, would not be indexed if nonumbers is set ("value123" would