Snippets generation: add method for generating from doc stored text. Still needs refining, esp. for phrase/near
This commit is contained in:
parent 040f62f1d2
commit b4493ed9e1
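The idea of the commit in miniature: when the raw document text is stored in the index, snippets can be built by scanning that text for the query terms and keeping a window of context words around each hit, instead of reconstructing word sequences from Xapian position lists. What follows is a deliberately simplified, self-contained sketch of that approach (plain whitespace splitting, no per-term weighting, no case or diacritics folding); the actual implementation is the TextSplitABS splitter added in src/rcldb/rclabsfromtext.cpp below.

#include <algorithm>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>

// Toy version of the new extraction path: scan the stored text for
// query terms and keep a window of ctxwords words around each hit.
// The committed TextSplitABS class works on byte offsets and weights
// fragments by term quality; none of that is reproduced here.
static std::vector<std::string> snippetsFromText(
    const std::string& text, const std::set<std::string>& qterms,
    size_t ctxwords)
{
    std::istringstream in(text);
    std::vector<std::string> words;
    for (std::string w; in >> w; )
        words.push_back(w);

    std::vector<std::string> out;
    for (size_t i = 0; i < words.size(); i++) {
        if (qterms.count(words[i]) == 0)
            continue;
        size_t sta = i > ctxwords ? i - ctxwords : 0;
        size_t sto = std::min(words.size(), i + ctxwords + 1);
        std::string frag;
        for (size_t j = sta; j < sto; j++)
            frag += (j == sta ? "" : " ") + words[j];
        out.push_back(frag);
        i = sto - 1; // skip past this window to avoid duplicate fragments
    }
    return out;
}

int main()
{
    const std::string doc("the quick brown fox jumps over the lazy dog");
    for (const auto& frag : snippetsFromText(doc, {"fox", "lazy"}, 2))
        std::cout << "... " << frag << " ...\n";
    return 0;
}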
@@ -154,6 +154,7 @@ rcldb/daterange.h \
rcldb/expansiondbs.cpp \
rcldb/expansiondbs.h \
rcldb/rclabstract.cpp \
rcldb/rclabsfromtext.cpp \
rcldb/rcldb.cpp \
rcldb/rcldb.h \
rcldb/rcldb_p.h \
@@ -60,6 +60,16 @@ using namespace std;
// We default to a case- and diacritics-less index for now
bool o_index_stripchars = true;

// Store document text in index. Allows extracting snippets from text
// instead of building them from index position data. Has become
// necessary for versions of Xapian 1.6, which have dropped support
// for the chert index format, and adopted a setup which renders our
// use of positions list unacceptably slow in cases. 'raw' text here
// means that the text is not stripped of upper-case, diacritics, or
// punctuation signs. It is still translated from its original format
// to UTF-8 plain text.
bool o_index_storerawtext = false;

bool o_uptodate_test_use_mtime = false;

string RclConfig::o_localecharset;
@@ -391,6 +401,7 @@ bool RclConfig::updateMainConfig()
    static int m_index_stripchars_init = 0;
    if (!m_index_stripchars_init) {
        getConfParam("indexStripChars", &o_index_stripchars);
        getConfParam("indexStoreRawText", &o_index_storerawtext);
        getConfParam("testmodifusemtime", &o_uptodate_test_use_mtime);
        m_index_stripchars_init = 1;
    }
@@ -438,6 +438,16 @@ class RclConfig {
// reset. When using multiple indexes, all must have the same value
extern bool o_index_stripchars;

// Store document text in index. Allows extracting snippets from text
// instead of building them from index position data. Has become
// necessary for versions of Xapian 1.6, which have dropped support
// for the chert index format, and adopted a setup which renders our
// use of positions list unacceptably slow in cases. 'raw' text here
// means that the text is not stripped of upper-case, diacritics, or
// punctuation signs. It is still translated from its original format
// to UTF-8 plain text.
extern bool o_index_storerawtext;

// This global variable defines if we use mtime instead of ctime for
// up-to-date tests. This is mostly incompatible with xattr indexing,
// in addition to other issues. See recoll.conf comments.
src/rcldb/rclabsfromtext.cpp (new file, 298 lines)
@@ -0,0 +1,298 @@
/* Copyright (C) 2004-2017 J.F.Dockes
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#include "autoconfig.h"

#include <math.h>

#include <map>
#include <unordered_map>
#include <deque>
#include <algorithm>

#include "log.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "rclquery.h"
#include "rclquery_p.h"
#include "textsplit.h"
#include "hldata.h"
#include "chrono.h"
#include "unacpp.h"
#include "zlibut.h"

using namespace std;


namespace Rcl {

#warning NEAR and PHRASE

// Text splitter for finding the match terms in the doc text.
class TextSplitABS : public TextSplit {
public:

    struct MatchEntry {
        // Start/End byte offsets of fragment in the document text
        int start;
        int stop;
        double coef;
        // Position of the first matched term.
        unsigned int hitpos;
        // "best term" for this match
        string term;
        // Hilight areas (each is one or several contiguous match terms).
        vector<pair<int,int>> hlzones;

        MatchEntry(int sta, int sto, double c, vector<pair<int,int>>& hl,
                   unsigned int pos, string& trm)
            : start(sta), stop(sto), coef(c), hitpos(pos) {
            hlzones.swap(hl);
            term.swap(trm);
        }
    };


    TextSplitABS(const vector<string>& matchTerms,
                 unordered_map<string, double>& wordcoefs,
                 unsigned int ctxwords,
                 Flags flags = TXTS_NONE)
        : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
          m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {
        LOGDEB("TextSplitABS: ctxwords " << ctxwords << endl);
    }

    // Accept a word and its position. If the word is a matched term,
    // add/update fragment definition.
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
        LOGDEB2("takeword: " << term << endl);

        // Recent past
        m_prevterms.push_back(pair<int,int>(bts,bte));
        if (m_prevterms.size() > m_ctxwords+1) {
            m_prevterms.pop_front();
        }

        string dumb;
        if (o_index_stripchars) {
            if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
                LOGINFO("abstract: unac failed for [" << term << "]\n");
                return true;
            }
        } else {
            dumb = term;
        }

        if (m_terms.find(dumb) != m_terms.end()) {
            // This word is a search term. Extend or create fragment
            LOGDEB2("match: [" << dumb << "] current: " << m_curfrag.first <<
                    ", " << m_curfrag.second << " remain " <<
                    m_remainingWords << endl);
            double coef = m_wordcoefs[dumb];
            if (!m_remainingWords) {
                // No current fragment
                m_curhitpos = baseTextPosition + pos;
                m_curfrag.first = m_prevterms.front().first;
                m_curfrag.second = m_prevterms.back().second;
                m_curhlzones.push_back(pair<int,int>(bts, bte));
                m_curterm = term;
                m_curtermcoef = coef;
            } else {
                LOGDEB2("Extending current fragment: " << m_remainingWords <<
                        " -> " << m_ctxwords << endl);
                m_extcount++;
                if (m_prevwordhit) {
                    m_curhlzones.back().second = bte;
                } else {
                    m_curhlzones.push_back(pair<int,int>(bts, bte));
                }
                if (coef > m_curtermcoef) {
                    m_curterm = term;
                    m_curtermcoef = coef;
                }
            }
            m_prevwordhit = true;
            m_curfragcoef += coef;
            m_remainingWords = m_ctxwords + 1;
            if (m_extcount > 3) {
                // Limit expansion of contiguous fragments (this is to
                // avoid common terms in search causing long
                // heavyweight meaningless fragments. Also, limit length).
                m_remainingWords = 1;
                m_extcount = 0;
            }
        } else {
            m_prevwordhit = false;
        }


        if (m_remainingWords) {
            // Fragment currently open. Time to close ?
            m_remainingWords--;
            m_curfrag.second = bte;
            if (m_remainingWords == 0) {
                if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) {
                    // Don't push bad fragments if we have a lot already
                    m_fragments.push_back(MatchEntry(m_curfrag.first,
                                                     m_curfrag.second,
                                                     m_curfragcoef,
                                                     m_curhlzones,
                                                     m_curhitpos,
                                                     m_curterm));
                }
                m_totalcoef += m_curfragcoef;
                m_curfragcoef = 0.0;
                m_curtermcoef = 0.0;
            }
        }
        return true;
    }
    const vector<MatchEntry>& getFragments() {
        return m_fragments;
    }

private:
    // Past terms because we need to go back for context before a hit
    deque<pair<int,int>> m_prevterms;
    // Data about the fragment we are building
    pair<int,int> m_curfrag{0,0};
    double m_curfragcoef{0.0};
    unsigned int m_remainingWords{0};
    unsigned int m_extcount{0};
    vector<pair<int,int>> m_curhlzones;
    bool m_prevwordhit{false};
    // Current sum of fragment weights
    double m_totalcoef{0.0};
    // Position of 1st term match (for page number computations)
    unsigned int m_curhitpos{0};
    // "best" term
    string m_curterm;
    double m_curtermcoef{0.0};

    // Input
    set<string> m_terms;
    unordered_map<string, double>& m_wordcoefs;
    unsigned int m_ctxwords;

    // Result: begin and end byte positions of query terms/groups in text
    vector<MatchEntry> m_fragments;
};

int Query::Native::abstractFromText(
    Rcl::Db::Native *ndb,
    Xapian::docid docid,
    const vector<string>& matchTerms,
    const multimap<double, vector<string>> byQ,
    double totalweight,
    int ctxwords,
    unsigned int maxtotaloccs,
    vector<Snippet>& vabs,
    Chrono&
    )
{
    Xapian::Database& xrdb(ndb->xrdb);
    Xapian::Document xdoc;

    string reason;
    XAPTRY(xdoc = xrdb.get_document(docid), xrdb, reason);
    if (!reason.empty()) {
        LOGERR("abstractFromText: could not get doc: " << reason << endl);
        return ABSRES_ERROR;
    }

    string rawtext, data;
#ifdef RAWTEXT_IN_DATA
    XAPTRY(data = xdoc.get_data(), xrdb, reason);
    if (!reason.empty()) {
        LOGERR("abstractFromText: could not get data: " << reason << endl);
        return ABSRES_ERROR;
    }
    Doc doc;
    if (ndb->dbDataToRclDoc(docid, data, doc)) {
        rawtext = doc.meta["RAWTEXT"];
    }
#endif
#ifdef RAWTEXT_IN_VALUE
    XAPTRY(rawtext = xdoc.get_value(VALUE_RAWTEXT), xrdb, reason);
    if (!reason.empty()) {
        LOGERR("abstractFromText: could not get value: " << reason << endl);
        return ABSRES_ERROR;
    }
    ZLibUtBuf cbuf;
    inflateToBuf(rawtext.c_str(), rawtext.size(), cbuf);
    rawtext.assign(cbuf.getBuf(), cbuf.getCnt());
#endif

    if (rawtext.empty()) {
        LOGDEB0("abstractFromText: no text\n");
        return ABSRES_ERROR;
    }

    // tryout the xapian internal method.
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
    (defined(RAWTEXT_IN_DATA) || defined(RAWTEXT_IN_VALUE))
    string snippet = xmset.snippet(rawtext);
    LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
#endif

    // We need the q coefs for individual terms
    unordered_map<string, double> wordcoefs;
    for (const auto& mment : byQ) {
        for (const auto& word : mment.second) {
            wordcoefs[word] = mment.first;
        }
    }
    TextSplitABS splitter(matchTerms, wordcoefs, ctxwords,
                          TextSplit::TXTS_ONLYSPANS);
    splitter.text_to_words(rawtext);
    const vector<TextSplitABS::MatchEntry>& res1 = splitter.getFragments();
    vector<TextSplitABS::MatchEntry> result(res1.begin(), res1.end());
    std::sort(result.begin(), result.end(),
              [](const TextSplitABS::MatchEntry& a,
                 const TextSplitABS::MatchEntry& b) -> bool {
                  return a.coef > b.coef;
              });

    static const string cstr_nc("\n\r\x0c\\");
    vector<int> vpbreaks;
    ndb->getPagePositions(docid, vpbreaks);
    unsigned int count = 0;
    for (const auto& entry : result) {
        string frag = neutchars(
            rawtext.substr(entry.start, entry.stop - entry.start), cstr_nc);
#if 0
        static const string starthit("<span style='color: blue;'>");
        static const string endhit("</span>");
        size_t inslen = 0;
        for (const auto& hlzone: entry.hlzones) {
            frag.replace(hlzone.first - entry.start + inslen, 0, starthit);
            inslen += starthit.size();
            frag.replace(hlzone.second - entry.start + inslen, 0, endhit);
            inslen += endhit.size();
        }
#endif
        LOGDEB("=== FRAGMENT: Coef: " << entry.coef << ": " << frag << endl);
        int page = ndb->getPageNumberForPosition(vpbreaks, entry.hitpos);
        vabs.push_back(Snippet(page, frag).setTerm(entry.term));
        if (count++ >= maxtotaloccs)
            break;
    }
    return ABSRES_OK;
}

}
@@ -1,4 +1,4 @@
/* Copyright (C) 2004 J.F.Dockes
/* Copyright (C) 2004-2017 J.F.Dockes
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
@@ -19,6 +19,9 @@
#include <math.h>

#include <map>
#include <unordered_map>
#include <deque>
#include <algorithm>

#include "log.h"
#include "rcldb.h"
@@ -33,30 +36,22 @@

using namespace std;


namespace Rcl {

// This is used as a marker inside the abstract frag lists, but
// normally doesn't remain in final output (which is built with a
// custom sep. by our caller).
static const string cstr_ellipsis("...");
static const string emptys;
// This is used to mark positions overlapped by a multi-word match term
static const string occupiedmarker("?");

#undef DEBUGABSTRACT
#define DEBUGABSTRACT
#ifdef DEBUGABSTRACT
#define LOGABS LOGDEB
static void listList(const string& what, const vector<string>&l)
{
    string a;
    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
        a = a + *it + " ";
    }
    LOGDEB("" << what << ": " << a << "\n");
}
#else
#define LOGABS LOGDEB2
static void listList(const string&, const vector<string>&)
{
}
#endif

// Unprefix terms. Actually it's not completely clear if we should
@@ -66,13 +61,12 @@ static void listList(const string&, const vector<string>&)
static const bool prune_prefixed_terms = true;
static void noPrefixList(const vector<string>& in, vector<string>& out)
{
    for (vector<string>::const_iterator qit = in.begin();
         qit != in.end(); qit++) {
    for (const auto& term : in) {
        if (prune_prefixed_terms) {
            if (has_prefix(*qit))
            if (has_prefix(term))
                continue;
        }
        out.push_back(strip_prefix(*qit));
        out.push_back(strip_prefix(term));
    }
    sort(out.begin(), out.end());
    vector<string>::iterator it = unique(out.begin(), out.end());
@@ -117,18 +111,17 @@ void Query::Native::setDbWideQTermsFreqs()
        m_q->getQueryTerms(iqterms);
        noPrefixList(iqterms, qterms);
    }
    // listList("Query terms: ", qterms);
    LOGDEB("Query terms: " << stringsToString(qterms) << endl);
    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;

    double doccnt = xrdb.get_doccount();
    if (doccnt == 0)
        doccnt = 1;

    for (vector<string>::const_iterator qit = qterms.begin();
         qit != qterms.end(); qit++) {
        termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
        LOGABS("setDbWideQTermFreqs: [" << *qit << "] db freq " <<
               termfreqs[*qit] << "\n");
    for (const auto& term : qterms) {
        termfreqs[term] = xrdb.get_termfreq(term) / doccnt;
        LOGABS("setDbWideQTermFreqs: [" << term << "] db freq " <<
               termfreqs[term] << "\n");
    }
}

@@ -162,36 +155,29 @@ double Query::Native::qualityTerms(Xapian::docid docid,
        m_q->m_sd->getTerms(hld);
    }

#ifdef DEBUGABSTRACT
    {
        string deb;
        hld.toString(deb);
        LOGABS("qualityTerms: hld: " << deb << "\n");
    }
#endif

    // Group the input terms by the user term they were possibly expanded from
    // Group the input terms by the user term they were possibly
    // expanded from (by stemming)
    map<string, vector<string> > byRoot;
    for (vector<string>::const_iterator qit = terms.begin();
         qit != terms.end(); qit++) {
        map<string, string>::const_iterator eit = hld.terms.find(*qit);
    for (const auto& term: terms) {
        map<string, string>::const_iterator eit = hld.terms.find(term);
        if (eit != hld.terms.end()) {
            byRoot[eit->second].push_back(*qit);
            byRoot[eit->second].push_back(term);
        } else {
            LOGDEB0("qualityTerms: [" << *qit << "] not found in hld\n");
            byRoot[*qit].push_back(*qit);
            LOGDEB0("qualityTerms: [" << term << "] not found in hld\n");
            byRoot[term].push_back(term);
        }
    }

#ifdef DEBUGABSTRACT
    {
        string deb;
        hld.toString(deb);
        LOGABS("qualityTerms: hld: " << deb << "\n");
        string byRootstr;
        for (map<string, vector<string> >::const_iterator debit =
                 byRoot.begin(); debit != byRoot.end(); debit++) {
            byRootstr.append("[").append(debit->first).append("]->");
            for (vector<string>::const_iterator it = debit->second.begin();
                 it != debit->second.end(); it++) {
                byRootstr.append("[").append(*it).append("] ");
        for (const auto& entry : byRoot) {
            byRootstr.append("[").append(entry.first).append("]->");
            for (const auto& term : entry.second) {
                byRootstr.append("[").append(term).append("] ");
            }
            byRootstr.append("\n");
        }
@@ -202,28 +188,25 @@ double Query::Native::qualityTerms(Xapian::docid docid,
    // Compute in-document and global frequencies for the groups.
    map<string, double> grpwdfs;
    map<string, double> grptfreqs;
    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
         git != byRoot.end(); git++) {
        for (vector<string>::const_iterator qit = git->second.begin();
             qit != git->second.end(); qit++) {
            Xapian::TermIterator term = xrdb.termlist_begin(docid);
            term.skip_to(*qit);
            if (term != xrdb.termlist_end(docid) && *term == *qit) {
                if (grpwdfs.find(git->first) != grpwdfs.end()) {
                    grpwdfs[git->first] = term.get_wdf() / doclen;
                    grptfreqs[git->first] = termfreqs[*qit];
    for (const auto& group : byRoot) {
        for (const auto& term : group.second) {
            Xapian::TermIterator xtermit = xrdb.termlist_begin(docid);
            xtermit.skip_to(term);
            if (xtermit != xrdb.termlist_end(docid) && *xtermit == term) {
                if (grpwdfs.find(group.first) != grpwdfs.end()) {
                    grpwdfs[group.first] = xtermit.get_wdf() / doclen;
                    grptfreqs[group.first] = termfreqs[term];
                } else {
                    grpwdfs[git->first] += term.get_wdf() / doclen;
                    grptfreqs[git->first] += termfreqs[*qit];
                    grpwdfs[group.first] += xtermit.get_wdf() / doclen;
                    grptfreqs[group.first] += termfreqs[term];
                }
            }
        }
    }

    // Build a sorted by quality container for the groups
    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
         git != byRoot.end(); git++) {
        double q = (grpwdfs[git->first]) * grptfreqs[git->first];
    for (const auto& group : byRoot) {
        double q = (grpwdfs[group.first]) * grptfreqs[group.first];
        q = -log10(q);
        if (q < 3) {
            q = 0.05;
@@ -237,22 +220,19 @@ double Query::Native::qualityTerms(Xapian::docid docid,
            q = 1;
        }
        totalweight += q;
        byQ.insert(pair<double, vector<string> >(q, git->second));
        byQ.insert(pair<double, vector<string> >(q, group.second));
    }

#ifdef DEBUGABSTRACT
    for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();
         mit != byQ.rend(); mit++) {
        LOGABS("qualityTerms: group\n");
        for (vector<string>::const_iterator qit = mit->second.begin();
             qit != mit->second.end(); qit++) {
            LOGABS("" << mit->first << "->[" << *qit << "]\n");
        }
    for (auto mit= byQ.rbegin(); mit != byQ.rend(); mit++) {
        LOGABS("qualityTerms: coef: " << mit->first << " group: " <<
               stringsToString(mit->second) << endl);
    }
#endif
    return totalweight;
}


// Return page number for first match of "significant" term.
int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
{
@@ -283,8 +263,7 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
    multimap<double, vector<string> > byQ;
    qualityTerms(docid, terms, byQ);

    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
         mit != byQ.rend(); mit++) {
    for (auto mit = byQ.rbegin(); mit != byQ.rend(); mit++) {
        for (vector<string>::const_iterator qit = mit->second.begin();
             qit != mit->second.end(); qit++) {
            string qterm = *qit;
@@ -307,55 +286,211 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
    return -1;
}

// Build a document abstract by extracting text chunks around the query terms
// This uses the db termlists, not the original document.
//
// DatabaseModified and other general exceptions are caught and
// possibly retried by our caller
int Query::Native::makeAbstract(Xapian::docid docid,
                                vector<Snippet>& vabs,
                                int imaxoccs, int ictxwords)
// Creating the abstract from index position data: populate the sparse
// array with the positions for a given query term, and mark the
// neighboring positions.
void Query::Native::abstractPopulateQTerm(
    Xapian::Database& xrdb,
    Xapian::docid docid,
    const string& qterm,
    int qtrmwrdcnt,
    int ctxwords,
    unsigned int maxgrpoccs,
    unsigned int maxtotaloccs,
    map<unsigned int, string>& sparseDoc,
    unordered_set<unsigned int>& searchTermPositions,
    unsigned int& maxpos,
    unsigned int& totaloccs,
    unsigned int& grpoccs,
    int& ret
    )
{
    Chrono chron;
    LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
           imaxoccs << " ictxwords " << ictxwords << "\n");
    Xapian::PositionIterator pos;

    // The (unprefixed) terms matched by this document
    vector<string> matchedTerms;
    getMatchTerms(docid, matchedTerms);
    if (matchedTerms.empty()) {
        LOGDEB("makeAbstract:"<<chron.millis()<<"mS:Empty term list\n");
        return ABSRES_ERROR;
    // Walk the position list for this term.
    for (pos = xrdb.positionlist_begin(docid, qterm);
         pos != xrdb.positionlist_end(docid, qterm); pos++) {
        int ipos = *pos;
        if (ipos < int(baseTextPosition)) // Not in text body
            continue;
        LOGABS("makeAbstract: [" << qterm << "] at pos " <<
               ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
               maxgrpoccs << "\n");

        totaloccs++;
        grpoccs++;

        // Add adjacent slots to the set to populate at next
        // step by inserting empty strings. Special provisions
        // for adding ellipsis and for positions overlapped by
        // the match term.
        unsigned int sta = MAX(int(baseTextPosition),
                               ipos - ctxwords);
        unsigned int sto = ipos + qtrmwrdcnt-1 +
            m_q->m_db->getAbsCtxLen();
        for (unsigned int ii = sta; ii <= sto; ii++) {
            if (ii == (unsigned int)ipos) {
                sparseDoc[ii] = qterm;
                searchTermPositions.insert(ii);
                if (ii > maxpos)
                    maxpos = ii;
            } else if (ii > (unsigned int)ipos &&
                       ii < (unsigned int)ipos + qtrmwrdcnt) {
                // Position for another word of the multi-word term
                sparseDoc[ii] = occupiedmarker;
            } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
                // For an empty slot, the test above has a side
                // effect of inserting an empty string which
                // is what we want. Do it also if it was an ellipsis
                sparseDoc[ii] = emptys;
            }
        }
        // Add ellipsis at the end. This may be replaced later by
        // an overlapping extract. Take care not to replace an
        // empty string here, we really want an empty slot,
        // use find()
        if (sparseDoc.find(sto+1) == sparseDoc.end()) {
            sparseDoc[sto+1] = cstr_ellipsis;
        }

        // Group done ?
        if (grpoccs >= maxgrpoccs) {
            ret |= ABSRES_TRUNC;
            LOGABS("Db::makeAbstract: max group occs cutoff\n");
            break;
        }
        // Global done ?
        if (totaloccs >= maxtotaloccs) {
            ret |= ABSRES_TRUNC;
            LOGABS("Db::makeAbstract: max occurrences cutoff\n");
            break;
        }
    }
}

    listList("Match terms: ", matchedTerms);
// Creating the abstract from index position data: after the query
// terms have been inserted at their place in the sparse array, and
// the neighboring positions marked, populate the neighbours: for each
// term in the document, walk its position list and populate slots
// around the query terms. We arbitrarily truncate the list to avoid
// taking forever. If we do cutoff, the abstract may be inconsistent
// (missing words, potentially altering meaning), which is bad.
void Query::Native::abstractPopulateContextTerms(
    Xapian::Database& xrdb,
    Xapian::docid docid,
    unsigned int maxpos,
    map<unsigned int, string>& sparseDoc,
    int& ret
    )
{
    Xapian::TermIterator term;
    int cutoff = m_q->m_snipMaxPosWalk;
    for (term = xrdb.termlist_begin(docid);
         term != xrdb.termlist_end(docid); term++) {
        // Ignore prefixed terms
        if (has_prefix(*term))
            continue;
        if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
            ret |= ABSRES_TERMMISS;
            LOGDEB0("makeAbstract: max term count cutoff " <<
                    m_q->m_snipMaxPosWalk << "\n");
            break;
        }

    // Retrieve the term frequencies for the query terms. This is
    // actually computed only once for a query, and for all terms in
    // the query (not only the matches for this doc)
    setDbWideQTermsFreqs();

    // Build a sorted by quality container for the match terms. We are
    // going to try and show text around the less common search terms.
    // Terms issued from an original one by stem expansion are
    // aggregated by the qualityTerms() routine.
    multimap<double, vector<string> > byQ;
    double totalweight = qualityTerms(docid, matchedTerms, byQ);
    LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
    // This can't happen, but would crash us
    if (totalweight == 0.0) {
        LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
        return ABSRES_ERROR;
        map<unsigned int, string>::iterator vit;
        Xapian::PositionIterator pos;
        for (pos = xrdb.positionlist_begin(docid, *term);
             pos != xrdb.positionlist_end(docid, *term); pos++) {
            if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
                ret |= ABSRES_TERMMISS;
                LOGDEB0("makeAbstract: max term count cutoff " <<
                        m_q->m_snipMaxPosWalk << "\n");
                break;
            }
            // If we are beyond the max possible position, stop
            // for this term
            if (*pos > maxpos) {
                break;
            }
            if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
                // Don't replace a term: the terms list is in
                // alphabetic order, and we may have several terms
                // at the same position, we want to keep only the
                // first one (ie: dockes and dockes@wanadoo.fr)
                if (vit->second.empty()) {
                    LOGDEB2("makeAbstract: populating: [" << *term <<
                            "] at " << *pos << "\n");
                    sparseDoc[*pos] = *term;
                }
            }
        }
    }
}

    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
// Creating the abstract from position data: final phase: extract the
// snippets from the sparse array.
void Query::Native::abstractCreateSnippetsVector(
    Rcl::Db::Native *ndb,
    map<unsigned int, string>& sparseDoc,
    unordered_set<unsigned int>& searchTermPositions,
    vector<int>& vpbreaks,
    vector<Snippet>& vabs)
{
    vabs.clear();
    string chunk;
    bool incjk = false;
    int page = 0;
    string term;

    for (const auto& ent : sparseDoc) {
        LOGDEB2("Abstract:output "<< ent.first <<" -> [" <<ent.second <<"]\n");
        if (!occupiedmarker.compare(ent.second)) {
            LOGDEB("Abstract: qtrm position not filled ??\n");
            continue;
        }
        if (chunk.empty() && !vpbreaks.empty()) {
            page = ndb->getPageNumberForPosition(vpbreaks, ent.first);
            if (page < 0)
                page = 0;
            term.clear();
        }
        Utf8Iter uit(ent.second);
        bool newcjk = false;
        if (TextSplit::isCJK(*uit))
            newcjk = true;
        if (!incjk || (incjk && !newcjk))
            chunk += " ";
        incjk = newcjk;
        if (searchTermPositions.find(ent.first) != searchTermPositions.end())
            term = ent.second;
        if (ent.second == cstr_ellipsis) {
            vabs.push_back(Snippet(page, chunk).setTerm(term));
            chunk.clear();
        } else {
            if (ent.second.compare(end_of_field_term) &&
                ent.second.compare(start_of_field_term))
                chunk += ent.second;
        }
    }
    if (!chunk.empty())
        vabs.push_back(Snippet(page, chunk).setTerm(term));
}

// Creating the abstract from index position data: top level routine
int Query::Native::abstractFromIndex(
    Rcl::Db::Native *ndb,
    Xapian::docid docid,
    const vector<string>& matchTerms,
    const multimap<double, vector<string>> byQ,
    double totalweight,
    int ctxwords,
    unsigned int maxtotaloccs,
    vector<Snippet>& vabs,
    Chrono& chron
    )
{
    Xapian::Database& xrdb(ndb->xrdb);

    ///////////////////
    // For each of the query terms, ask xapian for its positions list
    // in the document. For each position entry, insert it and its
    // neighbours in the set of 'interesting' positions

    int ret = ABSRES_OK;
    // The terms 'array' that we partially populate with the document
    // terms, at their positions around the search terms positions:
    map<unsigned int, string> sparseDoc;
@@ -370,22 +505,12 @@ int Query::Native::makeAbstract(Xapian::docid docid,
    // Total number of occurrences for all terms. We stop when we have too many
    unsigned int totaloccs = 0;

    // Total number of slots we populate. The 7 is taken as
    // average word size. It was a mistake to have the user max
    // abstract size parameter in characters, we basically only deal
    // with words. We used to limit the character size at the end, but
    // this damaged our careful selection of terms
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
        m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
    LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
           maxtotaloccs << " ctxwords " << ctxwords << "\n");

    int ret = ABSRES_OK;

    // Let's go populate
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
         mit != byQ.rend(); mit++) {
    // First pass to populate the sparse document: we walk the term
    // groups, beginning with the better ones, and insert each term at
    // its position. We also insert empty strings at the surrounding
    // positions. These are markers showing where we should insert
    // data during the next pass.
    for (auto mit = byQ.rbegin(); mit != byQ.rend(); mit++) {
        unsigned int maxgrpoccs;
        double q;
        if (byQ.size() == 1) {
@@ -398,87 +523,30 @@ int Query::Native::makeAbstract(Xapian::docid docid,
        }
        unsigned int grpoccs = 0;

        for (vector<string>::const_iterator qit = mit->second.begin();
             qit != mit->second.end(); qit++) {

            // Group done ?
        // For each term in user term expansion group
        for (const auto& qterm : mit->second) {
            // Enough for this group ?
            if (grpoccs >= maxgrpoccs)
                break;

            string qterm = *qit;

            LOGABS("makeAbstract: [" << qterm << "] " << maxgrpoccs <<
                   " max grp occs (coef " << q << ")\n");

            // The match term may span several words
            // The match term may span several words (more than one position)
            int qtrmwrdcnt =
                TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);

            Xapian::PositionIterator pos;
            // Populate positions for this query term.
            // There may be query terms not in this doc. This raises an
            // exception when requesting the position list, we catch it ??
            // Not clear how this can happen because we are walking the
            // match list returned by Xapian. Maybe something with the
            // fields?
            string emptys;
            try {
                for (pos = xrdb.positionlist_begin(docid, qterm);
                     pos != xrdb.positionlist_end(docid, qterm); pos++) {
                    int ipos = *pos;
                    if (ipos < int(baseTextPosition)) // Not in text body
                        continue;
                    LOGABS("makeAbstract: [" << qterm << "] at pos " <<
                           ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
                           maxgrpoccs << "\n");

                    totaloccs++;
                    grpoccs++;

                    // Add adjacent slots to the set to populate at next
                    // step by inserting empty strings. Special provisions
                    // for adding ellipsis and for positions overlapped by
                    // the match term.
                    unsigned int sta = MAX(int(baseTextPosition),
                                           ipos - ctxwords);
                    unsigned int sto = ipos + qtrmwrdcnt-1 +
                        m_q->m_db->getAbsCtxLen();
                    for (unsigned int ii = sta; ii <= sto; ii++) {
                        if (ii == (unsigned int)ipos) {
                            sparseDoc[ii] = qterm;
                            searchTermPositions.insert(ii);
                            if (ii > maxpos)
                                maxpos = ii;
                        } else if (ii > (unsigned int)ipos &&
                                   ii < (unsigned int)ipos + qtrmwrdcnt) {
                            sparseDoc[ii] = occupiedmarker;
                        } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
                            // For an empty slot, the test has a side
                            // effect of inserting an empty string which
                            // is what we want.
                            sparseDoc[ii] = emptys;
                        }
                    }
                    // Add ellipsis at the end. This may be replaced later by
                    // an overlapping extract. Take care not to replace an
                    // empty string here, we really want an empty slot,
                    // use find()
                    if (sparseDoc.find(sto+1) == sparseDoc.end()) {
                        sparseDoc[sto+1] = cstr_ellipsis;
                    }

                    // Group done ?
                    if (grpoccs >= maxgrpoccs) {
                        ret |= ABSRES_TRUNC;
                        LOGABS("Db::makeAbstract: max group occs cutoff\n");
                        break;
                    }
                    // Global done ?
                    if (totaloccs >= maxtotaloccs) {
                        ret |= ABSRES_TRUNC;
                        LOGABS("Db::makeAbstract: max occurrences cutoff\n");
                        break;
                    }
                }
                abstractPopulateQTerm(xrdb, docid, qterm, qtrmwrdcnt, ctxwords,
                                      maxgrpoccs, maxtotaloccs, sparseDoc,
                                      searchTermPositions, maxpos, totaloccs,
                                      grpoccs, ret);
            } catch (...) {
                // Term does not occur. No problem.
            }
@@ -494,6 +562,7 @@ int Query::Native::makeAbstract(Xapian::docid docid,

    LOGABS("makeAbstract:" << chron.millis() <<
           "mS:chosen number of positions " << totaloccs << "\n");

    // This can happen if there are term occurrences in the keywords
    // etc. but not elsewhere ?
    if (totaloccs == 0) {
@@ -501,124 +570,95 @@ int Query::Native::makeAbstract(Xapian::docid docid,
        return ABSRES_OK;
    }

    // Walk all document's terms position lists and populate slots
    // around the query terms. We arbitrarily truncate the list to
    // avoid taking forever. If we do cutoff, the abstract may be
    // inconsistent (missing words, potentially altering meaning),
    // which is bad.
    {
        Xapian::TermIterator term;
        int cutoff = m_q->m_snipMaxPosWalk;
        for (term = xrdb.termlist_begin(docid);
             term != xrdb.termlist_end(docid); term++) {
            // Ignore prefixed terms
            if (has_prefix(*term))
                continue;
            if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
                ret |= ABSRES_TERMMISS;
                LOGDEB0("makeAbstract: max term count cutoff " <<
                        m_q->m_snipMaxPosWalk << "\n");
                break;
            }

            map<unsigned int, string>::iterator vit;
            Xapian::PositionIterator pos;
            for (pos = xrdb.positionlist_begin(docid, *term);
                 pos != xrdb.positionlist_end(docid, *term); pos++) {
                if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
                    ret |= ABSRES_TERMMISS;
                    LOGDEB0("makeAbstract: max term count cutoff " <<
                            m_q->m_snipMaxPosWalk << "\n");
                    break;
                }
                // If we are beyond the max possible position, stop
                // for this term
                if (*pos > maxpos) {
                    break;
                }
                if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
                    // Don't replace a term: the terms list is in
                    // alphabetic order, and we may have several terms
                    // at the same position, we want to keep only the
                    // first one (ie: dockes and dockes@wanadoo.fr)
                    if (vit->second.empty()) {
                        LOGDEB2("makeAbstract: populating: [" << *term <<
                                "] at " << *pos << "\n");
                        sparseDoc[*pos] = *term;
                    }
                }
            }
        }
    }
    abstractPopulateContextTerms(xrdb, docid, maxpos, sparseDoc, ret);

    LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n");

#if 0
    // Debug only: output the full term[position] vector
    bool epty = false;
    int ipos = 0;
    for (map<unsigned int, string>::iterator it = sparseDoc.begin();
         it != sparseDoc.end();
         it++, ipos++) {
        if (it->empty()) {
            if (!epty)
                LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
            epty=true;
        } else {
            epty = false;
            LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
        }
    }
#endif

    vector<int> vpbreaks;
    ndb->getPagePositions(docid, vpbreaks);

    LOGABS("makeAbstract:" << chron.millis() << "mS: extracting. Got " <<
           vpbreaks.size() << " pages\n");
    // Finally build the abstract by walking the map (in order of position)
    vabs.clear();
    string chunk;
    bool incjk = false;
    int page = 0;
    string term;
    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
         it != sparseDoc.end(); it++) {
        LOGDEB2("Abstract:output " << it->first << " -> [" << it->second <<
                "]\n");
        if (!occupiedmarker.compare(it->second)) {
            LOGDEB("Abstract: qtrm position not filled ??\n");
            continue;
        }
        if (chunk.empty() && !vpbreaks.empty()) {
            page = ndb->getPageNumberForPosition(vpbreaks, it->first);
            if (page < 0)
                page = 0;
            term.clear();
        }
        Utf8Iter uit(it->second);
        bool newcjk = false;
        if (TextSplit::isCJK(*uit))
            newcjk = true;
        if (!incjk || (incjk && !newcjk))
            chunk += " ";
        incjk = newcjk;
        if (searchTermPositions.find(it->first) != searchTermPositions.end())
            term = it->second;
        if (it->second == cstr_ellipsis) {
            vabs.push_back(Snippet(page, chunk).setTerm(term));
            chunk.clear();
        } else {
            if (it->second.compare(end_of_field_term) &&
                it->second.compare(start_of_field_term))
                chunk += it->second;
        }
    }
    if (!chunk.empty())
        vabs.push_back(Snippet(page, chunk).setTerm(term));

    // Finally build the abstract by walking the map (in order of position)
    abstractCreateSnippetsVector(ndb, sparseDoc, searchTermPositions,
                                 vpbreaks, vabs);

    LOGABS("makeAbstract: done in " << chron.millis() << " mS\n");
    return ret;
}


// Build a document abstract by extracting text chunks around the
// query terms. This can either use the index position lists, or the
// stored document text, with very different implementations.
//
// DatabaseModified and other general exceptions are caught and
// possibly retried by our caller.
//
// @param[out] vabs the abstract is returned as a vector of snippets.
int Query::Native::makeAbstract(Xapian::docid docid,
                                vector<Snippet>& vabs,
                                int imaxoccs, int ictxwords)
{
    Chrono chron;
    LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
           imaxoccs << " ictxwords " << ictxwords << "\n");

    // The (unprefixed) terms matched by this document
    vector<string> matchedTerms;
    getMatchTerms(docid, matchedTerms);
    if (matchedTerms.empty()) {
        LOGDEB("makeAbstract:" << chron.millis() << "mS:Empty term list\n");
        return ABSRES_ERROR;
    }

    LOGDEB("Match terms: " << stringsToString(matchedTerms) << endl);

    // Retrieve the term frequencies for the query terms. This is
    // actually computed only once for a query, and for all terms in
    // the query (not only the matches for this doc)
    setDbWideQTermsFreqs();

    // Build a sorted by quality container for the match terms. We are
    // going to try and show text around the less common search terms.
    // Terms issued from an original one by stem expansion are
    // aggregated by the qualityTerms() routine (this is what we call
    // 'term groups' in the following: index terms expanded from the
    // same user term).
    multimap<double, vector<string>> byQ;
    double totalweight = qualityTerms(docid, matchedTerms, byQ);
    LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
    // This can't happen, but would crash us
    if (totalweight == 0.0) {
        LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
        return ABSRES_ERROR;
    }

    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
    Xapian::Database& xrdb(ndb->xrdb);

    // Total number of slots we populate. The 7 is taken as
    // average word size. It was a mistake to have the user max
    // abstract size parameter in characters, we basically only deal
    // with words. We used to limit the character size at the end, but
    // this damaged our careful selection of terms
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
        m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
    LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
           maxtotaloccs << " ctxwords " << ctxwords << "\n");

    if (o_index_storerawtext) {
        return abstractFromText(ndb, docid, matchedTerms, byQ,
                                totalweight, ctxwords, maxtotaloccs, vabs,
                                chron);
    } else {
        return abstractFromIndex(ndb, docid, matchedTerms, byQ,
                                 totalweight, ctxwords, maxtotaloccs, vabs,
                                 chron);
    }
}


}

@@ -61,6 +61,7 @@ using namespace std;
#ifdef RCL_USE_ASPELL
#include "rclaspell.h"
#endif
#include "zlibut.h"

// Recoll index format version is stored in user metadata. When this changes,
// we can't open the db and will have to reindex.
@@ -1458,8 +1459,19 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
#ifdef TEXTSPLIT_STATS
    splitter.resetStats();
#endif
    if (!splitter.text_to_words(doc.text))
    if (!splitter.text_to_words(doc.text)) {
        LOGDEB("Db::addOrUpdate: split failed for main text\n");
    } else {
#ifdef RAWTEXT_IN_VALUE
        if (o_index_storerawtext) {
            ZLibUtBuf buf;
            deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
            string tt;
            tt.assign(buf.getBuf(), buf.getCnt());
            newdocument.add_value(VALUE_RAWTEXT, tt);
        }
#endif
    }

#ifdef TEXTSPLIT_STATS
    // Reject bad data. unrecognized base64 text is characterized by
@@ -1670,6 +1682,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
        newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
    }

#ifdef RAWTEXT_IN_DATA
    if (o_index_storerawtext) {
        RECORD_APPEND(record, string("RAWTEXT"),
                      neutchars(doc.text, cstr_nc));
    }
#endif
    LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
    newdocument.set_data(record);
}

@@ -67,8 +67,14 @@ enum value_slot {
    VALUE_MD5 = 1, // 16 byte MD5 checksum of original document.
    VALUE_SIZE = 2, // sortable_serialise(<file size in bytes>)

    // Recoll only:
    VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size)
    ////////// Recoll only:
    // Doc sig as chosen by app (ex: mtime+size)
    VALUE_SIG = 10,
    // Doc extracted text, with punctuation: splitter input. Used for
    // generating snippets. This is only used if RAWTEXT_IN_VALUE is
    // defined (else the text goes to the data record), but reserve
    // the value in any case.
    VALUE_RAWTEXT= 11,
};

class SearchData;

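VALUE_RAWTEXT holds the zlib-deflated document text (see the deflateToBuf()/inflateToBuf() calls in the rcldb.cpp and rclabsfromtext.cpp hunks), so the store/fetch path is essentially a compression round-trip. A minimal sketch of that round-trip with plain zlib calls, assuming the caller keeps track of the uncompressed size (Recoll's ZLibUtBuf wrapper manages that detail internally, so this is not a drop-in equivalent):

#include <zlib.h>

#include <iostream>
#include <string>
#include <vector>

// Deflate text into comp; returns false on zlib error.
static bool deflateText(const std::string& text, std::vector<Bytef>& comp)
{
    uLongf clen = compressBound(text.size());
    comp.resize(clen);
    if (compress(comp.data(), &clen,
                 reinterpret_cast<const Bytef*>(text.data()),
                 text.size()) != Z_OK)
        return false;
    comp.resize(clen);
    return true;
}

// Inflate comp back, given the original (uncompressed) size.
static bool inflateText(const std::vector<Bytef>& comp, size_t origsize,
                        std::string& text)
{
    std::vector<Bytef> plain(origsize);
    uLongf dlen = origsize;
    if (uncompress(plain.data(), &dlen, comp.data(), comp.size()) != Z_OK)
        return false;
    text.assign(reinterpret_cast<char*>(plain.data()), dlen);
    return true;
}

int main()
{
    std::string doc(1000, 'x'), back;
    std::vector<Bytef> comp;
    if (deflateText(doc, comp) && inflateText(comp, doc.size(), back))
        std::cout << doc.size() << " -> " << comp.size()
                  << " bytes, roundtrip ok: " << (doc == back) << "\n";
    return 0;
}
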
@@ -177,5 +177,12 @@ class Db::Native {
    // (abstract, keywords, etc.. are stored before this)
    static const unsigned int baseTextPosition = 100000;

// Store raw doc text in data record or value slot ?
#if 0
#define RAWTEXT_IN_DATA 1
#elif 1
#define RAWTEXT_IN_VALUE 1
#endif

}
#endif /* _rcldb_p_h_included_ */

@@ -20,10 +20,13 @@
#include <map>
#include <vector>
#include <string>
#include <unordered_set>

#include <xapian.h>
#include "rclquery.h"

class Chrono;

namespace Rcl {

class Query::Native {
@@ -58,6 +61,56 @@ public:
    double qualityTerms(Xapian::docid docid,
                        const std::vector<std::string>& terms,
                        std::multimap<double, std::vector<std::string> >& byQ);
    void abstractPopulateQTerm(
        Xapian::Database& xrdb,
        Xapian::docid docid,
        const string& qterm,
        int qtrmwrdcnt,
        int ctxwords,
        unsigned int maxgrpoccs,
        unsigned int maxtotaloccs,
        std::map<unsigned int, std::string>& sparseDoc,
        std::unordered_set<unsigned int>& searchTermPositions,
        unsigned int& maxpos,
        unsigned int& totaloccs,
        unsigned int& grpoccs,
        int& ret
        );
    void abstractPopulateContextTerms(
        Xapian::Database& xrdb,
        Xapian::docid docid,
        unsigned int maxpos,
        std::map<unsigned int, std::string>& sparseDoc,
        int& ret
        );
    void abstractCreateSnippetsVector(
        Db::Native *ndb,
        std::map<unsigned int, std::string>& sparseDoc,
        std::unordered_set<unsigned int>& searchTermPositions,
        std::vector<int>& vpbreaks,
        std::vector<Snippet>& vabs);
    int abstractFromIndex(
        Rcl::Db::Native *ndb,
        Xapian::docid docid,
        const std::vector<std::string>& matchTerms,
        const std::multimap<double, std::vector<std::string>> byQ,
        double totalweight,
        int ctxwords,
        unsigned int maxtotaloccs,
        std::vector<Snippet>& vabs,
        Chrono& chron
        );
    int abstractFromText(
        Rcl::Db::Native *ndb,
        Xapian::docid docid,
        const std::vector<std::string>& matchTerms,
        const std::multimap<double, std::vector<std::string>> byQ,
        double totalweight,
        int ctxwords,
        unsigned int maxtotaloccs,
        vector<Snippet>& vabs,
        Chrono& chron
        );
};

}

@@ -214,7 +214,7 @@ membermaxkbs = 50000


# <grouptitle id="TERMS">Parameters affecting how we generate
# terms</grouptitle>
# terms and organize the index</grouptitle>

# Changing some of these parameters will imply a full
# reindex. Also, when using multiple indexes, it may not make sense
@@ -231,6 +231,21 @@ membermaxkbs = 50000
# implies an index reset.</descr></var>
indexStripChars = 1

# <var name="indexStoreRawText" type="bool"><brief>Decide if we store the
# documents' text content in the index.</brief><descr>Storing the text
# allows extracting snippets from it at query time,
# instead of building them from index position data. This has become
# necessary for versions of Xapian 1.6, which have dropped support
# for the chert index format, and adopted a setup which renders our
# use of positions list unacceptably slow in cases. 'raw' text here
# means that the text is not stripped of upper-case, diacritics, or
# punctuation signs. It is still translated from its original format
# to UTF-8 plain text. This typically increases the index size by 10-20%,
# but also allows for nicer snippets, so it may be worth enabling even
# when not strictly needed for performance, if you can afford the space.
# </descr></var>
indexStoreRawText = 0

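To try the stored-text snippet path on an existing installation, the parameter would be flipped in the personal configuration and the index rebuilt from scratch, since already-indexed documents carry no stored text (a sketch, assuming a default ~/.recoll setup):

# in ~/.recoll/recoll.conf
indexStoreRawText = 1

followed by running recollindex -z to reset and rebuild the index.
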
# <var name="nonumbers" type="bool"><brief>Decides if terms will be
# generated for numbers.</brief><descr>For example "123", "1.5e6",
# 192.168.1.4, would not be indexed if nonumbers is set ("value123" would