improved detection of incomplete snippet lists

This commit is contained in:
Jean-Francois Dockes 2013-02-26 15:49:36 +01:00
parent 87120798c2
commit 3abfd00635
6 changed files with 47 additions and 25 deletions

View File

@ -136,19 +136,26 @@ void SnippetsW::init()
"<html><head>" "<html><head>"
"<meta http-equiv=\"content-type\" " "<meta http-equiv=\"content-type\" "
"content=\"text/html; charset=utf-8\"></head>" "content=\"text/html; charset=utf-8\"></head>"
"<body style='overflow-x: scroll; white-space: nowrap'>" "<body>"
"<table>" "<table style='overflow-x: scroll; white-space: nowrap'>"
; ;
g_hiliter.set_inputhtml(false); g_hiliter.set_inputhtml(false);
bool nomatch = true;
for (vector<Rcl::Snippet>::const_iterator it = vpabs.begin(); for (vector<Rcl::Snippet>::const_iterator it = vpabs.begin();
it != vpabs.end(); it++) { it != vpabs.end(); it++) {
if (it->page == -1) {
oss << "<tr><td colspan=\"2\">" <<
it->snippet << "</td></tr>" << endl;
continue;
}
list<string> lr; list<string> lr;
if (!g_hiliter.plaintorich(it->snippet, lr, hdata)) { if (!g_hiliter.plaintorich(it->snippet, lr, hdata)) {
LOGDEB1(("No match for [%s]\n", it->snippet.c_str())); LOGDEB1(("No match for [%s]\n", it->snippet.c_str()));
continue; continue;
} }
nomatch = false;
oss << "<tr><td>"; oss << "<tr><td>";
if (it->page > 0) { if (it->page > 0) {
oss << "<a href=\"P" << it->page << "T" << it->term << "\">" oss << "<a href=\"P" << it->page << "T" << it->term << "\">"
@ -156,6 +163,13 @@ void SnippetsW::init()
} }
oss << "</td><td>" << lr.front().c_str() << "</td></tr>" << endl; oss << "</td><td>" << lr.front().c_str() << "</td></tr>" << endl;
} }
oss << "</table>" << endl;
if (nomatch) {
oss.str("<html><head></head><body>");
oss << "<p>Sorry, no exact match was found within limits. "
"Probably the document is very big "
"and the snippets generator got lost in a maze...</p>" << endl;
}
oss << "</body></html>"; oss << "</body></html>";
#ifdef SNIPPETS_WEBKIT #ifdef SNIPPETS_WEBKIT
browser->setHtml(QString::fromUtf8(oss.str().c_str())); browser->setHtml(QString::fromUtf8(oss.str().c_str()));

View File

@ -69,6 +69,7 @@ int DocSequenceDb::getResCnt()
} }
return m_rescnt; return m_rescnt;
} }
static const string cstr_mre("[...]"); static const string cstr_mre("[...]");
// This one only gets called to fill-up the snippets window // This one only gets called to fill-up the snippets window
@ -81,18 +82,22 @@ bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<Rcl::Snippet>& vpabs)
// Have to put the limit somewhere. // Have to put the limit somewhere.
int maxoccs = 1000; int maxoccs = 1000;
Rcl::abstract_result ret = Rcl::ABSRES_ERROR; int ret = Rcl::ABSRES_ERROR;
if (m_q->whatDb()) { if (m_q->whatDb()) {
ret = m_q->makeDocAbstract(doc, vpabs, maxoccs, ret = m_q->makeDocAbstract(doc, vpabs, maxoccs,
m_q->whatDb()->getAbsCtxLen()+ 2); m_q->whatDb()->getAbsCtxLen()+ 2);
} }
if (vpabs.empty()) LOGDEB(("DocSequenceDb::getAbstract: got ret %d vpabs len %u\n", ret,
vpabs.push_back(Rcl::Snippet(0, doc.meta[Rcl::Doc::keyabs])); vpabs.size()));
if (vpabs.empty()) {
return true;
}
// If the list was probably truncated, indicate it. // If the list was probably truncated, indicate it.
if (ret == Rcl::ABSRES_TRUNC) { if (ret | Rcl::ABSRES_TRUNC) {
vpabs.push_back(Rcl::Snippet(-1, cstr_mre)); vpabs.push_back(Rcl::Snippet(-1, cstr_mre));
} else if (ret == Rcl::ABSRES_TERMMISS) { }
if (ret | Rcl::ABSRES_TERMMISS) {
vpabs.insert(vpabs.begin(), vpabs.insert(vpabs.begin(),
Rcl::Snippet(-1, "(Words missing in snippets)")); Rcl::Snippet(-1, "(Words missing in snippets)"));
} }

View File

@ -309,9 +309,9 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
// //
// DatabaseModified and other general exceptions are catched and // DatabaseModified and other general exceptions are catched and
// possibly retried by our caller // possibly retried by our caller
abstract_result Query::Native::makeAbstract(Xapian::docid docid, int Query::Native::makeAbstract(Xapian::docid docid,
vector<Snippet>& vabs, vector<Snippet>& vabs,
int imaxoccs, int ictxwords) int imaxoccs, int ictxwords)
{ {
Chrono chron; Chrono chron;
LOGABS(("makeAbstract: docid %ld imaxoccs %d ictxwords %d\n", LOGABS(("makeAbstract: docid %ld imaxoccs %d ictxwords %d\n",
@ -381,7 +381,7 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid,
LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n", LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n",
chron.ms(), maxtotaloccs, ctxwords)); chron.ms(), maxtotaloccs, ctxwords));
abstract_result ret = ABSRES_OK; int ret = ABSRES_OK;
// Let's go populate // Let's go populate
for (map<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); for (map<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
@ -466,11 +466,14 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid,
} }
// Group done ? // Group done ?
if (grpoccs >= maxgrpoccs) if (grpoccs >= maxgrpoccs) {
ret |= ABSRES_TRUNC;
LOGABS(("Db::makeAbstract: max group occs cutoff\n"));
break; break;
}
// Global done ? // Global done ?
if (totaloccs >= maxtotaloccs) { if (totaloccs >= maxtotaloccs) {
ret = ABSRES_TRUNC; ret |= ABSRES_TRUNC;
LOGABS(("Db::makeAbstract: max occurrences cutoff\n")); LOGABS(("Db::makeAbstract: max occurrences cutoff\n"));
break; break;
} }
@ -480,7 +483,7 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid,
} }
if (totaloccs >= maxtotaloccs) { if (totaloccs >= maxtotaloccs) {
ret = ABSRES_TRUNC; ret |= ABSRES_TRUNC;
LOGABS(("Db::makeAbstract: max1 occurrences cutoff\n")); LOGABS(("Db::makeAbstract: max1 occurrences cutoff\n"));
break; break;
} }
@ -511,7 +514,7 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid,
if (has_prefix(*term)) if (has_prefix(*term))
continue; continue;
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) { if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
ret = ABSRES_TERMMISS; ret |= ABSRES_TERMMISS;
LOGDEB0(("makeAbstract: max term count cutoff %d\n", LOGDEB0(("makeAbstract: max term count cutoff %d\n",
m_q->m_snipMaxPosWalk)); m_q->m_snipMaxPosWalk));
break; break;
@ -522,7 +525,7 @@ abstract_result Query::Native::makeAbstract(Xapian::docid docid,
for (pos = xrdb.positionlist_begin(docid, *term); for (pos = xrdb.positionlist_begin(docid, *term);
pos != xrdb.positionlist_end(docid, *term); pos++) { pos != xrdb.positionlist_end(docid, *term); pos++) {
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) { if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
ret = ABSRES_TERMMISS; ret |= ABSRES_TERMMISS;
LOGDEB0(("makeAbstract: max term count cutoff %d\n", LOGDEB0(("makeAbstract: max term count cutoff %d\n",
m_q->m_snipMaxPosWalk)); m_q->m_snipMaxPosWalk));
break; break;

View File

@ -264,16 +264,16 @@ bool Query::getQueryTerms(vector<string>& terms)
return true; return true;
} }
abstract_result Query::makeDocAbstract(Doc &doc, int Query::makeDocAbstract(Doc &doc,
vector<Snippet>& abstract, vector<Snippet>& abstract,
int maxoccs, int ctxwords) int maxoccs, int ctxwords)
{ {
LOGDEB(("makeDocAbstract: maxoccs %d ctxwords %d\n", maxoccs, ctxwords)); LOGDEB(("makeDocAbstract: maxoccs %d ctxwords %d\n", maxoccs, ctxwords));
if (!m_db || !m_db->m_ndb || !m_db->m_ndb->m_isopen || !m_nq) { if (!m_db || !m_db->m_ndb || !m_db->m_ndb->m_isopen || !m_nq) {
LOGERR(("Query::makeDocAbstract: no db or no nq\n")); LOGERR(("Query::makeDocAbstract: no db or no nq\n"));
return ABSRES_ERROR; return ABSRES_ERROR;
} }
abstract_result ret = ABSRES_ERROR; int ret = ABSRES_ERROR;
XAPTRY(ret = m_nq->makeAbstract(doc.xdocid, abstract, maxoccs, ctxwords), XAPTRY(ret = m_nq->makeAbstract(doc.xdocid, abstract, maxoccs, ctxwords),
m_db->m_ndb->xrdb, m_reason); m_db->m_ndb->xrdb, m_reason);
if (!m_reason.empty()) { if (!m_reason.empty()) {

View File

@ -33,7 +33,7 @@ enum abstract_result {
ABSRES_ERROR = 0, ABSRES_ERROR = 0,
ABSRES_OK = 1, ABSRES_OK = 1,
ABSRES_TRUNC = 2, ABSRES_TRUNC = 2,
ABSRES_TERMMISS = 3 ABSRES_TERMMISS = 4
}; };
// Snippet entry for makeDocAbstract // Snippet entry for makeDocAbstract
@ -110,8 +110,8 @@ class Query {
// Returned as a snippets vector // Returned as a snippets vector
bool makeDocAbstract(Doc &doc, std::vector<std::string>& abstract); bool makeDocAbstract(Doc &doc, std::vector<std::string>& abstract);
// Returned as a vector of pair<page,snippet> page is 0 if unknown // Returned as a vector of pair<page,snippet> page is 0 if unknown
abstract_result makeDocAbstract(Doc &doc, std::vector<Snippet>& abst, int makeDocAbstract(Doc &doc, std::vector<Snippet>& abst,
int maxoccs= -1, int ctxwords = -1); int maxoccs= -1, int ctxwords = -1);
/** Retrieve page number for first match for term */ /** Retrieve page number for first match for term */
int getFirstMatchPage(Doc &doc, std::string& term); int getFirstMatchPage(Doc &doc, std::string& term);

View File

@ -59,8 +59,8 @@ public:
} }
/** Return a list of terms which matched for a specific result document */ /** Return a list of terms which matched for a specific result document */
bool getMatchTerms(unsigned long xdocid, std::vector<std::string>& terms); bool getMatchTerms(unsigned long xdocid, std::vector<std::string>& terms);
abstract_result makeAbstract(Xapian::docid id, vector<Snippet>&, int makeAbstract(Xapian::docid id, vector<Snippet>&,
int maxoccs = -1, int ctxwords = -1); int maxoccs = -1, int ctxwords = -1);
int getFirstMatchPage(Xapian::docid docid, std::string& term); int getFirstMatchPage(Xapian::docid docid, std::string& term);
void setDbWideQTermsFreqs(); void setDbWideQTermsFreqs();
double qualityTerms(Xapian::docid docid, double qualityTerms(Xapian::docid docid,