From ab32062fcc847e260040c979d85a37d47b41680b Mon Sep 17 00:00:00 2001 From: "\"Jean-Francois Dockes ext:(%22)" Date: Sun, 23 Sep 2012 18:19:43 +0200 Subject: [PATCH] Separate count and context for snippets in the snippets popup from the default values for the result list --- src/qtgui/snippets.ui | 4 +-- src/qtgui/snippets_w.cpp | 11 ++++++ src/query/docseq.h | 1 - src/query/docseqdb.cpp | 19 +++++++++-- src/query/docseqdb.h | 4 +++ src/rcldb/rcldb.cpp | 63 +++++++++++++++++++++++------------ src/rcldb/rcldb.h | 16 +++++++-- src/rcldb/rcldb_p.h | 5 +-- src/sampleconf/recoll.conf.in | 2 ++ website/index.html.en | 7 ++++ 10 files changed, 99 insertions(+), 33 deletions(-) diff --git a/src/qtgui/snippets.ui b/src/qtgui/snippets.ui index 5e3b3892..0d7d4613 100644 --- a/src/qtgui/snippets.ui +++ b/src/qtgui/snippets.ui @@ -6,8 +6,8 @@ 0 0 - 516 - 395 + 640 + 400 diff --git a/src/qtgui/snippets_w.cpp b/src/qtgui/snippets_w.cpp index 917c2041..751c3624 100644 --- a/src/qtgui/snippets_w.cpp +++ b/src/qtgui/snippets_w.cpp @@ -50,6 +50,17 @@ void SnippetsW::init() if (m_source.isNull()) return; + // Make title out of file name if none yet + string titleOrFilename; + string utf8fn; + m_doc.getmeta(Rcl::Doc::keytt, &titleOrFilename); + m_doc.getmeta(Rcl::Doc::keyfn, &utf8fn); + if (titleOrFilename.empty()) { + titleOrFilename = utf8fn; + } + + setWindowTitle(QString::fromUtf8(titleOrFilename.c_str())); + vector > vpabs; m_source->getAbstract(m_doc, vpabs); diff --git a/src/query/docseq.h b/src/query/docseq.h index 69169975..4d3ba12c 100644 --- a/src/query/docseq.h +++ b/src/query/docseq.h @@ -98,7 +98,6 @@ class DocSequence { virtual bool getAbstract(Rcl::Doc& doc, std::vector >& abs) { - fprintf(stderr, "DocSequence::getAbstract/pair\n"); abs.push_back(std::pair(0, doc.meta[Rcl::Doc::keyabs])); return true; diff --git a/src/query/docseqdb.cpp b/src/query/docseqdb.cpp index a1e1762d..195519a8 100644 --- a/src/query/docseqdb.cpp +++ b/src/query/docseqdb.cpp @@ -65,19 +65,32 @@ 
int DocSequenceDb::getResCnt() return m_rescnt; } +// This one only gets called to fill-up the snippets window +// We ignore most abstract/snippets preferences. bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector >& vpabs) { LOGDEB(("DocSequenceDb::getAbstract/pair\n")); setQuery(); - if (m_q->whatDb() && - m_queryBuildAbstract && (doc.syntabs || m_queryReplaceAbstract)) { - m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), vpabs); + + // Have to put the limit somewhere. + int maxoccs = 500; + Rcl::abstract_result ret = Rcl::ABSRES_ERROR; + if (m_q->whatDb()) { + ret = m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), vpabs, + maxoccs, + m_q->whatDb()->getAbsCtxLen()+ 2); } if (vpabs.empty()) vpabs.push_back(pair(0, doc.meta[Rcl::Doc::keyabs])); + + // If the list was probably truncated, indicate it. + if (ret == Rcl::ABSRES_TRUNC) + vpabs.push_back(pair(-1, "[...]")); + return true; } + bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector& vabs) { setQuery(); diff --git a/src/query/docseqdb.h b/src/query/docseqdb.h index e4d1ad67..7bdef8f5 100644 --- a/src/query/docseqdb.h +++ b/src/query/docseqdb.h @@ -31,7 +31,11 @@ class DocSequenceDb : public DocSequence { virtual bool getDoc(int num, Rcl::Doc &doc, string * = 0); virtual int getResCnt(); virtual void getTerms(HighlightData& hld); + + // Called to fill-up the snippets window. 
Ignores + // buildabstract/replaceabstract and syntabslen virtual bool getAbstract(Rcl::Doc &doc, vector >&); + virtual bool getAbstract(Rcl::Doc &doc, vector&); virtual int getFirstMatchPage(Rcl::Doc&); virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index f26ae7ae..dec9fdb5 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -244,7 +244,7 @@ void Db::Native::setDbWideQTermsFreqs(Query *query) for (vector::const_iterator qit = qterms.begin(); qit != qterms.end(); qit++) { query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt; - LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), + LOGABS(("set..QTermFreqs: [%s] db freq %.1e\n", qit->c_str(), query->m_nq->termfreqs[*qit])); } } @@ -298,6 +298,7 @@ double Db::Native::qualityTerms(Xapian::docid docid, } #ifdef DEBUGABSTRACT + LOGDEB(("Db::qualityTerms:\n")); for (multimap::reverse_iterator qit = byQ.rbegin(); qit != byQ.rend(); qit++) { LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str())); @@ -415,12 +416,13 @@ int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query) // // DatabaseModified and other general exceptions are catched and // possibly retried by our caller -bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, - vector >& vabs) +abstract_result Db::Native::makeAbstract(Xapian::docid docid, Query *query, + vector >& vabs, + int imaxoccs, int ictxwords) { Chrono chron; - LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(), - m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen)); + LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d imaxoccs %d\n", chron.ms(), + m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen, imaxoccs)); // The (unprefixed) terms matched by this document vector matchedTerms; @@ -430,7 +432,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, noPrefixList(iterms, matchedTerms); if (matchedTerms.empty()) { LOGDEB(("makeAbstract::Empty term 
list\n")); - return false; + return ABSRES_ERROR; } } listList("Match terms: ", matchedTerms); @@ -453,7 +455,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, // This can't happen, but would crash us if (totalweight == 0.0) { LOGERR(("makeAbstract: totalweight == 0.0 !\n")); - return false; + return ABSRES_ERROR; } /////////////////// @@ -474,13 +476,17 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, // abstract size parameter in characters, we basically only deal // with words. We used to limit the character size at the end, but // this damaged our careful selection of terms - const unsigned int maxtotaloccs = + const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs : m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1)); - LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs)); + int ctxwords = ictxwords == -1 ? m_rcldb->m_synthAbsWordCtxLen : ictxwords; + LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n", + chron.ms(), maxtotaloccs, ctxwords)); // This is used to mark positions overlapped by a multi-word match term const string occupiedmarker("?"); + abstract_result ret = ABSRES_OK; + // Let's go populate for (multimap::reverse_iterator qit = byQ.rbegin(); qit != byQ.rend(); qit++) { @@ -522,7 +528,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, // step by inserting empty strings. Special provisions // for adding ellipsis and for positions overlapped by // the match term. 
- unsigned int sta = MAX(0, ipos-m_rcldb->m_synthAbsWordCtxLen); + unsigned int sta = MAX(0, ipos - ctxwords); unsigned int sto = ipos + qtrmwrdcnt-1 + m_rcldb->m_synthAbsWordCtxLen; for (unsigned int ii = sta; ii <= sto; ii++) { @@ -548,14 +554,20 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, // Limit to allocated occurences and total size if (++occurrences >= maxoccs || - totaloccs >= maxtotaloccs) + totaloccs >= maxtotaloccs) { + ret = ABSRES_TRUNC; + LOGDEB(("Db::makeAbstract: max occurrences cutoff\n")); break; + } } } catch (...) { // Term does not occur. No problem. } - if (totaloccs >= maxtotaloccs) + if (totaloccs >= maxtotaloccs) { + ret = ABSRES_TRUNC; + LOGDEB(("Db::makeAbstract: max1 occurrences cutoff\n")); break; + } } LOGABS(("makeAbstract:%d:chosen number of positions %d\n", chron.millis(), totaloccs)); @@ -564,7 +576,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, // etc. but not elsewhere ? if (totaloccs == 0) { LOGDEB1(("makeAbstract: no occurrences\n")); - return false; + return ABSRES_ERROR; } // Walk all document's terms position lists and populate slots @@ -582,6 +594,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z') continue; if (cutoff-- < 0) { + ret = ABSRES_TRUNC; LOGDEB0(("makeAbstract: max term count cutoff\n")); break; } @@ -590,6 +603,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, for (pos = xrdb.positionlist_begin(docid, *term); pos != xrdb.positionlist_end(docid, *term); pos++) { if (cutoff-- < 0) { + ret = ABSRES_TRUNC; LOGDEB0(("makeAbstract: max term count cutoff\n")); break; } @@ -600,8 +614,8 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, // at the same position, we want to keep only the // first one (ie: dockes and dockes@wanadoo.fr) if (vit->second.empty()) { - LOGABS(("makeAbstract: populating: [%s] at %d\n", - (*term).c_str(), *pos)); + LOGDEB2(("makeAbstract: 
populating: [%s] at %d\n", + (*term).c_str(), *pos)); sparseDoc[*pos] = *term; } } @@ -665,7 +679,7 @@ bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, vabs.push_back(pair(page, chunk)); LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis())); - return true; + return ret; } /* Rcl::Db methods ///////////////////////////////// */ @@ -2119,17 +2133,22 @@ bool Db::stemDiffers(const string& lang, const string& word, return true; } -bool Db::makeDocAbstract(Doc &doc, Query *query, - vector >& abstract) +abstract_result Db::makeDocAbstract(Doc &doc, Query *query, + vector >& abstract, + int maxoccs, int ctxwords) { + LOGDEB(("makeDocAbstract: maxoccs %d ctxwords %d\n", maxoccs, ctxwords)); if (!m_ndb || !m_ndb->m_isopen) { LOGERR(("Db::makeDocAbstract: no db\n")); - return false; + return ABSRES_ERROR; } - bool ret = false; - XAPTRY(ret = m_ndb->makeAbstract(doc.xdocid, query, abstract), + abstract_result ret = ABSRES_ERROR; + XAPTRY(ret = m_ndb->makeAbstract(doc.xdocid, query, abstract, + maxoccs, ctxwords), m_ndb->xrdb, m_reason); - return (ret && m_reason.empty()) ? true : false; + if (!m_reason.empty()) + return ABSRES_ERROR; + return ret; } bool Db::makeDocAbstract(Doc &doc, Query *query, vector& abstract) diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 96630b16..80ef2eb8 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -66,6 +66,11 @@ enum value_slot { VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size }; +enum abstract_result { + ABSRES_ERROR = 0, + ABSRES_OK = 1, + ABSRES_TRUNC = 2 +}; class SearchData; class TermIter; class Query; @@ -220,6 +225,10 @@ class Db { /** Set parameters for synthetic abstract generation */ void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen); + int getAbsCtxLen() const + { + return m_synthAbsWordCtxLen; + } /** Build synthetic abstract for document, extracting chunks relevant for * the input query. 
This uses index data only (no access to the file) */ @@ -227,9 +236,10 @@ class Db { bool makeDocAbstract(Doc &doc, Query *query, string& abstract); // Returned as a snippets vector bool makeDocAbstract(Doc &doc, Query *query, vector& abstract); - // Returned as a vector of page,snippet page is 0 if unknown - bool makeDocAbstract(Doc &doc, Query *query, - vector >& abstract); + // Returned as a vector of pair page is 0 if unknown + abstract_result makeDocAbstract(Doc &doc, Query *query, + vector >& abstract, + int maxoccs= -1, int ctxwords = -1); /** Retrieve detected page breaks positions */ int getFirstMatchPage(Doc &doc, Query *query); diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h index f5af8ece..efbe8f91 100644 --- a/src/rcldb/rcldb_p.h +++ b/src/rcldb/rcldb_p.h @@ -89,8 +89,9 @@ class Db::Native { const vector& terms, std::multimap& byQ); void setDbWideQTermsFreqs(Query *query); - bool makeAbstract(Xapian::docid id, Query *query, - vector >&); + abstract_result makeAbstract(Xapian::docid id, Query *query, + vector >&, int maxoccs = -1, + int ctxwords = -1); bool getPagePositions(Xapian::docid docid, vector& vpos); int getFirstMatchPage(Xapian::docid docid, Query *query); int getPageNumberForPosition(const vector& pbreaks, unsigned int pos); diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index d79eeb0d..bfe24711 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -81,6 +81,8 @@ indexstemminglanguages = english # unac_except_trans = Ää Öö Üü ää öö üü ßss # In French, you probably want to decompose oe and ae # unac_except_trans = œoe Œoe æae Æae +# Actually, this seems a reasonable default for all until someone protests. +unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl # Where to store the database (directory). 
This may be an absolute path, # else it is taken as relative to the configuration directory (-c argument diff --git a/website/index.html.en b/website/index.html.en index 968b5f67..baf58fb4 100644 --- a/website/index.html.en +++ b/website/index.html.en @@ -86,6 +86,13 @@

News

    +
  • 2012-09-21: an + easy + way to extend the "Beagle queue" + Recoll web history indexing mechanism to browsers other than + Firefox (Elinks in this case). +
  • +
  • 2012-09-13: the next Recoll version will maybe acquire switchable case and diacritics sensitivity. I am writing a few pages about the