From 7b4290744130aa5ca179c673cd22bb87cb9f6b58 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Fri, 7 Jan 2022 11:43:46 +0100
Subject: [PATCH] Add callback for textsplit to report line breaks. Use it to
 implement looking up the first line where a term appears, for use with a %l
 spec when executing a viewer

---
 src/common/textsplit.cpp     |  26 +++++----
 src/common/textsplit.h       |   3 +
 src/qtgui/rclm_view.cpp      |  43 +++++++++-----
 src/query/docseq.h           |   3 +
 src/query/docseqdb.cpp       |  11 ++++
 src/query/docseqdb.h         |   1 +
 src/rcldb/rclabsfromtext.cpp | 109 +++++++++++++++++++++--------------
 src/rcldb/rclquery.h         |  12 +++-
 8 files changed, 138 insertions(+), 70 deletions(-)

diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 49b234c0..be61c917 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -596,6 +596,7 @@ bool TextSplit::text_to_words(const string &in)
     clearsplitstate();
 
     bool pagepending = false;
+    bool nlpending = false;
     bool softhyphenpending = false;
 
     // Running count of non-alphanum chars. Reset when we see one;
@@ -705,6 +706,10 @@ bool TextSplit::text_to_words(const string &in)
                 pagepending = false;
                 newpage(m_wordpos);
             }
+            if (nlpending) {
+                nlpending = false;
+                newline(m_wordpos);
+            }
             break;
 
         case WILD:
@@ -745,6 +750,12 @@ bool TextSplit::text_to_words(const string &in)
                 break;
             }
         } else {
+            // Note about dangling hyphens: we always strip a '-' found before whitespace,
+            // even before a newline, and then generate two terms, one before and one after
+            // the line break. We have no way to know whether the '-' is there because a
+            // word was broken by justification or because it is part of an actual compound
+            // word (we would need a dictionary to check). A soft hyphen *should* be used
+            // when the '-' is not part of the text.
             if (nextc == -1 || isvisiblewhite(nextc)) {
                 goto SPACE;
             }
@@ -844,19 +855,10 @@ bool TextSplit::text_to_words(const string &in)
             break;
 
         case '\n':
+            nlpending = true;
+            /* FALLTHROUGH */
         case '\r':
-            if (m_span.length() && *m_span.rbegin() == '-') {
-                // if '-' is the last char before end of line, we
-                // strip it. We have no way to know if this is added
-                // because of the line split or if it was part of an
-                // actual compound word (would need a dictionary to
-                // check). As soft-hyphen *should* be used if the '-'
-                // is not part of the text, it is better to properly
-                // process a real compound word, and produce wrong
-                // output from wrong text. The word-emitting routine
-                // will strip the trailing '-'.
-                goto SPACE;
-            } else if (softhyphenpending) {
+            if (softhyphenpending) {
                 // Don't reset soft-hyphen
                 continue;
             } else {
diff --git a/src/common/textsplit.h b/src/common/textsplit.h
index 0821ee04..c09e867f 100644
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@@ -73,6 +73,9 @@ public:
      * just don't know about pages. */
     virtual void newpage(int /*pos*/) {}
 
+    /** Called when we encounter a newline ('\n', 0x0a). Override to use the event. */
+    virtual void newline(int /*pos*/) {}
+
     // Static utility functions:
 
     /** Count words in string, as the splitter would generate them */
diff --git a/src/qtgui/rclm_view.cpp b/src/qtgui/rclm_view.cpp
index d9a75b90..6aa43e00 100644
--- a/src/qtgui/rclm_view.cpp
+++ b/src/qtgui/rclm_view.cpp
@@ -34,6 +34,7 @@
 #include "rclmain_w.h"
 #include "rclzg.h"
 #include "pathut.h"
+#include "unacpp.h"
 
 using namespace std;
 
@@ -42,7 +43,6 @@ static const vector<string> browser_list{
     "opera", "google-chrome", "chromium-browser", "palemoon", "iceweasel",
     "firefox", "konqueror", "epiphany"};
 
-
 // Start native viewer or preview for input Doc. This is used to allow
 // using recoll from another app (e.g. Unity Scope) to view embedded
 // result docs (docs with an ipath). . We act as a proxy to extract
@@ -155,13 +155,27 @@ void RclMain::openWith(Rcl::Doc doc, string cmdspec)
     execViewer(subs, false, execname, lcmd, cmdspec, doc);
 }
 
-void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
+static bool pagenumNeeded(const std::string& cmd)
 {
+    return cmd.find("%p") != std::string::npos;
+}
+static bool linenumNeeded(const std::string& cmd)
+{
+    return cmd.find("%l") != std::string::npos;
+}
+static bool termNeeded(const std::string& cmd)
+{
+    return cmd.find("%s") != std::string::npos;
+}
+
+void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString qterm)
+{
+    std::string term = qs2utf8s(qterm);
     string apptag;
     doc.getmeta(Rcl::Doc::keyapptg, &apptag);
     LOGDEB("RclMain::startNativeViewer: mtype [" << doc.mimetype <<
            "] apptag [" << apptag << "] page " << pagenum << " term [" <<
-           qs2utf8s(term) << "] url [" << doc.url << "] ipath [" <<
+           term << "] url [" << doc.url << "] ipath [" <<
            doc.ipath << "]\n");
 
     // Look for appropriate viewer
@@ -377,19 +391,19 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
 
     // If we are not called with a page number (which would happen for a call
     // from the snippets window), see if we can compute a page number anyway.
-    if (pagenum == -1) {
-        pagenum = 1;
-        string lterm;
-        if (m_source)
-            pagenum = m_source->getFirstMatchPage(doc, lterm);
+    if (m_source && pagenum == -1 && (pagenumNeeded(cmd) || termNeeded(cmd) || linenumNeeded(cmd))) {
+        pagenum = m_source->getFirstMatchPage(doc, term);
         if (pagenum == -1)
             pagenum = 1;
-        else // We get the match term used to compute the page
-            term = QString::fromUtf8(lterm.c_str());
     }
-    char cpagenum[20];
-    sprintf(cpagenum, "%d", pagenum);
+    int line = 1;
+    if (m_source && !term.empty() && linenumNeeded(cmd)) {
+        if (doc.text.empty()) {
+            rcldb->getDocRawText(doc);
+        }
+        line = m_source->getFirstMatchLine(doc, term);
+    }
 
     // Substitute %xx inside arguments
     string efftime;
@@ -408,9 +422,10 @@
     subs["f"] = fn;
     subs["F"] = fn;
     subs["i"] = FileInterner::getLastIpathElt(doc.ipath);
+    subs["l"] = ulltodecstr(line);
     subs["M"] = doc.mimetype;
-    subs["p"] = cpagenum;
-    subs["s"] = (const char*)term.toLocal8Bit();
+    subs["p"] = ulltodecstr(pagenum);
+    subs["s"] = term;
     subs["U"] = url_encode(url);
     subs["u"] = url;
     // Let %(xx) access all metadata.
diff --git a/src/query/docseq.h b/src/query/docseq.h
index 4dd6f50f..650b9d89 100644
--- a/src/query/docseq.h
+++ b/src/query/docseq.h
@@ -111,6 +111,9 @@ public:
     virtual int getFirstMatchPage(Rcl::Doc&, std::string&) {
         return -1;
     }
+    virtual int getFirstMatchLine(const Rcl::Doc&, const std::string&) {
+        return 1;
+    }
     /** Get duplicates. */
     virtual bool docDups(const Rcl::Doc&, std::vector<Rcl::Doc>&) {
         return false;
diff --git a/src/query/docseqdb.cpp b/src/query/docseqdb.cpp
index fab028bd..df06c6a3 100644
--- a/src/query/docseqdb.cpp
+++ b/src/query/docseqdb.cpp
@@ -126,6 +126,17 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
     return -1;
 }
 
+int DocSequenceDb::getFirstMatchLine(const Rcl::Doc &doc, const string& term)
+{
+    std::unique_lock locker(o_dblock);
+    if (!setQuery())
+        return 1;
+    if (m_q->whatDb()) {
+        return m_q->getFirstMatchLine(doc, term);
+    }
+    return 1;
+}
+
 list<string> DocSequenceDb::expand(Rcl::Doc &doc)
 {
     std::unique_lock locker(o_dblock);
diff --git a/src/query/docseqdb.h b/src/query/docseqdb.h
index 69535d79..b77051b4 100644
--- a/src/query/docseqdb.h
+++ b/src/query/docseqdb.h
@@ -43,6 +43,7 @@ public:
     virtual bool getAbstract(Rcl::Doc &doc, std::vector<Rcl::Snippet>&) override;
     virtual int getFirstMatchPage(Rcl::Doc&, std::string& term) override;
+    virtual int getFirstMatchLine(const Rcl::Doc&, const std::string& term) override;
     virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups) override;
     virtual std::string getDescription() override;
diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp
index 32783b1f..4195b1c4 100644
--- a/src/rcldb/rclabsfromtext.cpp
+++ b/src/rcldb/rclabsfromtext.cpp
@@ -141,11 +141,9 @@ public:
     // add/update fragment definition.
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
         LOGDEB1("takeword: [" << term << "] bytepos: "< maxtermcount) {
            LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
                   maxtermcount << endl);
            retflags |= ABSRES_TERMMISS;
            return false;
        }
        // Also limit the number of fragments (just in case safety)
        if (m_fragments.size() > maxtermcount / 100) {
-            LOGINF("Rclabsfromtext: stopping because maxfragments reached: "<<
-                   maxtermcount/100 << endl);
+            LOGINF("Rclabsfromtext: stopping: max fragments count: " << maxtermcount/100 << "\n");
            retflags |= ABSRES_TRUNC;
            return false;
        }
@@ -193,8 +190,7 @@ public:
             m_curterm = term;
             m_curtermcoef = coef;
         } else {
-            LOGDEB2("Extending current fragment: " << m_remainingWords <<
-                    " -> " << m_ctxwords << endl);
+            LOGDEB2("Extending current fragment: " << m_remainingWords << " -> " << m_ctxwords << endl);
             if (m_extcount > 5) {
-                // Limit expansion of contiguous fragments (this is to
-                // avoid common terms in search causing long
-                // heavyweight meaningless fragments. Also, limit length).
+                // Limit expansion of contiguous fragments (this is to avoid common terms in search
+                // causing long heavyweight meaningless fragments. Also, limit length).
                 m_remainingWords = 1;
                 m_extcount = 0;
             }
@@ -247,18 +242,14 @@ public:
             LOGDEB1("FRAGMENT: from byte " << m_curfrag.first << " to byte " << m_curfrag.second << endl);
             LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr(
-                        m_curfrag.first, m_curfrag.second-m_curfrag.first)
-                    << "]\n");
+                        m_curfrag.first, m_curfrag.second-m_curfrag.first) << "]\n");
+            // We used to not push weak fragments if we had a lot already. This can cause
+            // problems if the fragments we drop are actually group fragments (which have not
+            // got their boost yet). The right cut value is difficult to determine, because the
+            // absolute values of the coefs depend on many things (index size, etc.). The old
+            // test was: if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0). We now just avoid
+            // creating a monster by testing the current fragment count at the top of the
+            // function.
             m_fragments.push_back(MatchFragment(m_curfrag.first,
                                                 m_curfrag.second,
                                                 m_curfragcoef,
@@ -298,8 +289,7 @@ public:
         m_curtermcoef = 0.0;
     }
 
-    LOGDEB("TextSplitABS: stored total " << m_fragments.size() <<
-           " fragments" << endl);
+    LOGDEB("TextSplitABS: stored total " << m_fragments.size() << " fragments" << endl);
 
     vector tboffs;
     // Look for matches to PHRASE and NEAR term groups and finalize
@@ -340,9 +330,8 @@ public:
         }
         auto fragit = m_fragments.begin();
         for (const auto& grpmatch : tboffs) {
-            LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first <<
-                    "-" << grpmatch.offs.second << " curfrag " <<
-                    fragit->start << "-" << fragit->stop << endl);
+            LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << "-" <<
+                    grpmatch.offs.second << " curfrag " << fragit->start << "-" << fragit->stop << "\n");
             while (fragit->stop < grpmatch.offs.first) {
                 fragit++;
                 if (fragit == m_fragments.end()) {
@@ -417,21 +406,19 @@ int Query::Native::abstractFromText(
     bool sortbypage
     )
 {
-    (void)chron;
+    PRETEND_USE(chron);
     LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n");
     string rawtext;
     if (!ndb->getRawText(docid, rawtext)) {
         LOGDEB0("abstractFromText: can't fetch text\n");
         return ABSRES_ERROR;
     }
-    LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " <<
-           chron.millis() << "mS\n");
+    LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " << chron.millis() << "mS\n");
                           m_snipMaxPosWalk);
     splitter.text_to_words(rawtext);
     LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n");
@@ -484,8 +470,7 @@ int Query::Native::abstractFromText(
     // main term and the page positions.
     unsigned int count = 0;
     for (const auto& entry : result) {
-        string frag(
-            fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
+        string frag(fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
 
 #ifdef COMPUTE_HLZONES
         // This would need to be modified to take tag parameters
@@ -506,8 +491,7 @@ int Query::Native::abstractFromText(
             if (page < 0)
                 page = 0;
         }
-        LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef <<
-                ": " << frag << endl);
" << page << " Coef: " << entry.coef << ": " << frag << endl); vabs.push_back(Snippet(page, frag).setTerm(entry.term)); if (count++ >= maxtotaloccs) break; @@ -515,4 +499,45 @@ int Query::Native::abstractFromText( return ABSRES_OK | splitter.getretflags(); } +class TermLineSplitter : public TextSplit { +public: + TermLineSplitter(const std::string& term) + : TextSplit(TextSplit::TXTS_NOSPANS), m_term(term) { + } + bool takeword(const std::string& _term, int, int, int) override { + std::string term; + if (o_index_stripchars) { + if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { + LOGINFO("PlainToRich::takeword: unac failed for [" << term << "]\n"); + return true; + } + } + if (term == m_term) { + return false; + } + return true; + } + void newline(int) override { + m_line++; + } + int getline() { + return m_line; + } +private: + int m_line{1}; + std::string m_term; +}; + +int Query::getFirstMatchLine(const Doc &doc, const std::string& term) +{ + int line = 1; + TermLineSplitter splitter(term); + bool ret = splitter.text_to_words(doc.text); + // The splitter takeword() breaks by returning false as soon as the term is found + if (ret == false) { + line = splitter.getline(); + } + return line; +} + } diff --git a/src/rcldb/rclquery.h b/src/rcldb/rclquery.h index cade3650..fd8874d3 100644 --- a/src/rcldb/rclquery.h +++ b/src/rcldb/rclquery.h @@ -115,10 +115,18 @@ public: // Returned as a vector of pair page is 0 if unknown int makeDocAbstract(const Doc &doc, std::vector& abst, int maxoccs= -1, int ctxwords= -1,bool sortbypage=false); - /** Retrieve page number for first match for "significant" query term - * @param term returns the chosen term */ + + /** Choose most interesting term and return the page number for its first match + * @param term returns the chosen term + * @return page number or -1 if term not found or other issue + */ int getFirstMatchPage(const Doc &doc, std::string& term); + /** Compute line number for first match of term. Only works if doc.text has text. + * This uses a text split. Both this and the above getFirstMaxPage() could be done and saved + * while we compute the abstracts, quite a lot of waste here. */ + int getFirstMatchLine(const Doc &doc, const std::string& term); + /** Retrieve a reference to the searchData we are using */ std::shared_ptr getSD() { return m_sd;