Add a callback to textsplit for reporting line breaks. Use it to look up the first line where a term appears, for use with a %l spec when executing a viewer.

Jean-Francois Dockes 2022-01-07 11:43:46 +01:00
parent 25d6d78902
commit 7b42907441
8 changed files with 138 additions and 70 deletions
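
With this change, a viewer definition in Recoll's mimeview configuration can pass the line number of the first match to the viewer command through the new %l substitution. A hypothetical entry (the MIME type and command are illustrative, not part of this commit):

[view]
text/plain = gvim +%l %f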

View File

@ -596,6 +596,7 @@ bool TextSplit::text_to_words(const string &in)
clearsplitstate();
bool pagepending = false;
bool nlpending = false;
bool softhyphenpending = false;
// Running count of non-alphanum chars. Reset when we see one;
@ -705,6 +706,10 @@ bool TextSplit::text_to_words(const string &in)
pagepending = false;
newpage(m_wordpos);
}
if (nlpending) {
nlpending = false;
newline(m_wordpos);
}
break;
case WILD:
@ -745,6 +750,12 @@ bool TextSplit::text_to_words(const string &in)
break;
}
} else {
// Note about dangling hyphens: we always strip a '-' found before whitespace,
// even before a newline, and generate two terms, one before and one after the
// line break. We have no way to know whether the '-' is there because a word
// was broken by justification or because it is part of an actual compound
// word (we would need a dictionary to check). Soft-hyphen *should* be used
// instead when the '-' is not part of the text.
if (nextc == -1 || isvisiblewhite(nextc)) {
goto SPACE;
}
@ -844,19 +855,10 @@ bool TextSplit::text_to_words(const string &in)
break;
case '\n':
nlpending = true;
/* FALLTHROUGH */
case '\r':
if (m_span.length() && *m_span.rbegin() == '-') {
// if '-' is the last char before end of line, we
// strip it. We have no way to know if this is added
// because of the line split or if it was part of an
// actual compound word (would need a dictionary to
// check). As soft-hyphen *should* be used if the '-'
// is not part of the text, it is better to properly
// process a real compound word, and produce wrong
// output from wrong text. The word-emitting routine
// will strip the trailing '-'.
goto SPACE;
} else if (softhyphenpending) {
if (softhyphenpending) {
// Don't reset soft-hyphen
continue;
} else {

View File

@ -73,6 +73,9 @@ public:
* just don't know about pages. */
virtual void newpage(int /*pos*/) {}
/** Called when we encounter newline \n 0x0a. Override to use the event. */
virtual void newline(int /*pos*/) {}
// Static utility functions:
/** Count words in string, as the splitter would generate them */
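
A minimal sketch of a client of the new callback (the class name is illustrative; the real consumer added by this commit is TermLineSplitter further down):

#include <string>
#include "textsplit.h"
#include "log.h"

// Hypothetical subclass: track the 1-based line on which each word occurs.
class LineTracker : public TextSplit {
public:
    LineTracker() : TextSplit(TextSplit::TXTS_NOSPANS) {}
    bool takeword(const std::string& w, int pos, int, int) override {
        LOGDEB("word [" << w << "] pos " << pos << " line " << m_line << "\n");
        return true; // returning false would stop text_to_words()
    }
    void newline(int) override {
        m_line++; // called by text_to_words() for each \n encountered
    }
    int line() const { return m_line; }
private:
    int m_line{1};
};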

View File

@ -34,6 +34,7 @@
#include "rclmain_w.h"
#include "rclzg.h"
#include "pathut.h"
#include "unacpp.h"
using namespace std;
@ -42,7 +43,6 @@ static const vector<string> browser_list{
"opera", "google-chrome", "chromium-browser",
"palemoon", "iceweasel", "firefox", "konqueror", "epiphany"};
// Start native viewer or preview for input Doc. This is used to allow
// using recoll from another app (e.g. Unity Scope) to view embedded
// result docs (docs with an ipath). We act as a proxy to extract
@ -155,13 +155,27 @@ void RclMain::openWith(Rcl::Doc doc, string cmdspec)
execViewer(subs, false, execname, lcmd, cmdspec, doc);
}
void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
static bool pagenumNeeded(const std::string& cmd)
{
return cmd.find("%p") != std::string::npos;
}
static bool linenumNeeded(const std::string& cmd)
{
return cmd.find("%l") != std::string::npos;
}
static bool termNeeded(const std::string& cmd)
{
return cmd.find("%s") != std::string::npos;
}
void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString qterm)
{
std::string term = qs2utf8s(qterm);
string apptag;
doc.getmeta(Rcl::Doc::keyapptg, &apptag);
LOGDEB("RclMain::startNativeViewer: mtype [" << doc.mimetype <<
"] apptag [" << apptag << "] page " << pagenum << " term [" <<
qs2utf8s(term) << "] url [" << doc.url << "] ipath [" <<
term << "] url [" << doc.url << "] ipath [" <<
doc.ipath << "]\n");
// Look for appropriate viewer
@ -377,19 +391,19 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
// If we are not called with a page number (which would happen for a call
// from the snippets window), see if we can compute a page number anyway.
if (pagenum == -1) {
pagenum = 1;
string lterm;
if (m_source)
pagenum = m_source->getFirstMatchPage(doc, lterm);
if (m_source && pagenum == -1 && (pagenumNeeded(cmd) || termNeeded(cmd) || linenumNeeded(cmd))) {
pagenum = m_source->getFirstMatchPage(doc, term);
if (pagenum == -1)
pagenum = 1;
else // We get the match term used to compute the page
term = QString::fromUtf8(lterm.c_str());
}
char cpagenum[20];
sprintf(cpagenum, "%d", pagenum);
int line = 1;
if (m_source && !term.empty() && linenumNeeded(cmd)) {
if (doc.text.empty()) {
rcldb->getDocRawText(doc);
}
line = m_source->getFirstMatchLine(doc, term);
}
// Substitute %xx inside arguments
string efftime;
@ -408,9 +422,10 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
subs["f"] = fn;
subs["F"] = fn;
subs["i"] = FileInterner::getLastIpathElt(doc.ipath);
subs["l"] = ulltodecstr(line);
subs["M"] = doc.mimetype;
subs["p"] = cpagenum;
subs["s"] = (const char*)term.toLocal8Bit();
subs["p"] = ulltodecstr(pagenum);
subs["s"] = term;
subs["U"] = url_encode(url);
subs["u"] = url;
// Let %(xx) access all metadata.
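
For illustration, with this map a hypothetical viewer definition such as `gvim +%l %f` would expand to:

gvim +42 /home/me/doc.txt

assuming the chosen term first matches on line 42 (line number and path invented for the example).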

View File

@ -111,6 +111,9 @@ public:
virtual int getFirstMatchPage(Rcl::Doc&, std::string&) {
return -1;
}
virtual int getFirstMatchLine(const Rcl::Doc&, const std::string&) {
return 1;
}
/** Get duplicates. */
virtual bool docDups(const Rcl::Doc&, std::vector<Rcl::Doc>&) {
return false;

View File

@ -126,6 +126,17 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
return -1;
}
int DocSequenceDb::getFirstMatchLine(const Rcl::Doc &doc, const string& term)
{
std::unique_lock<std::mutex> locker(o_dblock);
if (!setQuery())
return 1; // no valid query: default to line 1
if (m_q->whatDb()) {
return m_q->getFirstMatchLine(doc, term);
}
return 1;
}
list<string> DocSequenceDb::expand(Rcl::Doc &doc)
{
std::unique_lock<std::mutex> locker(o_dblock);

View File

@ -43,6 +43,7 @@ public:
virtual bool getAbstract(Rcl::Doc &doc, std::vector<std::string>&) override;
virtual int getFirstMatchPage(Rcl::Doc&, std::string& term) override;
virtual int getFirstMatchLine(const Rcl::Doc&, const std::string& term) override;
virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups)
override;
virtual std::string getDescription() override;

View File

@ -141,11 +141,9 @@ public:
// add/update fragment definition.
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
LOGDEB1("takeword: [" << term << "] bytepos: "<<bts<<":"<<bte<<endl);
// Limit time taken with monster documents. The resulting
// abstract will be incorrect or inexistent, but this is
// better than taking forever (the default cutoff value comes
// from the snippetMaxPosWalk configuration parameter, and is
// 10E6)
// Limit time taken with monster documents. The resulting abstract will be incorrect or
// inexistent, but this is better than taking forever (the default cutoff value comes from
// the snippetMaxPosWalk configuration parameter, and is 10E6)
if (maxtermcount && termcount++ > maxtermcount) {
LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
maxtermcount << endl);
@ -154,8 +152,7 @@ public:
}
// Also limit the number of fragments (a just-in-case safety measure)
if (m_fragments.size() > maxtermcount / 100) {
LOGINF("Rclabsfromtext: stopping because maxfragments reached: "<<
maxtermcount/100 << endl);
LOGINF("Rclabsfromtext: stopping: max fragments count: " << maxtermcount/100 << "\n");
retflags |= ABSRES_TRUNC;
return false;
}
@ -193,8 +190,7 @@ public:
m_curterm = term;
m_curtermcoef = coef;
} else {
LOGDEB2("Extending current fragment: " << m_remainingWords <<
" -> " << m_ctxwords << endl);
LOGDEB2("Extending current fragment: "<<m_remainingWords<<" -> "<<m_ctxwords<< "\n");
m_extcount++;
#ifdef COMPUTE_HLZONES
if (m_prevwordhit) {
@ -215,9 +211,8 @@ public:
m_curfragcoef += coef;
m_remainingWords = m_ctxwords + 1;
if (m_extcount > 5) {
// Limit expansion of contiguous fragments (this is to
// avoid common terms in search causing long
// heavyweight meaningless fragments. Also, limit length).
// Limit expansion of contiguous fragments (this is to avoid common terms in search
// causing long heavyweight meaningless fragments. Also, limit length).
m_remainingWords = 1;
m_extcount = 0;
}
@ -247,18 +242,14 @@ public:
LOGDEB1("FRAGMENT: from byte " << m_curfrag.first <<
" to byte " << m_curfrag.second << endl);
LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr(
m_curfrag.first, m_curfrag.second-m_curfrag.first)
<< "]\n");
// We used to not push weak fragments if we had a lot
// already. This can cause problems if the fragments
// we drop are actually group fragments (which have
// not got their boost yet). The right cut value is
// difficult to determine, because the absolute values
// of the coefs depend on many things (index size,
// etc.) The old test was if (m_totalcoef < 5.0 ||
// m_curfragcoef >= 1.0) We now just avoid creating a
// monster by testing the current fragments count at
// the top of the function
m_curfrag.first, m_curfrag.second-m_curfrag.first) << "]\n");
// We used to not push weak fragments if we had a lot already. This can cause
// problems if the fragments we drop are actually group fragments (which have not
// got their boost yet). The right cut value is difficult to determine, because the
// absolute values of the coefs depend on many things (index size, etc.) The old
// test was if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) We now just avoid
// creating a monster by testing the current fragments count at the top of the
// function
m_fragments.push_back(MatchFragment(m_curfrag.first,
m_curfrag.second,
m_curfragcoef,
@ -298,8 +289,7 @@ public:
m_curtermcoef = 0.0;
}
LOGDEB("TextSplitABS: stored total " << m_fragments.size() <<
" fragments" << endl);
LOGDEB("TextSplitABS: stored total " << m_fragments.size() << " fragments" << endl);
vector<GroupMatchEntry> tboffs;
// Look for matches to PHRASE and NEAR term groups and finalize
@ -340,9 +330,8 @@ public:
}
auto fragit = m_fragments.begin();
for (const auto& grpmatch : tboffs) {
LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first <<
"-" << grpmatch.offs.second << " curfrag " <<
fragit->start << "-" << fragit->stop << endl);
LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << "-" <<
grpmatch.offs.second<<" curfrag "<<fragit->start<<"-"<<fragit->stop<<"\n");
while (fragit->stop < grpmatch.offs.first) {
fragit++;
if (fragit == m_fragments.end()) {
@ -417,21 +406,19 @@ int Query::Native::abstractFromText(
bool sortbypage
)
{
(void)chron;
PRETEND_USE(chron);
LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n");
string rawtext;
if (!ndb->getRawText(docid, rawtext)) {
LOGDEB0("abstractFromText: can't fetch text\n");
return ABSRES_ERROR;
}
LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " <<
chron.millis() << "mS\n");
LOGABS("abstractFromText: got raw text: size "<<rawtext.size()<<" "<<chron.millis()<<"mS\n");
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
(defined(RAWTEXT_IN_DATA))
#if 0 && XAPIAN_AT_LEAST(1,3,5)
// Try out the Xapian internal method.
string snippet = xmset.snippet(rawtext);
LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
string snippet = xmset.snippet(rawtext, 60);
std::cerr << "XAPIAN SNIPPET: [" << snippet << "] END SNIPPET\n";
#endif
// We need the q coefs for individual terms
@ -452,8 +439,7 @@ int Query::Native::abstractFromText(
}
LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n");
TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords,
TextSplit::TXTS_NONE,
TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords, TextSplit::TXTS_NONE,
m_q->m_snipMaxPosWalk);
splitter.text_to_words(rawtext);
LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n");
@ -484,8 +470,7 @@ int Query::Native::abstractFromText(
// main term and the page positions.
unsigned int count = 0;
for (const auto& entry : result) {
string frag(
fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
string frag(fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
#ifdef COMPUTE_HLZONES
// This would need to be modified to take tag parameters
@ -506,8 +491,7 @@ int Query::Native::abstractFromText(
if (page < 0)
page = 0;
}
LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef <<
": " << frag << endl);
LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef << ": " << frag << endl);
vabs.push_back(Snippet(page, frag).setTerm(entry.term));
if (count++ >= maxtotaloccs)
break;
@ -515,4 +499,45 @@ int Query::Native::abstractFromText(
return ABSRES_OK | splitter.getretflags();
}
class TermLineSplitter : public TextSplit {
public:
TermLineSplitter(const std::string& term)
: TextSplit(TextSplit::TXTS_NOSPANS), m_term(term) {
}
bool takeword(const std::string& _term, int, int, int) override {
std::string term(_term); // compare the raw form if the index is not stripped
if (o_index_stripchars) {
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO("TermLineSplitter::takeword: unac failed for [" << _term << "]\n");
return true;
}
}
if (term == m_term) {
return false;
}
return true;
}
void newline(int) override {
m_line++;
}
int getline() {
return m_line;
}
private:
int m_line{1};
std::string m_term;
};
int Query::getFirstMatchLine(const Doc &doc, const std::string& term)
{
int line = 1;
TermLineSplitter splitter(term);
bool ret = splitter.text_to_words(doc.text);
// takeword() stops the split by returning false as soon as the term is found.
if (!ret) {
line = splitter.getline();
}
return line;
}
}

View File

@ -115,10 +115,18 @@ public:
// Returned as a vector of pair<page,snippet> page is 0 if unknown
int makeDocAbstract(const Doc &doc, std::vector<Snippet>& abst,
int maxoccs= -1, int ctxwords= -1,bool sortbypage=false);
/** Retrieve page number for first match for "significant" query term
* @param term returns the chosen term */
/** Choose most interesting term and return the page number for its first match
* @param term returns the chosen term
* @return page number or -1 if term not found or other issue
*/
int getFirstMatchPage(const Doc &doc, std::string& term);
/** Compute the line number of the first match of term. Only works if doc.text holds the text.
* This runs a text split. Both this and getFirstMatchPage() above could be computed and saved
* while we compute the abstracts, so there is quite a lot of waste here. */
int getFirstMatchLine(const Doc &doc, const std::string& term);
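
A usage sketch for the two calls together (illustrative; doc.text must hold the raw document text, e.g. fetched with getDocRawText()):

// Hypothetical caller, mirroring what the GUI does for the %p and %l substitutions.
std::string term;
int page = query.getFirstMatchPage(doc, term); // -1 on failure; also sets term
int line = query.getFirstMatchLine(doc, term); // 1 if the term is not found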
/** Retrieve a reference to the searchData we are using */
std::shared_ptr<SearchData> getSD() {
return m_sd;