From 7b4290744130aa5ca179c673cd22bb87cb9f6b58 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Fri, 7 Jan 2022 11:43:46 +0100
Subject: [PATCH] Add callback for textsplit to report line breaks. Use it to
 implement looking up the first line where a term appears, for use with a %l
 spec when executing a viewer

---
 src/common/textsplit.cpp     |  26 +++++----
 src/common/textsplit.h       |   3 +
 src/qtgui/rclm_view.cpp      |  43 +++++++++-----
 src/query/docseq.h           |   3 +
 src/query/docseqdb.cpp       |  11 ++++
 src/query/docseqdb.h         |   1 +
 src/rcldb/rclabsfromtext.cpp | 109 +++++++++++++++++++++--------------
 src/rcldb/rclquery.h         |  12 +++-
 8 files changed, 138 insertions(+), 70 deletions(-)

diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 49b234c0..be61c917 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -596,6 +596,7 @@ bool TextSplit::text_to_words(const string &in)
     clearsplitstate();
 
     bool pagepending = false;
+    bool nlpending = false;
     bool softhyphenpending = false;
 
     // Running count of non-alphanum chars. Reset when we see one;
@@ -705,6 +706,10 @@ bool TextSplit::text_to_words(const string &in)
                 pagepending = false;
                 newpage(m_wordpos);
             }
+            if (nlpending) {
+                nlpending = false;
+                newline(m_wordpos);
+            }
             break;
 
         case WILD:
@@ -745,6 +750,12 @@ bool TextSplit::text_to_words(const string &in)
                 break;
             }
         } else {
+            // Note about dangling hyphens: we always strip a '-' found before whitespace,
+            // even before a newline, and then generate two terms, one before and one after
+            // the line break. We have no way to know whether the '-' is there because a
+            // word was broken by justification or because it is part of an actual compound
+            // word (we would need a dictionary to check). A soft hyphen *should* be used
+            // when the '-' is not part of the text.
             if (nextc == -1 || isvisiblewhite(nextc)) {
                 goto SPACE;
             }
@@ -844,19 +855,10 @@ bool TextSplit::text_to_words(const string &in)
             break;
 
         case '\n':
+            nlpending = true;
+            /* FALLTHROUGH */
         case '\r':
-            if (m_span.length() && *m_span.rbegin() == '-') {
-                // if '-' is the last char before end of line, we
-                // strip it. We have no way to know if this is added
-                // because of the line split or if it was part of an
-                // actual compound word (would need a dictionary to
-                // check). As soft-hyphen *should* be used if the '-'
-                // is not part of the text, it is better to properly
-                // process a real compound word, and produce wrong
-                // output from wrong text. The word-emitting routine
-                // will strip the trailing '-'.
-                goto SPACE;
-            } else if (softhyphenpending) {
+            if (softhyphenpending) {
                 // Don't reset soft-hyphen
                 continue;
             } else {
diff --git a/src/common/textsplit.h b/src/common/textsplit.h
index 0821ee04..c09e867f 100644
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@@ -73,6 +73,9 @@ public:
      * just don't know about pages. */
     virtual void newpage(int /*pos*/) {}
 
+    /** Called when we encounter a newline ('\n', 0x0a). Override to use the event. */
+    virtual void newline(int /*pos*/) {}
+
     // Static utility functions:
 
     /** Count words in string, as the splitter would generate them */
diff --git a/src/qtgui/rclm_view.cpp b/src/qtgui/rclm_view.cpp
index d9a75b90..6aa43e00 100644
--- a/src/qtgui/rclm_view.cpp
+++ b/src/qtgui/rclm_view.cpp
@@ -34,6 +34,7 @@
 #include "rclmain_w.h"
 #include "rclzg.h"
 #include "pathut.h"
+#include "unacpp.h"
 
 using namespace std;
 
@@ -42,7 +43,6 @@ static const vector<string> browser_list{
     "opera", "google-chrome", "chromium-browser", "palemoon", "iceweasel",
     "firefox", "konqueror", "epiphany"};
 
-
 // Start native viewer or preview for input Doc. This is used to allow
 // using recoll from another app (e.g. Unity Scope) to view embedded
 // result docs (docs with an ipath). . We act as a proxy to extract
@@ -155,13 +155,27 @@ void RclMain::openWith(Rcl::Doc doc, string cmdspec)
     execViewer(subs, false, execname, lcmd, cmdspec, doc);
 }
 
-void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
+static bool pagenumNeeded(const std::string& cmd)
 {
+    return cmd.find("%p") != std::string::npos;
+}
+static bool linenumNeeded(const std::string& cmd)
+{
+    return cmd.find("%l") != std::string::npos;
+}
+static bool termNeeded(const std::string& cmd)
+{
+    return cmd.find("%s") != std::string::npos;
+}
+
+void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString qterm)
+{
+    std::string term = qs2utf8s(qterm);
     string apptag;
     doc.getmeta(Rcl::Doc::keyapptg, &apptag);
     LOGDEB("RclMain::startNativeViewer: mtype [" << doc.mimetype <<
            "] apptag [" << apptag << "] page " << pagenum << " term [" <<
-           qs2utf8s(term) << "] url [" << doc.url << "] ipath [" <<
+           term << "] url [" << doc.url << "] ipath [" <<
            doc.ipath << "]\n");
 
     // Look for appropriate viewer
@@ -377,19 +391,19 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
 
     // If we are not called with a page number (which would happen for a call
     // from the snippets window), see if we can compute a page number anyway.
-    if (pagenum == -1) {
-        pagenum = 1;
-        string lterm;
-        if (m_source)
-            pagenum = m_source->getFirstMatchPage(doc, lterm);
+    if (m_source && pagenum == -1 && (pagenumNeeded(cmd) || termNeeded(cmd) || linenumNeeded(cmd))) {
+        pagenum = m_source->getFirstMatchPage(doc, term);
         if (pagenum == -1)
             pagenum = 1;
-        else // We get the match term used to compute the page
-            term = QString::fromUtf8(lterm.c_str());
     }
-    char cpagenum[20];
-    sprintf(cpagenum, "%d", pagenum);
+    int line = 1;
+    if (m_source && !term.empty() && linenumNeeded(cmd)) {
+        if (doc.text.empty()) {
+            rcldb->getDocRawText(doc);
+        }
+        line = m_source->getFirstMatchLine(doc, term);
+    }
 
     // Substitute %xx inside arguments
     string efftime;
@@ -408,9 +422,10 @@
     subs["f"] = fn;
     subs["F"] = fn;
     subs["i"] = FileInterner::getLastIpathElt(doc.ipath);
+    subs["l"] = ulltodecstr(line);
     subs["M"] = doc.mimetype;
-    subs["p"] = cpagenum;
-    subs["s"] = (const char*)term.toLocal8Bit();
+    subs["p"] = ulltodecstr(pagenum);
+    subs["s"] = term;
     subs["U"] = url_encode(url);
     subs["u"] = url;
     // Let %(xx) access all metadata.
diff --git a/src/query/docseq.h b/src/query/docseq.h
index 4dd6f50f..650b9d89 100644
--- a/src/query/docseq.h
+++ b/src/query/docseq.h
@@ -111,6 +111,9 @@ public:
     virtual int getFirstMatchPage(Rcl::Doc&, std::string&) {
         return -1;
     }
+    virtual int getFirstMatchLine(const Rcl::Doc&, const std::string&) {
+        return 1;
+    }
     /** Get duplicates. */
     virtual bool docDups(const Rcl::Doc&, std::vector<Rcl::Doc>&) {
         return false;
diff --git a/src/query/docseqdb.cpp b/src/query/docseqdb.cpp
index fab028bd..df06c6a3 100644
--- a/src/query/docseqdb.cpp
+++ b/src/query/docseqdb.cpp
@@ -126,6 +126,17 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
     return -1;
 }
 
+int DocSequenceDb::getFirstMatchLine(const Rcl::Doc &doc, const string& term)
+{
+    std::unique_lock locker(o_dblock);
+    if (!setQuery())
+        return 1;
+    if (m_q->whatDb()) {
+        return m_q->getFirstMatchLine(doc, term);
+    }
+    return 1;
+}
+
 list<string> DocSequenceDb::expand(Rcl::Doc &doc)
 {
     std::unique_lock locker(o_dblock);
diff --git a/src/query/docseqdb.h b/src/query/docseqdb.h
index 69535d79..b77051b4 100644
--- a/src/query/docseqdb.h
+++ b/src/query/docseqdb.h
@@ -43,6 +43,7 @@ public:
     virtual bool getAbstract(Rcl::Doc &doc, std::vector<Rcl::Snippet>&) override;
     virtual int getFirstMatchPage(Rcl::Doc&, std::string& term) override;
+    virtual int getFirstMatchLine(const Rcl::Doc&, const std::string& term) override;
     virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups) override;
     virtual std::string getDescription() override;
diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp
index 32783b1f..4195b1c4 100644
--- a/src/rcldb/rclabsfromtext.cpp
+++ b/src/rcldb/rclabsfromtext.cpp
@@ -141,11 +141,9 @@ public:
     // add/update fragment definition.
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
         LOGDEB1("takeword: [" << term << "] bytepos: "< maxtermcount) {
            LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
                   maxtermcount << endl);
            retflags |= ABSRES_TERMMISS;
            return false;
        }
        // Also limit the number of fragments (just in case safety)
        if (m_fragments.size() > maxtermcount / 100) {
-            LOGINF("Rclabsfromtext: stopping because maxfragments reached: "<<
-                   maxtermcount/100 << endl);
+            LOGINF("Rclabsfromtext: stopping: max fragments count: " << maxtermcount/100 << "\n");
            retflags |= ABSRES_TRUNC;
            return false;
        }
@@ -193,8 +190,7 @@ public:
             m_curterm = term;
             m_curtermcoef = coef;
         } else {
-            LOGDEB2("Extending current fragment: " << m_remainingWords <<
-                    " -> " << m_ctxwords << endl);
+            LOGDEB2("Extending current fragment: " << m_remainingWords << " -> " << m_ctxwords << endl);
             if (m_extcount > 5) {
-                // Limit expansion of contiguous fragments (this is to
-                // avoid common terms in search causing long
-                // heavyweight meaningless fragments. Also, limit length).
+                // Limit expansion of contiguous fragments (this is to avoid common terms in search
+                // causing long heavyweight meaningless fragments. Also, limit length).
                 m_remainingWords = 1;
                 m_extcount = 0;
             }
@@ -247,18 +242,14 @@ public:
             LOGDEB1("FRAGMENT: from byte " << m_curfrag.first << " to byte " << m_curfrag.second << endl);
             LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr(
-                        m_curfrag.first, m_curfrag.second-m_curfrag.first)
-                    << "]\n");
+                        m_curfrag.first, m_curfrag.second-m_curfrag.first) << "]\n");
+            // We used to not push weak fragments if we had a lot already. This can cause
+            // problems if the fragments we drop are actually group fragments (which have not
+            // got their boost yet). The right cut value is difficult to determine, because the
+            // absolute values of the coefs depend on many things (index size, etc.). The old
+            // test was: if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0). We now just avoid
+            // creating a monster by testing the current fragment count at the top of the
+            // function.
             m_fragments.push_back(MatchFragment(m_curfrag.first,
                                                 m_curfrag.second,
                                                 m_curfragcoef,
@@ -298,8 +289,7 @@ public:
         m_curtermcoef = 0.0;
     }
 
-    LOGDEB("TextSplitABS: stored total " << m_fragments.size() <<
-           " fragments" << endl);
+    LOGDEB("TextSplitABS: stored total " << m_fragments.size() << " fragments" << endl);
 
     vector tboffs;
     // Look for matches to PHRASE and NEAR term groups and finalize
@@ -340,9 +330,8 @@ public:
         }
         auto fragit = m_fragments.begin();
         for (const auto& grpmatch : tboffs) {
-            LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first <<
-                    "-" << grpmatch.offs.second << " curfrag " <<
-                    fragit->start << "-" << fragit->stop << endl);
+            LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << "-" <<
+                    grpmatch.offs.second << " curfrag " << fragit->start << "-" << fragit->stop << "\n");
             while (fragit->stop < grpmatch.offs.first) {
                 fragit++;
                 if (fragit == m_fragments.end()) {
@@ -417,21 +406,19 @@ int Query::Native::abstractFromText(
     bool sortbypage
     )
 {
-    (void)chron;
+    PRETEND_USE(chron);
     LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n");
     string rawtext;
     if (!ndb->getRawText(docid, rawtext)) {
         LOGDEB0("abstractFromText: can't fetch text\n");
         return ABSRES_ERROR;
     }
-    LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " <<
-           chron.millis() << "mS\n");
+    LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " << chron.millis() << "mS\n");
                           m_snipMaxPosWalk);
     splitter.text_to_words(rawtext);
     LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n");
@@ -484,8 +470,7 @@ int Query::Native::abstractFromText(
     // main term and the page positions.
     unsigned int count = 0;
     for (const auto& entry : result) {
-        string frag(
-            fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
+        string frag(fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
 
 #ifdef COMPUTE_HLZONES
         // This would need to be modified to take tag parameters
@@ -506,8 +491,7 @@ int Query::Native::abstractFromText(
             if (page < 0)
                 page = 0;
         }
-        LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef <<
-                ": " << frag << endl);
" << page << " Coef: " << entry.coef << ": " << frag << endl); vabs.push_back(Snippet(page, frag).setTerm(entry.term)); if (count++ >= maxtotaloccs) break; @@ -515,4 +499,45 @@ int Query::Native::abstractFromText( return ABSRES_OK | splitter.getretflags(); } +class TermLineSplitter : public TextSplit { +public: + TermLineSplitter(const std::string& term) + : TextSplit(TextSplit::TXTS_NOSPANS), m_term(term) { + } + bool takeword(const std::string& _term, int, int, int) override { + std::string term; + if (o_index_stripchars) { + if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { + LOGINFO("PlainToRich::takeword: unac failed for [" << term << "]\n"); + return true; + } + } + if (term == m_term) { + return false; + } + return true; + } + void newline(int) override { + m_line++; + } + int getline() { + return m_line; + } +private: + int m_line{1}; + std::string m_term; +}; + +int Query::getFirstMatchLine(const Doc &doc, const std::string& term) +{ + int line = 1; + TermLineSplitter splitter(term); + bool ret = splitter.text_to_words(doc.text); + // The splitter takeword() breaks by returning false as soon as the term is found + if (ret == false) { + line = splitter.getline(); + } + return line; +} + } diff --git a/src/rcldb/rclquery.h b/src/rcldb/rclquery.h index cade3650..fd8874d3 100644 --- a/src/rcldb/rclquery.h +++ b/src/rcldb/rclquery.h @@ -115,10 +115,18 @@ public: // Returned as a vector of pair page is 0 if unknown int makeDocAbstract(const Doc &doc, std::vector& abst, int maxoccs= -1, int ctxwords= -1,bool sortbypage=false); - /** Retrieve page number for first match for "significant" query term - * @param term returns the chosen term */ + + /** Choose most interesting term and return the page number for its first match + * @param term returns the chosen term + * @return page number or -1 if term not found or other issue + */ int getFirstMatchPage(const Doc &doc, std::string& term); + /** Compute line number for first match of term. Only works if doc.text has text. + * This uses a text split. Both this and the above getFirstMaxPage() could be done and saved + * while we compute the abstracts, quite a lot of waste here. */ + int getFirstMatchLine(const Doc &doc, const std::string& term); + /** Retrieve a reference to the searchData we are using */ std::shared_ptr getSD() { return m_sd;