Add a callback to textsplit for reporting line breaks. Use it to look up the first line where a term appears, for use with a %l spec when executing a viewer.

Jean-Francois Dockes 2022-01-07 11:43:46 +01:00
parent 25d6d78902
commit 7b42907441
8 changed files with 138 additions and 70 deletions
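
With this change, a viewer definition in Recoll's mimeview configuration can pass the line number of the first match to the viewer command through the new %l substitution. A hypothetical entry (the MIME type and command are illustrative, not part of this commit):

[view]
text/plain = gvim +%l %f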

View File

@ -596,6 +596,7 @@ bool TextSplit::text_to_words(const string &in)
clearsplitstate();
bool pagepending = false;
bool nlpending = false;
bool softhyphenpending = false;
// Running count of non-alphanum chars. Reset when we see one;
@ -705,6 +706,10 @@ bool TextSplit::text_to_words(const string &in)
pagepending = false;
newpage(m_wordpos);
}
if (nlpending) {
nlpending = false;
newline(m_wordpos);
}
break;
case WILD:
@ -745,6 +750,12 @@ bool TextSplit::text_to_words(const string &in)
break;
}
} else {
// Note about dangling hyphens: we always strip a '-' found before whitespace,
// even before a newline, and generate two terms, one before and one after the
// line break. We have no way to know whether the '-' is there because a word
// was broken by justification or because it is part of an actual compound
// word (we would need a dictionary to check). Soft-hyphen *should* be used
// instead when the '-' is not part of the text.
if (nextc == -1 || isvisiblewhite(nextc)) {
goto SPACE;
}
@ -844,19 +855,10 @@ bool TextSplit::text_to_words(const string &in)
break;
case '\n':
nlpending = true;
/* FALLTHROUGH */
case '\r':
if (m_span.length() && *m_span.rbegin() == '-') {
// if '-' is the last char before end of line, we
// strip it. We have no way to know if this is added
// because of the line split or if it was part of an
// actual compound word (would need a dictionary to
// check). As soft-hyphen *should* be used if the '-'
// is not part of the text, it is better to properly
// process a real compound word, and produce wrong
// output from wrong text. The word-emitting routine
// will strip the trailing '-'.
goto SPACE;
} else if (softhyphenpending) {
if (softhyphenpending) {
// Don't reset soft-hyphen
continue;
} else {

View File

@ -73,6 +73,9 @@ public:
* just don't know about pages. */
virtual void newpage(int /*pos*/) {}
/** Called when we encounter newline \n 0x0a. Override to use the event. */
virtual void newline(int /*pos*/) {}
// Static utility functions:
/** Count words in string, as the splitter would generate them */
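
A minimal sketch of a client of the new callback (the class name is illustrative; the real consumer added by this commit is TermLineSplitter further down):

#include <string>
#include "textsplit.h"
#include "log.h"

// Hypothetical subclass: track the 1-based line on which each word occurs.
class LineTracker : public TextSplit {
public:
    LineTracker() : TextSplit(TextSplit::TXTS_NOSPANS) {}
    bool takeword(const std::string& w, int pos, int, int) override {
        LOGDEB("word [" << w << "] pos " << pos << " line " << m_line << "\n");
        return true; // returning false would stop text_to_words()
    }
    void newline(int) override {
        m_line++; // called by text_to_words() for each \n encountered
    }
    int line() const { return m_line; }
private:
    int m_line{1};
};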

View File

@ -34,6 +34,7 @@
#include "rclmain_w.h"
#include "rclzg.h"
#include "pathut.h"
#include "unacpp.h"
using namespace std;
@ -42,7 +43,6 @@ static const vector<string> browser_list{
"opera", "google-chrome", "chromium-browser",
"palemoon", "iceweasel", "firefox", "konqueror", "epiphany"};
// Start native viewer or preview for input Doc. This is used to allow
// using recoll from another app (e.g. Unity Scope) to view embedded
// result docs (docs with an ipath). We act as a proxy to extract
@ -155,13 +155,27 @@ void RclMain::openWith(Rcl::Doc doc, string cmdspec)
execViewer(subs, false, execname, lcmd, cmdspec, doc);
}
void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
static bool pagenumNeeded(const std::string& cmd)
{
return cmd.find("%p") != std::string::npos;
}
static bool linenumNeeded(const std::string& cmd)
{
return cmd.find("%l") != std::string::npos;
}
static bool termNeeded(const std::string& cmd)
{
return cmd.find("%s") != std::string::npos;
}
void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString qterm)
{
std::string term = qs2utf8s(qterm);
string apptag;
doc.getmeta(Rcl::Doc::keyapptg, &apptag);
LOGDEB("RclMain::startNativeViewer: mtype [" << doc.mimetype <<
"] apptag [" << apptag << "] page " << pagenum << " term [" <<
qs2utf8s(term) << "] url [" << doc.url << "] ipath [" <<
term << "] url [" << doc.url << "] ipath [" <<
doc.ipath << "]\n");
// Look for appropriate viewer
@ -377,19 +391,19 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
// If we are not called with a page number (which would happen for a call
// from the snippets window), see if we can compute a page number anyway.
if (pagenum == -1) {
pagenum = 1;
string lterm;
if (m_source)
pagenum = m_source->getFirstMatchPage(doc, lterm);
if (m_source && pagenum == -1 && (pagenumNeeded(cmd) || termNeeded(cmd) || linenumNeeded(cmd))) {
pagenum = m_source->getFirstMatchPage(doc, term);
if (pagenum == -1)
pagenum = 1;
else // We get the match term used to compute the page
term = QString::fromUtf8(lterm.c_str());
}
char cpagenum[20];
sprintf(cpagenum, "%d", pagenum);
int line = 1;
if (m_source && !term.empty() && linenumNeeded(cmd)) {
if (doc.text.empty()) {
rcldb->getDocRawText(doc);
}
line = m_source->getFirstMatchLine(doc, term);
}
// Substitute %xx inside arguments
string efftime;
@ -408,9 +422,10 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
subs["f"] = fn;
subs["F"] = fn;
subs["i"] = FileInterner::getLastIpathElt(doc.ipath);
subs["l"] = ulltodecstr(line);
subs["M"] = doc.mimetype;
subs["p"] = cpagenum;
subs["s"] = (const char*)term.toLocal8Bit();
subs["p"] = ulltodecstr(pagenum);
subs["s"] = term;
subs["U"] = url_encode(url);
subs["u"] = url;
// Let %(xx) access all metadata.
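
For illustration, with this map a hypothetical viewer definition such as `gvim +%l %f` would expand to:

gvim +42 /home/me/doc.txt

assuming the chosen term first matches on line 42 (line number and path invented for the example).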

View File

@ -111,6 +111,9 @@ public:
virtual int getFirstMatchPage(Rcl::Doc&, std::string&) {
return -1;
}
virtual int getFirstMatchLine(const Rcl::Doc&, const std::string&) {
return 1;
}
/** Get duplicates. */
virtual bool docDups(const Rcl::Doc&, std::vector<Rcl::Doc>&) {
return false;

View File

@ -126,6 +126,17 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
return -1;
}
int DocSequenceDb::getFirstMatchLine(const Rcl::Doc &doc, const string& term)
{
std::unique_lock<std::mutex> locker(o_dblock);
if (!setQuery())
return 1; // no valid query: default to line 1
if (m_q->whatDb()) {
return m_q->getFirstMatchLine(doc, term);
}
return 1;
}
list<string> DocSequenceDb::expand(Rcl::Doc &doc)
{
std::unique_lock<std::mutex> locker(o_dblock);

View File

@ -43,6 +43,7 @@ public:
virtual bool getAbstract(Rcl::Doc &doc, std::vector<std::string>&) override;
virtual int getFirstMatchPage(Rcl::Doc&, std::string& term) override;
virtual int getFirstMatchLine(const Rcl::Doc&, const std::string& term) override;
virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups)
override;
virtual std::string getDescription() override;

View File

@ -141,11 +141,9 @@ public:
// add/update fragment definition.
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
LOGDEB1("takeword: [" << term << "] bytepos: "<<bts<<":"<<bte<<endl);
// Limit time taken with monster documents. The resulting
// abstract will be incorrect or inexistent, but this is
// better than taking forever (the default cutoff value comes
// from the snippetMaxPosWalk configuration parameter, and is
// 10E6)
// Limit time taken with monster documents. The resulting abstract will be incorrect or
// inexistent, but this is better than taking forever (the default cutoff value comes from
// the snippetMaxPosWalk configuration parameter, and is 10E6)
if (maxtermcount && termcount++ > maxtermcount) {
LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
maxtermcount << endl);
@ -154,8 +152,7 @@ public:
}
// Also limit the number of fragments (a just-in-case safety measure)
if (m_fragments.size() > maxtermcount / 100) {
LOGINF("Rclabsfromtext: stopping because maxfragments reached: "<<
maxtermcount/100 << endl);
LOGINF("Rclabsfromtext: stopping: max fragments count: " << maxtermcount/100 << "\n");
retflags |= ABSRES_TRUNC;
return false;
}
@ -193,8 +190,7 @@ public:
m_curterm = term;
m_curtermcoef = coef;
} else {
LOGDEB2("Extending current fragment: " << m_remainingWords <<
" -> " << m_ctxwords << endl);
LOGDEB2("Extending current fragment: "<<m_remainingWords<<" -> "<<m_ctxwords<< "\n");
m_extcount++;
#ifdef COMPUTE_HLZONES
if (m_prevwordhit) {
@ -215,9 +211,8 @@ public:
m_curfragcoef += coef;
m_remainingWords = m_ctxwords + 1;
if (m_extcount > 5) {
// Limit expansion of contiguous fragments (this is to
// avoid common terms in search causing long
// heavyweight meaningless fragments. Also, limit length).
// Limit expansion of contiguous fragments (this is to avoid common terms in search
// causing long heavyweight meaningless fragments. Also, limit length).
m_remainingWords = 1;
m_extcount = 0;
}
@ -247,18 +242,14 @@ public:
LOGDEB1("FRAGMENT: from byte " << m_curfrag.first <<
" to byte " << m_curfrag.second << endl);
LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr(
m_curfrag.first, m_curfrag.second-m_curfrag.first)
<< "]\n");
// We used to not push weak fragments if we had a lot
// already. This can cause problems if the fragments
// we drop are actually group fragments (which have
// not got their boost yet). The right cut value is
// difficult to determine, because the absolute values
// of the coefs depend on many things (index size,
// etc.) The old test was if (m_totalcoef < 5.0 ||
// m_curfragcoef >= 1.0) We now just avoid creating a
// monster by testing the current fragments count at
// the top of the function
m_curfrag.first, m_curfrag.second-m_curfrag.first) << "]\n");
// We used to not push weak fragments if we had a lot already. This can cause
// problems if the fragments we drop are actually group fragments (which have not
// got their boost yet). The right cut value is difficult to determine, because the
// absolute values of the coefs depend on many things (index size, etc.) The old
// test was if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) We now just avoid
// creating a monster by testing the current fragments count at the top of the
// function
m_fragments.push_back(MatchFragment(m_curfrag.first,
m_curfrag.second,
m_curfragcoef,
@ -298,8 +289,7 @@ public:
m_curtermcoef = 0.0;
}
LOGDEB("TextSplitABS: stored total " << m_fragments.size() <<
" fragments" << endl);
LOGDEB("TextSplitABS: stored total " << m_fragments.size() << " fragments" << endl);
vector<GroupMatchEntry> tboffs;
// Look for matches to PHRASE and NEAR term groups and finalize
@ -340,9 +330,8 @@ public:
}
auto fragit = m_fragments.begin();
for (const auto& grpmatch : tboffs) {
LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first <<
"-" << grpmatch.offs.second << " curfrag " <<
fragit->start << "-" << fragit->stop << endl);
LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << "-" <<
grpmatch.offs.second<<" curfrag "<<fragit->start<<"-"<<fragit->stop<<"\n");
while (fragit->stop < grpmatch.offs.first) {
fragit++;
if (fragit == m_fragments.end()) {
@ -417,21 +406,19 @@ int Query::Native::abstractFromText(
bool sortbypage
)
{
(void)chron;
PRETEND_USE(chron);
LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n");
string rawtext;
if (!ndb->getRawText(docid, rawtext)) {
LOGDEB0("abstractFromText: can't fetch text\n");
return ABSRES_ERROR;
}
LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " <<
chron.millis() << "mS\n");
LOGABS("abstractFromText: got raw text: size "<<rawtext.size()<<" "<<chron.millis()<<"mS\n");
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
(defined(RAWTEXT_IN_DATA))
#if 0 && XAPIAN_AT_LEAST(1,3,5)
// Try out the Xapian internal method.
string snippet = xmset.snippet(rawtext);
LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
string snippet = xmset.snippet(rawtext, 60);
std::cerr << "XAPIAN SNIPPET: [" << snippet << "] END SNIPPET\n";
#endif
// We need the q coefs for individual terms
@ -452,8 +439,7 @@ int Query::Native::abstractFromText(
}
LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n");
TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords,
TextSplit::TXTS_NONE,
TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords, TextSplit::TXTS_NONE,
m_q->m_snipMaxPosWalk);
splitter.text_to_words(rawtext);
LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n");
@ -484,8 +470,7 @@ int Query::Native::abstractFromText(
// main term and the page positions.
unsigned int count = 0;
for (const auto& entry : result) {
string frag(
fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
string frag(fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
#ifdef COMPUTE_HLZONES
// This would need to be modified to take tag parameters
@ -506,8 +491,7 @@ int Query::Native::abstractFromText(
if (page < 0)
page = 0;
}
LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef <<
": " << frag << endl);
LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef << ": " << frag << endl);
vabs.push_back(Snippet(page, frag).setTerm(entry.term));
if (count++ >= maxtotaloccs)
break;
@ -515,4 +499,45 @@ int Query::Native::abstractFromText(
return ABSRES_OK | splitter.getretflags();
}
class TermLineSplitter : public TextSplit {
public:
TermLineSplitter(const std::string& term)
: TextSplit(TextSplit::TXTS_NOSPANS), m_term(term) {
}
bool takeword(const std::string& _term, int, int, int) override {
std::string term(_term); // compare the raw form if the index is not stripped
if (o_index_stripchars) {
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO("TermLineSplitter::takeword: unac failed for [" << _term << "]\n");
return true;
}
}
if (term == m_term) {
return false;
}
return true;
}
void newline(int) override {
m_line++;
}
int getline() {
return m_line;
}
private:
int m_line{1};
std::string m_term;
};
int Query::getFirstMatchLine(const Doc &doc, const std::string& term)
{
int line = 1;
TermLineSplitter splitter(term);
bool ret = splitter.text_to_words(doc.text);
// takeword() stops the split by returning false as soon as the term is found.
if (!ret) {
line = splitter.getline();
}
return line;
}
}

View File

@ -115,10 +115,18 @@ public:
// Returned as a vector of pair<page,snippet> page is 0 if unknown
int makeDocAbstract(const Doc &doc, std::vector<Snippet>& abst,
int maxoccs= -1, int ctxwords= -1,bool sortbypage=false);
/** Retrieve page number for first match for "significant" query term
* @param term returns the chosen term */
/** Choose most interesting term and return the page number for its first match
* @param term returns the chosen term
* @return page number or -1 if term not found or other issue
*/
int getFirstMatchPage(const Doc &doc, std::string& term);
/** Compute the line number of the first match of term. Only works if doc.text holds the text.
* This runs a text split. Both this and getFirstMatchPage() above could be computed and saved
* while we compute the abstracts, so there is quite a lot of waste here. */
int getFirstMatchLine(const Doc &doc, const std::string& term);
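
A usage sketch for the two calls together (illustrative; doc.text must hold the raw document text, e.g. fetched with getDocRawText()):

// Hypothetical caller, mirroring what the GUI does for the %p and %l substitutions.
std::string term;
int page = query.getFirstMatchPage(doc, term); // -1 on failure; also sets term
int line = query.getFirstMatchLine(doc, term); // 1 if the term is not found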
/** Retrieve a reference to the searchData we are using */
std::shared_ptr<SearchData> getSD() {
return m_sd;