Add a callback for textsplit to report line breaks. Use it to implement looking up the first line where a term appears, for use with a %l spec when executing a viewer

This commit is contained in:
Jean-Francois Dockes 2022-01-07 11:43:46 +01:00
parent 25d6d78902
commit 7b42907441
8 changed files with 138 additions and 70 deletions

View File

@ -596,6 +596,7 @@ bool TextSplit::text_to_words(const string &in)
clearsplitstate(); clearsplitstate();
bool pagepending = false; bool pagepending = false;
bool nlpending = false;
bool softhyphenpending = false; bool softhyphenpending = false;
// Running count of non-alphanum chars. Reset when we see one; // Running count of non-alphanum chars. Reset when we see one;
@ -705,6 +706,10 @@ bool TextSplit::text_to_words(const string &in)
pagepending = false; pagepending = false;
newpage(m_wordpos); newpage(m_wordpos);
} }
if (nlpending) {
nlpending = false;
newline(m_wordpos);
}
break; break;
case WILD: case WILD:
@ -745,6 +750,12 @@ bool TextSplit::text_to_words(const string &in)
break; break;
} }
} else { } else {
// Note about dangling hyphens: we always strip '-' found before whitespace,
// even before a newline, then generate two terms, before and after the line
// break. We have no way to know if '-' is there because a word was broken by
// justification or if it was part of an actual compound word (would need a
dictionary to check). A soft-hyphen *should* be used instead if the '-' is not part
of the text.
if (nextc == -1 || isvisiblewhite(nextc)) { if (nextc == -1 || isvisiblewhite(nextc)) {
goto SPACE; goto SPACE;
} }
@ -844,19 +855,10 @@ bool TextSplit::text_to_words(const string &in)
break; break;
case '\n': case '\n':
nlpending = true;
/* FALLTHROUGH */
case '\r': case '\r':
if (m_span.length() && *m_span.rbegin() == '-') { if (softhyphenpending) {
// if '-' is the last char before end of line, we
// strip it. We have no way to know if this is added
// because of the line split or if it was part of an
// actual compound word (would need a dictionary to
// check). As soft-hyphen *should* be used if the '-'
// is not part of the text, it is better to properly
// process a real compound word, and produce wrong
// output from wrong text. The word-emitting routine
// will strip the trailing '-'.
goto SPACE;
} else if (softhyphenpending) {
// Don't reset soft-hyphen // Don't reset soft-hyphen
continue; continue;
} else { } else {

View File

@ -73,6 +73,9 @@ public:
* just don't know about pages. */ * just don't know about pages. */
virtual void newpage(int /*pos*/) {} virtual void newpage(int /*pos*/) {}
/** Called when we encounter newline \n 0x0a. Override to use the event. */
virtual void newline(int /*pos*/) {}
// Static utility functions: // Static utility functions:
/** Count words in string, as the splitter would generate them */ /** Count words in string, as the splitter would generate them */

View File

@ -34,6 +34,7 @@
#include "rclmain_w.h" #include "rclmain_w.h"
#include "rclzg.h" #include "rclzg.h"
#include "pathut.h" #include "pathut.h"
#include "unacpp.h"
using namespace std; using namespace std;
@ -42,7 +43,6 @@ static const vector<string> browser_list{
"opera", "google-chrome", "chromium-browser", "opera", "google-chrome", "chromium-browser",
"palemoon", "iceweasel", "firefox", "konqueror", "epiphany"}; "palemoon", "iceweasel", "firefox", "konqueror", "epiphany"};
// Start native viewer or preview for input Doc. This is used to allow // Start native viewer or preview for input Doc. This is used to allow
// using recoll from another app (e.g. Unity Scope) to view embedded // using recoll from another app (e.g. Unity Scope) to view embedded
// result docs (docs with an ipath). . We act as a proxy to extract // result docs (docs with an ipath). . We act as a proxy to extract
@ -155,13 +155,27 @@ void RclMain::openWith(Rcl::Doc doc, string cmdspec)
execViewer(subs, false, execname, lcmd, cmdspec, doc); execViewer(subs, false, execname, lcmd, cmdspec, doc);
} }
void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term) static bool pagenumNeeded(const std::string& cmd)
{ {
return cmd.find("%p") != std::string::npos;
}
// True if the command template references the %l (line number) placeholder.
static bool linenumNeeded(const std::string& cmd)
{
    return std::string::npos != cmd.find("%l");
}
// True if the command template references the %s (search term) placeholder.
static bool termNeeded(const std::string& cmd)
{
    return std::string::npos != cmd.find("%s");
}
void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString qterm)
{
std::string term = qs2utf8s(qterm);
string apptag; string apptag;
doc.getmeta(Rcl::Doc::keyapptg, &apptag); doc.getmeta(Rcl::Doc::keyapptg, &apptag);
LOGDEB("RclMain::startNativeViewer: mtype [" << doc.mimetype << LOGDEB("RclMain::startNativeViewer: mtype [" << doc.mimetype <<
"] apptag [" << apptag << "] page " << pagenum << " term [" << "] apptag [" << apptag << "] page " << pagenum << " term [" <<
qs2utf8s(term) << "] url [" << doc.url << "] ipath [" << term << "] url [" << doc.url << "] ipath [" <<
doc.ipath << "]\n"); doc.ipath << "]\n");
// Look for appropriate viewer // Look for appropriate viewer
@ -377,19 +391,19 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
// If we are not called with a page number (which would happen for a call // If we are not called with a page number (which would happen for a call
// from the snippets window), see if we can compute a page number anyway. // from the snippets window), see if we can compute a page number anyway.
if (pagenum == -1) { if (m_source && pagenum == -1 && (pagenumNeeded(cmd) || termNeeded(cmd)|| linenumNeeded(cmd))) {
pagenum = 1; pagenum = m_source->getFirstMatchPage(doc, term);
string lterm;
if (m_source)
pagenum = m_source->getFirstMatchPage(doc, lterm);
if (pagenum == -1) if (pagenum == -1)
pagenum = 1; pagenum = 1;
else // We get the match term used to compute the page
term = QString::fromUtf8(lterm.c_str());
} }
char cpagenum[20];
sprintf(cpagenum, "%d", pagenum);
int line = 1;
if (m_source && !term.empty() && linenumNeeded(cmd)) {
if (doc.text.empty()) {
rcldb->getDocRawText(doc);
}
line = m_source->getFirstMatchLine(doc, term);
}
// Substitute %xx inside arguments // Substitute %xx inside arguments
string efftime; string efftime;
@ -408,9 +422,10 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
subs["f"] = fn; subs["f"] = fn;
subs["F"] = fn; subs["F"] = fn;
subs["i"] = FileInterner::getLastIpathElt(doc.ipath); subs["i"] = FileInterner::getLastIpathElt(doc.ipath);
subs["l"] = ulltodecstr(line);
subs["M"] = doc.mimetype; subs["M"] = doc.mimetype;
subs["p"] = cpagenum; subs["p"] = ulltodecstr(pagenum);
subs["s"] = (const char*)term.toLocal8Bit(); subs["s"] = term;
subs["U"] = url_encode(url); subs["U"] = url_encode(url);
subs["u"] = url; subs["u"] = url;
// Let %(xx) access all metadata. // Let %(xx) access all metadata.

View File

@ -111,6 +111,9 @@ public:
virtual int getFirstMatchPage(Rcl::Doc&, std::string&) { virtual int getFirstMatchPage(Rcl::Doc&, std::string&) {
return -1; return -1;
} }
/** Compute the line number of the first match for term in the document text.
 * Default: sequences which cannot compute this return line 1. */
virtual int getFirstMatchLine(const Rcl::Doc&, const std::string&) {
return 1;
}
/** Get duplicates. */ /** Get duplicates. */
virtual bool docDups(const Rcl::Doc&, std::vector<Rcl::Doc>&) { virtual bool docDups(const Rcl::Doc&, std::vector<Rcl::Doc>&) {
return false; return false;

View File

@ -126,6 +126,17 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
return -1; return -1;
} }
// Return the 1-based line number of the first match for term in doc, by
// delegating to the query object. Falls back to line 1 when the query
// cannot be set up or there is no database.
int DocSequenceDb::getFirstMatchLine(const Rcl::Doc &doc, const string& term)
{
    std::unique_lock<std::mutex> locker(o_dblock);
    if (!setQuery()) {
        // Was "return false;" (line 0): this function returns a 1-based
        // line number, so return the documented default instead.
        return 1;
    }
    if (m_q->whatDb()) {
        return m_q->getFirstMatchLine(doc, term);
    }
    return 1;
}
list<string> DocSequenceDb::expand(Rcl::Doc &doc) list<string> DocSequenceDb::expand(Rcl::Doc &doc)
{ {
std::unique_lock<std::mutex> locker(o_dblock); std::unique_lock<std::mutex> locker(o_dblock);

View File

@ -43,6 +43,7 @@ public:
virtual bool getAbstract(Rcl::Doc &doc, std::vector<std::string>&) override; virtual bool getAbstract(Rcl::Doc &doc, std::vector<std::string>&) override;
virtual int getFirstMatchPage(Rcl::Doc&, std::string& term) override; virtual int getFirstMatchPage(Rcl::Doc&, std::string& term) override;
virtual int getFirstMatchLine(const Rcl::Doc&, const std::string& term) override;
virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups) virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups)
override; override;
virtual std::string getDescription() override; virtual std::string getDescription() override;

View File

@ -141,11 +141,9 @@ public:
// add/update fragment definition. // add/update fragment definition.
virtual bool takeword(const std::string& term, int pos, int bts, int bte) { virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
LOGDEB1("takeword: [" << term << "] bytepos: "<<bts<<":"<<bte<<endl); LOGDEB1("takeword: [" << term << "] bytepos: "<<bts<<":"<<bte<<endl);
// Limit time taken with monster documents. The resulting // Limit time taken with monster documents. The resulting abstract will be incorrect or
// abstract will be incorrect or inexistent, but this is // inexistent, but this is better than taking forever (the default cutoff value comes from
// better than taking forever (the default cutoff value comes // the snippetMaxPosWalk configuration parameter, and is 10E6)
// from the snippetMaxPosWalk configuration parameter, and is
// 10E6)
if (maxtermcount && termcount++ > maxtermcount) { if (maxtermcount && termcount++ > maxtermcount) {
LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<< LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
maxtermcount << endl); maxtermcount << endl);
@ -154,8 +152,7 @@ public:
} }
// Also limit the number of fragments (just in case safety) // Also limit the number of fragments (just in case safety)
if (m_fragments.size() > maxtermcount / 100) { if (m_fragments.size() > maxtermcount / 100) {
LOGINF("Rclabsfromtext: stopping because maxfragments reached: "<< LOGINF("Rclabsfromtext: stopping: max fragments count: " << maxtermcount/100 << "\n");
maxtermcount/100 << endl);
retflags |= ABSRES_TRUNC; retflags |= ABSRES_TRUNC;
return false; return false;
} }
@ -193,8 +190,7 @@ public:
m_curterm = term; m_curterm = term;
m_curtermcoef = coef; m_curtermcoef = coef;
} else { } else {
LOGDEB2("Extending current fragment: " << m_remainingWords << LOGDEB2("Extending current fragment: "<<m_remainingWords<<" -> "<<m_ctxwords<< "\n");
" -> " << m_ctxwords << endl);
m_extcount++; m_extcount++;
#ifdef COMPUTE_HLZONES #ifdef COMPUTE_HLZONES
if (m_prevwordhit) { if (m_prevwordhit) {
@ -215,9 +211,8 @@ public:
m_curfragcoef += coef; m_curfragcoef += coef;
m_remainingWords = m_ctxwords + 1; m_remainingWords = m_ctxwords + 1;
if (m_extcount > 5) { if (m_extcount > 5) {
// Limit expansion of contiguous fragments (this is to // Limit expansion of contiguous fragments (this is to avoid common terms in search
// avoid common terms in search causing long // causing long heavyweight meaningless fragments. Also, limit length).
// heavyweight meaningless fragments. Also, limit length).
m_remainingWords = 1; m_remainingWords = 1;
m_extcount = 0; m_extcount = 0;
} }
@ -247,18 +242,14 @@ public:
LOGDEB1("FRAGMENT: from byte " << m_curfrag.first << LOGDEB1("FRAGMENT: from byte " << m_curfrag.first <<
" to byte " << m_curfrag.second << endl); " to byte " << m_curfrag.second << endl);
LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr( LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr(
m_curfrag.first, m_curfrag.second-m_curfrag.first) m_curfrag.first, m_curfrag.second-m_curfrag.first) << "]\n");
<< "]\n"); // We used to not push weak fragments if we had a lot already. This can cause
// We used to not push weak fragments if we had a lot // problems if the fragments we drop are actually group fragments (which have not
// already. This can cause problems if the fragments // got their boost yet). The right cut value is difficult to determine, because the
// we drop are actually group fragments (which have // absolute values of the coefs depend on many things (index size, etc.) The old
// not got their boost yet). The right cut value is // test was if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) We now just avoid
// difficult to determine, because the absolute values // creating a monster by testing the current fragments count at the top of the
// of the coefs depend on many things (index size, // function
// etc.) The old test was if (m_totalcoef < 5.0 ||
// m_curfragcoef >= 1.0) We now just avoid creating a
// monster by testing the current fragments count at
// the top of the function
m_fragments.push_back(MatchFragment(m_curfrag.first, m_fragments.push_back(MatchFragment(m_curfrag.first,
m_curfrag.second, m_curfrag.second,
m_curfragcoef, m_curfragcoef,
@ -298,8 +289,7 @@ public:
m_curtermcoef = 0.0; m_curtermcoef = 0.0;
} }
LOGDEB("TextSplitABS: stored total " << m_fragments.size() << LOGDEB("TextSplitABS: stored total " << m_fragments.size() << " fragments" << endl);
" fragments" << endl);
vector<GroupMatchEntry> tboffs; vector<GroupMatchEntry> tboffs;
// Look for matches to PHRASE and NEAR term groups and finalize // Look for matches to PHRASE and NEAR term groups and finalize
@ -340,9 +330,8 @@ public:
} }
auto fragit = m_fragments.begin(); auto fragit = m_fragments.begin();
for (const auto& grpmatch : tboffs) { for (const auto& grpmatch : tboffs) {
LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << "-" <<
"-" << grpmatch.offs.second << " curfrag " << grpmatch.offs.second<<" curfrag "<<fragit->start<<"-"<<fragit->stop<<"\n");
fragit->start << "-" << fragit->stop << endl);
while (fragit->stop < grpmatch.offs.first) { while (fragit->stop < grpmatch.offs.first) {
fragit++; fragit++;
if (fragit == m_fragments.end()) { if (fragit == m_fragments.end()) {
@ -417,21 +406,19 @@ int Query::Native::abstractFromText(
bool sortbypage bool sortbypage
) )
{ {
(void)chron; PRETEND_USE(chron);
LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n"); LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n");
string rawtext; string rawtext;
if (!ndb->getRawText(docid, rawtext)) { if (!ndb->getRawText(docid, rawtext)) {
LOGDEB0("abstractFromText: can't fetch text\n"); LOGDEB0("abstractFromText: can't fetch text\n");
return ABSRES_ERROR; return ABSRES_ERROR;
} }
LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " << LOGABS("abstractFromText: got raw text: size "<<rawtext.size()<<" "<<chron.millis()<<"mS\n");
chron.millis() << "mS\n");
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \ #if 0 && XAPIAN_AT_LEAST(1,3,5)
(defined(RAWTEXT_IN_DATA))
// Tryout the Xapian internal method. // Tryout the Xapian internal method.
string snippet = xmset.snippet(rawtext); string snippet = xmset.snippet(rawtext, 60);
LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n"); std::cerr << "XAPIAN SNIPPET: [" << snippet << "] END SNIPPET\n";
#endif #endif
// We need the q coefs for individual terms // We need the q coefs for individual terms
@ -452,8 +439,7 @@ int Query::Native::abstractFromText(
} }
LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n"); LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n");
TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords, TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords, TextSplit::TXTS_NONE,
TextSplit::TXTS_NONE,
m_q->m_snipMaxPosWalk); m_q->m_snipMaxPosWalk);
splitter.text_to_words(rawtext); splitter.text_to_words(rawtext);
LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n"); LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n");
@ -484,8 +470,7 @@ int Query::Native::abstractFromText(
// main term and the page positions. // main term and the page positions.
unsigned int count = 0; unsigned int count = 0;
for (const auto& entry : result) { for (const auto& entry : result) {
string frag( string frag(fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
#ifdef COMPUTE_HLZONES #ifdef COMPUTE_HLZONES
// This would need to be modified to take tag parameters // This would need to be modified to take tag parameters
@ -506,8 +491,7 @@ int Query::Native::abstractFromText(
if (page < 0) if (page < 0)
page = 0; page = 0;
} }
LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef << LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef << ": " << frag << endl);
": " << frag << endl);
vabs.push_back(Snippet(page, frag).setTerm(entry.term)); vabs.push_back(Snippet(page, frag).setTerm(entry.term));
if (count++ >= maxtotaloccs) if (count++ >= maxtotaloccs)
break; break;
@ -515,4 +499,45 @@ int Query::Native::abstractFromText(
return ABSRES_OK | splitter.getretflags(); return ABSRES_OK | splitter.getretflags();
} }
class TermLineSplitter : public TextSplit {
public:
TermLineSplitter(const std::string& term)
: TextSplit(TextSplit::TXTS_NOSPANS), m_term(term) {
}
bool takeword(const std::string& _term, int, int, int) override {
std::string term;
if (o_index_stripchars) {
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO("PlainToRich::takeword: unac failed for [" << term << "]\n");
return true;
}
}
if (term == m_term) {
return false;
}
return true;
}
void newline(int) override {
m_line++;
}
int getline() {
return m_line;
}
private:
int m_line{1};
std::string m_term;
};
// Compute the 1-based line number of the first occurrence of term in
// doc.text. Returns 1 when the term is not found (or the text is empty).
int Query::getFirstMatchLine(const Doc &doc, const std::string& term)
{
    TermLineSplitter splitter(term);
    // takeword() aborts the split (text_to_words() returns false) as soon
    // as the term is found, leaving the splitter's line counter on the
    // matching line.
    if (!splitter.text_to_words(doc.text)) {
        return splitter.getline();
    }
    return 1;
}
} }

View File

@ -115,10 +115,18 @@ public:
// Returned as a vector of pair<page,snippet> page is 0 if unknown // Returned as a vector of pair<page,snippet> page is 0 if unknown
int makeDocAbstract(const Doc &doc, std::vector<Snippet>& abst, int makeDocAbstract(const Doc &doc, std::vector<Snippet>& abst,
int maxoccs= -1, int ctxwords= -1,bool sortbypage=false); int maxoccs= -1, int ctxwords= -1,bool sortbypage=false);
/** Retrieve page number for first match for "significant" query term
* @param term returns the chosen term */ /** Choose most interesting term and return the page number for its first match
* @param term returns the chosen term
* @return page number or -1 if term not found or other issue
*/
int getFirstMatchPage(const Doc &doc, std::string& term); int getFirstMatchPage(const Doc &doc, std::string& term);
/** Compute line number for first match of term. Only works if doc.text has text.
* This uses a text split. Both this and the above getFirstMatchPage() could be done and saved
* while we compute the abstracts, quite a lot of waste here. */
int getFirstMatchLine(const Doc &doc, const std::string& term);
/** Retrieve a reference to the searchData we are using */ /** Retrieve a reference to the searchData we are using */
std::shared_ptr<SearchData> getSD() { std::shared_ptr<SearchData> getSD() {
return m_sd; return m_sd;