Add a callback for textsplit to report line breaks. Use it to look up the first line where a term appears, for use with a %l spec when executing a viewer.
This commit is contained in:
parent 25d6d78902
commit 7b42907441
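The point of the change: a viewer command configured for a document type can now contain a %l placeholder, substituted with the line number of the first match for the query term, so editors and pagers can be opened directly at the hit. A hypothetical viewer spec (the editor and its flag are illustrative, not part of this commit) could look like:

    # Hypothetical entry: open the file in an editor positioned at the
    # first matching line (%l).
    text/plain = nano +%l %f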
@@ -596,6 +596,7 @@ bool TextSplit::text_to_words(const string &in)
    clearsplitstate();

    bool pagepending = false;
    bool nlpending = false;
    bool softhyphenpending = false;

    // Running count of non-alphanum chars. Reset when we see one;
@@ -705,6 +706,10 @@ bool TextSplit::text_to_words(const string &in)
                pagepending = false;
                newpage(m_wordpos);
            }
            if (nlpending) {
                nlpending = false;
                newline(m_wordpos);
            }
            break;

        case WILD:
@@ -745,6 +750,12 @@ bool TextSplit::text_to_words(const string &in)
                break;
            }
        } else {
            // Note about dangling hyphens: we always strip '-' found before whitespace,
            // even before a newline, then generate two terms, before and after the line
            // break. We have no way to know if '-' is there because a word was broken by
            // justification or if it was part of an actual compound word (would need a
            // dictionary to check). As soft-hyphen *should* be used if the '-' is not part
            // of the text.
            if (nextc == -1 || isvisiblewhite(nextc)) {
                goto SPACE;
            }
@@ -844,19 +855,10 @@ bool TextSplit::text_to_words(const string &in)
            break;

        case '\n':
            nlpending = true;
            /* FALLTHROUGH */
        case '\r':
            if (m_span.length() && *m_span.rbegin() == '-') {
                // if '-' is the last char before end of line, we
                // strip it. We have no way to know if this is added
                // because of the line split or if it was part of an
                // actual compound word (would need a dictionary to
                // check). As soft-hyphen *should* be used if the '-'
                // is not part of the text, it is better to properly
                // process a real compound word, and produce wrong
                // output from wrong text. The word-emitting routine
                // will strip the trailing '-'.
                goto SPACE;
            } else if (softhyphenpending) {
            if (softhyphenpending) {
                // Don't reset soft-hyphen
                continue;
            } else {
@@ -73,6 +73,9 @@ public:
     * just don't know about pages. */
    virtual void newpage(int /*pos*/) {}

    /** Called when we encounter newline \n 0x0a. Override to use the event. */
    virtual void newline(int /*pos*/) {}

    // Static utility functions:

    /** Count words in string, as the splitter would generate them */
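Any splitter subclass can now track line positions by overriding the new hook. A minimal sketch of a consumer (LineCounter is a hypothetical illustration, not code from this commit), assuming textsplit.h is included:

    // Records which line each emitted word starts on. text_to_words()
    // drives both callbacks in document order.
    class LineCounter : public TextSplit {
    public:
        bool takeword(const std::string& /*word*/, int /*pos*/, int, int) override {
            m_lastwordline = m_line; // line the latest word started on
            return true;             // true means: keep splitting
        }
        void newline(int /*pos*/) override {
            m_line++;                // invoked once per '\n' in the input
        }
        int m_line{1};
        int m_lastwordline{1};
    };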
@@ -34,6 +34,7 @@
#include "rclmain_w.h"
#include "rclzg.h"
#include "pathut.h"
#include "unacpp.h"

using namespace std;
@@ -42,7 +43,6 @@ static const vector<string> browser_list{
    "opera", "google-chrome", "chromium-browser",
    "palemoon", "iceweasel", "firefox", "konqueror", "epiphany"};


// Start native viewer or preview for input Doc. This is used to allow
// using recoll from another app (e.g. Unity Scope) to view embedded
// result docs (docs with an ipath). We act as a proxy to extract
@@ -155,13 +155,27 @@ void RclMain::openWith(Rcl::Doc doc, string cmdspec)
    execViewer(subs, false, execname, lcmd, cmdspec, doc);
}

void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
static bool pagenumNeeded(const std::string& cmd)
{
    return cmd.find("%p") != std::string::npos;
}
static bool linenumNeeded(const std::string& cmd)
{
    return cmd.find("%l") != std::string::npos;
}
static bool termNeeded(const std::string& cmd)
{
    return cmd.find("%s") != std::string::npos;
}

void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString qterm)
{
    std::string term = qs2utf8s(qterm);
    string apptag;
    doc.getmeta(Rcl::Doc::keyapptg, &apptag);
    LOGDEB("RclMain::startNativeViewer: mtype [" << doc.mimetype <<
           "] apptag [" << apptag << "] page " << pagenum << " term [" <<
           qs2utf8s(term) << "] url [" << doc.url << "] ipath [" <<
           term << "] url [" << doc.url << "] ipath [" <<
           doc.ipath << "]\n");

    // Look for appropriate viewer
@@ -377,19 +391,19 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)

    // If we are not called with a page number (which would happen for a call
    // from the snippets window), see if we can compute a page number anyway.
    if (pagenum == -1) {
        pagenum = 1;
        string lterm;
        if (m_source)
            pagenum = m_source->getFirstMatchPage(doc, lterm);
    if (m_source && pagenum == -1 && (pagenumNeeded(cmd) || termNeeded(cmd) || linenumNeeded(cmd))) {
        pagenum = m_source->getFirstMatchPage(doc, term);
        if (pagenum == -1)
            pagenum = 1;
        else // We get the match term used to compute the page
            term = QString::fromUtf8(lterm.c_str());
    }
    char cpagenum[20];
    sprintf(cpagenum, "%d", pagenum);

    int line = 1;
    if (m_source && !term.empty() && linenumNeeded(cmd)) {
        if (doc.text.empty()) {
            rcldb->getDocRawText(doc);
        }
        line = m_source->getFirstMatchLine(doc, term);
    }

    // Substitute %xx inside arguments
    string efftime;
@@ -408,9 +422,10 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
    subs["f"] = fn;
    subs["F"] = fn;
    subs["i"] = FileInterner::getLastIpathElt(doc.ipath);
    subs["l"] = ulltodecstr(line);
    subs["M"] = doc.mimetype;
    subs["p"] = cpagenum;
    subs["s"] = (const char*)term.toLocal8Bit();
    subs["p"] = ulltodecstr(pagenum);
    subs["s"] = term;
    subs["U"] = url_encode(url);
    subs["u"] = url;
    // Let %(xx) access all metadata.
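For context, these subs entries feed a percent-substitution over the configured viewer command line. A self-contained sketch of the kind of expansion involved (expandPercent is a hypothetical helper written for this note, not Recoll's actual substitution routine, which also supports %(name) metadata keys):

    #include <map>
    #include <string>

    // Expand single-character %x keys from a substitution map; unknown
    // keys are kept verbatim so the command stays inspectable.
    static std::string expandPercent(const std::string& in,
                                     const std::map<char, std::string>& subs)
    {
        std::string out;
        for (std::string::size_type i = 0; i < in.size(); i++) {
            if (in[i] == '%' && i + 1 < in.size()) {
                auto it = subs.find(in[++i]);
                if (it != subs.end()) {
                    out += it->second;
                } else {
                    out += '%';
                    out += in[i];
                }
            } else {
                out += in[i];
            }
        }
        return out;
    }

With subs = {{'l', "42"}, {'f', "/tmp/doc.txt"}}, expandPercent("editor +%l %f", subs) yields "editor +42 /tmp/doc.txt".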
@@ -111,6 +111,9 @@ public:
    virtual int getFirstMatchPage(Rcl::Doc&, std::string&) {
        return -1;
    }
    virtual int getFirstMatchLine(const Rcl::Doc&, const std::string&) {
        return 1;
    }
    /** Get duplicates. */
    virtual bool docDups(const Rcl::Doc&, std::vector<Rcl::Doc>&) {
        return false;
@@ -126,6 +126,17 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
    return -1;
}

int DocSequenceDb::getFirstMatchLine(const Rcl::Doc &doc, const string& term)
{
    std::unique_lock<std::mutex> locker(o_dblock);
    if (!setQuery())
        return false;
    if (m_q->whatDb()) {
        return m_q->getFirstMatchLine(doc, term);
    }
    return 1;
}

list<string> DocSequenceDb::expand(Rcl::Doc &doc)
{
    std::unique_lock<std::mutex> locker(o_dblock);
@@ -43,6 +43,7 @@ public:

    virtual bool getAbstract(Rcl::Doc &doc, std::vector<std::string>&) override;
    virtual int getFirstMatchPage(Rcl::Doc&, std::string& term) override;
    virtual int getFirstMatchLine(const Rcl::Doc&, const std::string& term) override;
    virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups)
        override;
    virtual std::string getDescription() override;
@@ -141,11 +141,9 @@ public:
    // add/update fragment definition.
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
        LOGDEB1("takeword: [" << term << "] bytepos: "<<bts<<":"<<bte<<endl);
        // Limit time taken with monster documents. The resulting
        // abstract will be incorrect or inexistent, but this is
        // better than taking forever (the default cutoff value comes
        // from the snippetMaxPosWalk configuration parameter, and is
        // 10E6)
        // Limit time taken with monster documents. The resulting abstract will be incorrect or
        // inexistent, but this is better than taking forever (the default cutoff value comes from
        // the snippetMaxPosWalk configuration parameter, and is 10E6)
        if (maxtermcount && termcount++ > maxtermcount) {
            LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
                   maxtermcount << endl);
@@ -154,8 +152,7 @@ public:
        }
        // Also limit the number of fragments (just in case safety)
        if (m_fragments.size() > maxtermcount / 100) {
            LOGINF("Rclabsfromtext: stopping because maxfragments reached: "<<
                   maxtermcount/100 << endl);
            LOGINF("Rclabsfromtext: stopping: max fragments count: " << maxtermcount/100 << "\n");
            retflags |= ABSRES_TRUNC;
            return false;
        }
@@ -193,8 +190,7 @@ public:
            m_curterm = term;
            m_curtermcoef = coef;
        } else {
            LOGDEB2("Extending current fragment: " << m_remainingWords <<
                    " -> " << m_ctxwords << endl);
            LOGDEB2("Extending current fragment: "<<m_remainingWords<<" -> "<<m_ctxwords<< "\n");
            m_extcount++;
#ifdef COMPUTE_HLZONES
            if (m_prevwordhit) {
@@ -215,9 +211,8 @@ public:
            m_curfragcoef += coef;
            m_remainingWords = m_ctxwords + 1;
            if (m_extcount > 5) {
                // Limit expansion of contiguous fragments (this is to
                // avoid common terms in search causing long
                // heavyweight meaningless fragments. Also, limit length).
                // Limit expansion of contiguous fragments (this is to avoid common terms in search
                // causing long heavyweight meaningless fragments. Also, limit length).
                m_remainingWords = 1;
                m_extcount = 0;
            }
@@ -247,18 +242,14 @@ public:
            LOGDEB1("FRAGMENT: from byte " << m_curfrag.first <<
                    " to byte " << m_curfrag.second << endl);
            LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr(
                        m_curfrag.first, m_curfrag.second-m_curfrag.first)
                    << "]\n");
            // We used to not push weak fragments if we had a lot
            // already. This can cause problems if the fragments
            // we drop are actually group fragments (which have
            // not got their boost yet). The right cut value is
            // difficult to determine, because the absolute values
            // of the coefs depend on many things (index size,
            // etc.) The old test was if (m_totalcoef < 5.0 ||
            // m_curfragcoef >= 1.0) We now just avoid creating a
            // monster by testing the current fragments count at
            // the top of the function
                        m_curfrag.first, m_curfrag.second-m_curfrag.first) << "]\n");
            // We used to not push weak fragments if we had a lot already. This can cause
            // problems if the fragments we drop are actually group fragments (which have not
            // got their boost yet). The right cut value is difficult to determine, because the
            // absolute values of the coefs depend on many things (index size, etc.) The old
            // test was if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) We now just avoid
            // creating a monster by testing the current fragments count at the top of the
            // function
            m_fragments.push_back(MatchFragment(m_curfrag.first,
                                                m_curfrag.second,
                                                m_curfragcoef,
@@ -298,8 +289,7 @@ public:
            m_curtermcoef = 0.0;
        }

        LOGDEB("TextSplitABS: stored total " << m_fragments.size() <<
               " fragments" << endl);
        LOGDEB("TextSplitABS: stored total " << m_fragments.size() << " fragments" << endl);
        vector<GroupMatchEntry> tboffs;

        // Look for matches to PHRASE and NEAR term groups and finalize
@@ -340,9 +330,8 @@ public:
        }
        auto fragit = m_fragments.begin();
        for (const auto& grpmatch : tboffs) {
            LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first <<
                    "-" << grpmatch.offs.second << " curfrag " <<
                    fragit->start << "-" << fragit->stop << endl);
            LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << "-" <<
                    grpmatch.offs.second<<" curfrag "<<fragit->start<<"-"<<fragit->stop<<"\n");
            while (fragit->stop < grpmatch.offs.first) {
                fragit++;
                if (fragit == m_fragments.end()) {
@@ -417,21 +406,19 @@ int Query::Native::abstractFromText(
    bool sortbypage
    )
{
    (void)chron;
    PRETEND_USE(chron);
    LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n");
    string rawtext;
    if (!ndb->getRawText(docid, rawtext)) {
        LOGDEB0("abstractFromText: can't fetch text\n");
        return ABSRES_ERROR;
    }
    LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " <<
           chron.millis() << "mS\n");
    LOGABS("abstractFromText: got raw text: size "<<rawtext.size()<<" "<<chron.millis()<<"mS\n");

#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
    (defined(RAWTEXT_IN_DATA))
#if 0 && XAPIAN_AT_LEAST(1,3,5)
    // Tryout the Xapian internal method.
    string snippet = xmset.snippet(rawtext);
    LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
    string snippet = xmset.snippet(rawtext, 60);
    std::cerr << "XAPIAN SNIPPET: [" << snippet << "] END SNIPPET\n";
#endif

    // We need the q coefs for individual terms
@@ -452,8 +439,7 @@ int Query::Native::abstractFromText(
    }
    LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n");

    TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords,
                          TextSplit::TXTS_NONE,
    TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords, TextSplit::TXTS_NONE,
                          m_q->m_snipMaxPosWalk);
    splitter.text_to_words(rawtext);
    LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n");
@@ -484,8 +470,7 @@ int Query::Native::abstractFromText(
    // main term and the page positions.
    unsigned int count = 0;
    for (const auto& entry : result) {
        string frag(
            fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
        string frag(fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));

#ifdef COMPUTE_HLZONES
        // This would need to be modified to take tag parameters
@@ -506,8 +491,7 @@ int Query::Native::abstractFromText(
            if (page < 0)
                page = 0;
        }
        LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef <<
                ": " << frag << endl);
        LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef << ": " << frag << endl);
        vabs.push_back(Snippet(page, frag).setTerm(entry.term));
        if (count++ >= maxtotaloccs)
            break;
@@ -515,4 +499,45 @@ int Query::Native::abstractFromText(
    return ABSRES_OK | splitter.getretflags();
}

class TermLineSplitter : public TextSplit {
public:
    TermLineSplitter(const std::string& term)
        : TextSplit(TextSplit::TXTS_NOSPANS), m_term(term) {
    }
    bool takeword(const std::string& _term, int, int, int) override {
        std::string term;
        if (o_index_stripchars) {
            if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
                LOGINFO("PlainToRich::takeword: unac failed for [" << term << "]\n");
                return true;
            }
        }
        if (term == m_term) {
            return false;
        }
        return true;
    }
    void newline(int) override {
        m_line++;
    }
    int getline() {
        return m_line;
    }
private:
    int m_line{1};
    std::string m_term;
};

int Query::getFirstMatchLine(const Doc &doc, const std::string& term)
{
    int line = 1;
    TermLineSplitter splitter(term);
    bool ret = splitter.text_to_words(doc.text);
    // The splitter takeword() breaks by returning false as soon as the term is found
    if (ret == false) {
        line = splitter.getline();
    }
    return line;
}

}
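Taken together, the flow for a %l-capable viewer is: pick the best term and page, fetch the raw text if needed, then split it to find the term's line. A condensed sketch using the names from the hunks above (not literal code from the diff; error handling and variable setup are assumed from startNativeViewer):

    // Condensed flow sketch, not literal code from the diff.
    int pagenum = m_source->getFirstMatchPage(doc, term); // also selects the term
    if (doc.text.empty()) {
        rcldb->getDocRawText(doc);   // getFirstMatchLine() works on doc.text
    }
    int line = m_source->getFirstMatchLine(doc, term);
    subs["l"] = ulltodecstr(line);   // consumed by a %l in the viewer command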
@@ -115,10 +115,18 @@ public:
    // Returned as a vector of pair<page,snippet>. page is 0 if unknown.
    int makeDocAbstract(const Doc &doc, std::vector<Snippet>& abst,
                        int maxoccs = -1, int ctxwords = -1, bool sortbypage = false);
    /** Retrieve page number for first match for "significant" query term
     * @param term returns the chosen term */

    /** Choose most interesting term and return the page number for its first match
     * @param term returns the chosen term
     * @return page number or -1 if term not found or other issue
     */
    int getFirstMatchPage(const Doc &doc, std::string& term);

    /** Compute line number for first match of term. Only works if doc.text has text.
     * This uses a text split. Both this and the above getFirstMatchPage() could be computed
     * and saved while we build the abstracts; quite a lot of waste here. */
    int getFirstMatchLine(const Doc &doc, const std::string& term);

    /** Retrieve a reference to the searchData we are using */
    std::shared_ptr<SearchData> getSD() {
        return m_sd;