This commit is contained in:
Jean-Francois Dockes 2022-01-13 10:18:22 +00:00
commit d2d2cbff14
33 changed files with 314 additions and 191 deletions

View File

@ -13,7 +13,7 @@
PPA_KEYID=7808CE96D38B9201
RCLVERS=1.31.5
RCLVERS=1.31.6
SCOPEVERS=1.20.2.4
GSSPVERS=1.1.1
PPAVERS=1
@ -85,7 +85,7 @@ done
### KIO.
series="bionic focal groovy hirsute impish"
#series="bionic focal hirsute impish"
series=
debdir=debiankio

View File

@ -1,3 +1,9 @@
recoll (1.31.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Almost no change: translation files update.
-- Jean-Francois Dockes <jf@dockes.org> Sat, 20 Dec 2021 09:25:00 +0100
recoll (1.31.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Fix annoying bug in tesseract OCR temporary files cleanup.

View File

@ -1,3 +1,9 @@
kio-recoll (1.31.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Follow recoll version
-- Jean-Francois Dockes <jf@dockes.org> Sat, 20 Dec 2021 09:25:00 +0100
kio-recoll (1.31.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Follow recoll version

View File

@ -1 +1 @@
1.31.5
1.31.6

View File

@ -125,7 +125,7 @@
#define PACKAGE_NAME "Recoll"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "Recoll 1.31.5"
#define PACKAGE_STRING "Recoll 1.31.6"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "recoll"
@ -134,7 +134,7 @@
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.31.5"
#define PACKAGE_VERSION "1.31.6"
/* putenv parameter is const */
/* #undef PUTENV_ARG_CONST */

View File

@ -118,7 +118,7 @@
#define PACKAGE_NAME "Recoll"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "Recoll 1.31.5"
#define PACKAGE_STRING "Recoll 1.31.6"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "recoll"
@ -127,7 +127,7 @@
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.31.5"
#define PACKAGE_VERSION "1.31.6"
/* putenv parameter is const */
/* #undef PUTENV_ARG_CONST */

View File

@ -596,6 +596,7 @@ bool TextSplit::text_to_words(const string &in)
clearsplitstate();
bool pagepending = false;
bool nlpending = false;
bool softhyphenpending = false;
// Running count of non-alphanum chars. Reset when we see one;
@ -705,6 +706,10 @@ bool TextSplit::text_to_words(const string &in)
pagepending = false;
newpage(m_wordpos);
}
if (nlpending) {
nlpending = false;
newline(m_wordpos);
}
break;
case WILD:
@ -745,6 +750,12 @@ bool TextSplit::text_to_words(const string &in)
break;
}
} else {
// Note about dangling hyphens: we always strip '-' found before whitespace,
// even before a newline, then generate two terms, before and after the line
// break. We have no way to know if '-' is there because a word was broken by
// justification or if it was part of an actual compound word (would need a
// dictionary to check). Soft-hyphen *should* be used when the '-' is not part of the
// text, so we prefer to process a real compound word correctly and accept producing
// wrong output from wrong input.
if (nextc == -1 || isvisiblewhite(nextc)) {
goto SPACE;
}
@ -844,19 +855,10 @@ bool TextSplit::text_to_words(const string &in)
break;
case '\n':
nlpending = true;
/* FALLTHROUGH */
case '\r':
if (m_span.length() && *m_span.rbegin() == '-') {
// if '-' is the last char before end of line, we
// strip it. We have no way to know if this is added
// because of the line split or if it was part of an
// actual compound word (would need a dictionary to
// check). As soft-hyphen *should* be used if the '-'
// is not part of the text, it is better to properly
// process a real compound word, and produce wrong
// output from wrong text. The word-emitting routine
// will strip the trailing '-'.
goto SPACE;
} else if (softhyphenpending) {
if (softhyphenpending) {
// Don't reset soft-hyphen
continue;
} else {

View File

@ -73,6 +73,9 @@ public:
* just don't know about pages. */
virtual void newpage(int /*pos*/) {}
/** Called when we encounter newline \n 0x0a. Override to use the event. */
virtual void newline(int /*pos*/) {}
// Static utility functions:
/** Count words in string, as the splitter would generate them */

View File

@ -613,8 +613,7 @@ location before copy, to allow path translation computations. For
example if a dataset originally indexed as '/home/me/mydata/config' has
been mounted to '/media/me/mydata', and the GUI is running from a copied
configuration, orgidxconfdir would be '/home/me/mydata/config', and
curidxconfdir (as set in the copied configuration) would be
'/media/me/mydata/config'.
curidxconfdir (as set in the copied configuration) would be '/media/me/mydata/config'.
.TP
.BI "idxrundir = "dfn
Indexing process current directory. The input

View File

@ -170,13 +170,12 @@ listing either MIME types (e.g. audio/mpeg) or handler names
files. We need to decompress these in a
temporary directory for identification, which can be wasteful in some
cases. Limit the waste. Negative means no limit. 0 results in no
processing of any compressed file. Default 50 MB.
processing of any compressed file. Default 100 MB.
</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.TEXTFILEMAXMBS">
<term><varname>textfilemaxmbs</varname></term>
<listitem><para>Size limit for text
files. Mostly for skipping monster
logs. Default 20 MB.
<listitem><para>Size limit for text files. Mostly for skipping monster logs. Default 20 MB. Use a value of -1 to
disable.
</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.INDEXALLFILENAMES">
<term><varname>indexallfilenames</varname></term>

View File

@ -10,7 +10,7 @@
<link rel="stylesheet" type="text/css" href="docbook-xsl.css">
<meta name="generator" content="DocBook XSL Stylesheets V1.79.1">
<meta name="description" content=
"Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.3 or any later version published by the Free Software Foundation; with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. A copy of the license can be found at the following location: GNU web site. This document introduces full text search notions and describes the installation and use of the Recoll application. This version describes Recoll 1.29.">
"Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.3 or any later version published by the Free Software Foundation; with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. A copy of the license can be found at the following location: GNU web site. This document introduces full text search notions and describes the installation and use of the Recoll application. This version describes Recoll 1.31.">
</head>
<body bgcolor="white" text="black" link="#0000FF" vlink="#840084"
alink="#0000FF">
@ -53,7 +53,7 @@ alink="#0000FF">
and describes the installation and use of the
<span class="application">Recoll</span> application.
This version describes <span class=
"application">Recoll</span> 1.29.</p>
"application">Recoll</span> 1.31.</p>
</div>
</div>
</div>
@ -443,7 +443,7 @@ alink="#0000FF">
<p>This document introduces full text search notions and
describes the installation and use of the <span class=
"application">Recoll</span> application. It is updated for
<span class="application">Recoll</span> 1.29.</p>
<span class="application">Recoll</span> 1.31.</p>
<p><span class="application">Recoll</span> was for a long
time dedicated to Unix-like systems. It was only lately
(2015) ported to <span class="application">MS-Windows</span>.
@ -9169,14 +9169,15 @@ hasextract = False
identification, which can be wasteful in some
cases. Limit the waste. Negative means no limit.
0 results in no processing of any compressed
file. Default 50 MB.</p>
file. Default 100 MB.</p>
</dd>
<dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.TEXTFILEMAXMBS" id=
"RCL.INSTALL.CONFIG.RECOLLCONF.TEXTFILEMAXMBS"></a><span class="term"><code class="varname">textfilemaxmbs</code></span></dt>
<dd>
<p>Size limit for text files. Mostly for skipping
monster logs. Default 20 MB.</p>
monster logs. Default 20 MB. Use a value of -1 to
disable.</p>
</dd>
<dt><a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.INDEXALLFILENAMES"
@ -10757,17 +10758,28 @@ other = rclcat:other
<li class="listitem">
<p><b>%p.&nbsp;</b>Page index. Only significant for
a subset of document types, currently only PDF,
Postscript and DVI files. Can be used to start the
editor at the right page for a match or
snippet.</p>
Postscript and DVI files. If it is set, a
significant term will be chosen in the query, and
%p will be substituted with the first page where
the term appears. Can be used to start the editor
at the right page for a match or snippet.</p>
</li>
<li class="listitem">
<p><b>%l.&nbsp;</b>Line number. Only significant
for document types with relevant line breaks,
mostly text/plain and analogs. If it is set, a
significant term will be chosen in the query, and
%p will be substituted with the first line where
the term appears.</p>
</li>
<li class="listitem">
<p><b>%s.&nbsp;</b>Search term. The value will only
be set for documents with indexed page numbers (ie:
PDF). The value will be one of the matched search
terms. It would allow pre-setting the value in the
"Find" entry inside Evince for example, for easy
highlighting of the term.</p>
be set for documents with indexed page or line
numbers and if %p or %l is also used. The value
will be one of the matched search terms. It would
allow pre-setting the value in the "Find" entry
inside Evince for example, for easy highlighting of
the term.</p>
</li>
<li class="listitem">
<p><b>%u.&nbsp;</b>Url.</p>

View File

@ -5,7 +5,7 @@
<!ENTITY RCL "<application>Recoll</application>">
<!ENTITY RCLAPPS "<ulink url='http://www.recoll.org/pages/features.html#doctypes'>http://www.recoll.org/pages/features.html</ulink>">
<!ENTITY RCLVERSION "1.29">
<!ENTITY RCLVERSION "1.32">
<!ENTITY XAP "<application>Xapian</application>">
<!ENTITY WIN "<application>Windows</application>">
<!ENTITY LIN "<application>Unix</application>-like systems">
@ -7114,28 +7114,37 @@ other = rclcat:other
(possibly a script) to be able to handle it.</para></formalpara>
</listitem>
<listitem><formalpara><title>%M</title>
<para>MIME type</para></formalpara>
<listitem>
<formalpara><title>%M</title><para>MIME type</para></formalpara>
</listitem>
<listitem><formalpara><title>%p</title>
<para>Page index. Only significant for a subset of document
types, currently only PDF, Postscript and DVI files. Can be
used to start the editor at the right page for a match or
snippet.</para></formalpara>
<listitem>
<formalpara><title>%p</title><para>Page index. Only significant for a subset of
document types, currently only PDF, Postscript and DVI files. If it is set, a
significant term will be chosen in the query, and %p will be substituted with the
first page where the term appears. Can be used to start the editor at the right page
for a match or snippet.</para></formalpara>
</listitem>
<listitem><formalpara><title>%s</title>
<para>Search term. The value will only be set for documents
with indexed page numbers (ie: PDF). The value will be one of
the matched search terms. It would allow pre-setting the
value in the "Find" entry inside Evince for example, for easy
highlighting of the term.</para></formalpara>
<listitem>
<formalpara><title>%l</title><para>Line number. Only significant for document
types with relevant line breaks, mostly text/plain and analogs. If it is set, a
significant term will be chosen in the query, and %p will be substituted with the
first line where the term appears.</para></formalpara>
</listitem>
<listitem><formalpara><title>%u</title>
<para>Url.</para></formalpara>
<listitem>
<formalpara><title>%s</title><para>Search term. The value will only be set for
documents with indexed page or line numbers and if %p or %l is also used. The value
will be one of the matched search terms. It would allow pre-setting the value in the
"Find" entry inside Evince for example, for easy highlighting of the
term.</para></formalpara>
</listitem>
<listitem>
<formalpara><title>%u</title><para>Url.</para></formalpara>
</listitem>
</itemizedlist>
<para>In addition to the predefined values above, all strings like

View File

@ -23,7 +23,7 @@ class OrgModeExtractor:
iseof = rclexecm.RclExecM.noteof
if self.currentindex >= len(self.docs) -1:
iseof = rclexecm.RclExecM.eofnext
self.em.setmimetype("text/plain")
self.em.setmimetype("text/x-orgmode-sub")
try:
self.em.setfield("title", docdata.splitlines()[0])
except:

View File

@ -59,8 +59,9 @@ bool runWebFilesMoverScript(RclConfig *config)
static string downloadsdir;
if (downloadsdir.empty()) {
if (!config->getConfParam("webdownloadsdir", downloadsdir)) {
downloadsdir = path_tildexpand("~/Downloads");
downloadsdir = "~/Downloads";
}
downloadsdir = path_tildexpand(downloadsdir);
}
vector<string> cmdvec;
config->pythonCmd("recoll-we-move-files.py", cmdvec);

View File

@ -304,3 +304,14 @@ def stringsToString(vs):
out.append(s)
return " ".join(out)
def valToBool(s):
    """Interpret a configuration value as a boolean.

    Accepts None/empty (False), numeric strings or numbers (non-zero is
    True), and textual values where a leading t/T/y/Y means True
    (e.g. "true", "yes"). Bytes input is decoded as UTF-8 first.

    @param s: value to interpret (str, bytes, number, or None).
    @return: True or False.
    """
    if not s:
        # None, empty string/bytes, 0 ... all mean False.
        return False
    try:
        # Numeric form: any non-zero value is True.
        return int(s) != 0
    except (ValueError, TypeError):
        # Not a number, fall through to textual interpretation.
        pass
    if isinstance(s, bytes):
        s = s.decode("UTF-8")
    return s[0] in "tTyY"

View File

@ -50,7 +50,7 @@ public:
void startElement(const std::string &nm,
const std::map<std::string, std::string>&) override {
std::cerr << "startElement [" << nm << "]\n";
//std::cerr << "startElement [" << nm << "]\n";
currentText.clear();
if (nm == "buttons") {
radio = false;
@ -67,7 +67,7 @@ public:
}
}
void endElement(const std::string& nm) override {
std::cerr << "endElement [" << nm << "]\n";
//std::cerr << "endElement [" << nm << "]\n";
if (nm == "label") {
label = u8s2qs(currentText);
@ -102,7 +102,7 @@ public:
}
}
void characterData(const std::string &str) override {
std::cerr << "characterData [" << str << "]\n";
//std::cerr << "characterData [" << str << "]\n";
currentText += str;
}

View File

@ -34,6 +34,7 @@
#include "rclmain_w.h"
#include "rclzg.h"
#include "pathut.h"
#include "unacpp.h"
using namespace std;
@ -42,7 +43,6 @@ static const vector<string> browser_list{
"opera", "google-chrome", "chromium-browser",
"palemoon", "iceweasel", "firefox", "konqueror", "epiphany"};
// Start native viewer or preview for input Doc. This is used to allow
// using recoll from another app (e.g. Unity Scope) to view embedded
// result docs (docs with an ipath). . We act as a proxy to extract
@ -155,13 +155,27 @@ void RclMain::openWith(Rcl::Doc doc, string cmdspec)
execViewer(subs, false, execname, lcmd, cmdspec, doc);
}
void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
// True if the viewer command template contains the given %-substitution marker.
static bool hasSubstMarker(const std::string& cmd, const char *marker)
{
    return cmd.find(marker) != std::string::npos;
}
// Does the command use the page number substitution (%p)?
static bool pagenumNeeded(const std::string& cmd)
{
    return hasSubstMarker(cmd, "%p");
}
// Does the command use the line number substitution (%l)?
static bool linenumNeeded(const std::string& cmd)
{
    return hasSubstMarker(cmd, "%l");
}
// Does the command use the search term substitution (%s)?
static bool termNeeded(const std::string& cmd)
{
    return hasSubstMarker(cmd, "%s");
}
void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString qterm)
{
std::string term = qs2utf8s(qterm);
string apptag;
doc.getmeta(Rcl::Doc::keyapptg, &apptag);
LOGDEB("RclMain::startNativeViewer: mtype [" << doc.mimetype <<
"] apptag [" << apptag << "] page " << pagenum << " term [" <<
qs2utf8s(term) << "] url [" << doc.url << "] ipath [" <<
term << "] url [" << doc.url << "] ipath [" <<
doc.ipath << "]\n");
// Look for appropriate viewer
@ -377,19 +391,19 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
// If we are not called with a page number (which would happen for a call
// from the snippets window), see if we can compute a page number anyway.
if (pagenum == -1) {
pagenum = 1;
string lterm;
if (m_source)
pagenum = m_source->getFirstMatchPage(doc, lterm);
if (m_source && pagenum == -1 && (pagenumNeeded(cmd) || termNeeded(cmd)|| linenumNeeded(cmd))) {
pagenum = m_source->getFirstMatchPage(doc, term);
if (pagenum == -1)
pagenum = 1;
else // We get the match term used to compute the page
term = QString::fromUtf8(lterm.c_str());
}
char cpagenum[20];
sprintf(cpagenum, "%d", pagenum);
int line = 1;
if (m_source && !term.empty() && linenumNeeded(cmd)) {
if (doc.text.empty()) {
rcldb->getDocRawText(doc);
}
line = m_source->getFirstMatchLine(doc, term);
}
// Substitute %xx inside arguments
string efftime;
@ -408,9 +422,10 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term)
subs["f"] = fn;
subs["F"] = fn;
subs["i"] = FileInterner::getLastIpathElt(doc.ipath);
subs["l"] = ulltodecstr(line);
subs["M"] = doc.mimetype;
subs["p"] = cpagenum;
subs["s"] = (const char*)term.toLocal8Bit();
subs["p"] = ulltodecstr(pagenum);
subs["s"] = term;
subs["U"] = url_encode(url);
subs["u"] = url;
// Let %(xx) access all metadata.

View File

@ -140,8 +140,7 @@ public slots:
virtual void showActionsSearch();
virtual void startPreview(int docnum, Rcl::Doc doc, int keymods);
virtual void startPreview(Rcl::Doc);
virtual void startNativeViewer(Rcl::Doc, int pagenum = -1,
QString term = QString());
virtual void startNativeViewer(Rcl::Doc, int pagenum = -1, QString term = QString());
virtual void openWith(Rcl::Doc, string);
virtual void saveDocToFile(Rcl::Doc);
virtual void previewNextInTab(Preview *, int sid, int docnum);

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2012 J.F.Dockes
/* Copyright (C) 2012-2021 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -67,8 +67,7 @@ using namespace std;
class PlainToRichQtSnippets : public PlainToRich {
public:
virtual string startMatch(unsigned int) {
return string("<span class='rclmatch' style='")
+ qs2utf8s(prefs.qtermstyle) + string("'>");
return string("<span class='rclmatch' style='") + qs2utf8s(prefs.qtermstyle) + string("'>");
}
virtual string endMatch() {
return string("</span>");
@ -82,12 +81,10 @@ void SnippetsW::init()
QPushButton *searchButton = new QPushButton(tr("Search"));
searchButton->setAutoDefault(false);
buttonBox->addButton(searchButton, QDialogButtonBox::ActionRole);
// setWindowFlags(Qt::WindowStaysOnTopHint);
searchFM->hide();
onNewShortcuts();
connect(&SCBase::scBase(), SIGNAL(shortcutsChanged()),
this, SLOT(onNewShortcuts()));
connect(&SCBase::scBase(), SIGNAL(shortcutsChanged()), this, SLOT(onNewShortcuts()));
QPushButton *closeButton = buttonBox->button(QDialogButtonBox::Close);
if (closeButton)
@ -105,11 +102,9 @@ void SnippetsW::init()
browserw = new QWebView(this);
verticalLayout->insertWidget(0, browserw);
browser->setUrl(QUrl(QString::fromUtf8("about:blank")));
connect(browser, SIGNAL(linkClicked(const QUrl &)),
this, SLOT(onLinkClicked(const QUrl &)));
connect(browser, SIGNAL(linkClicked(const QUrl &)), this, SLOT(onLinkClicked(const QUrl &)));
browser->page()->setLinkDelegationPolicy(QWebPage::DelegateAllLinks);
browser->page()->currentFrame()->setScrollBarPolicy(Qt::Horizontal,
Qt::ScrollBarAlwaysOff);
browser->page()->currentFrame()->setScrollBarPolicy(Qt::Horizontal, Qt::ScrollBarAlwaysOff);
QWEBSETTINGS *ws = browser->page()->settings();
if (prefs.reslistfontfamily != "") {
ws->setFontFamily(QWEBSETTINGS::StandardFont, prefs.reslistfontfamily);
@ -136,8 +131,7 @@ void SnippetsW::init()
#else
browserw = new QTextBrowser(this);
verticalLayout->insertWidget(0, browserw);
connect(browser, SIGNAL(anchorClicked(const QUrl &)),
this, SLOT(onLinkClicked(const QUrl &)));
connect(browser, SIGNAL(anchorClicked(const QUrl &)), this, SLOT(onLinkClicked(const QUrl &)));
browser->setReadOnly(true);
browser->setUndoRedoEnabled(false);
browser->setOpenLinks(false);
@ -183,8 +177,7 @@ void SnippetsW::createPopupMenu(const QPoint& pos)
{
QMenu *popup = new QMenu(this);
if (m_sortingByPage) {
popup->addAction(tr("Sort By Relevance"), this,
SLOT(reloadByRelevance()));
popup->addAction(tr("Sort By Relevance"), this, SLOT(reloadByRelevance()));
} else {
popup->addAction(tr("Sort By Page"), this, SLOT(reloadByPage()));
}
@ -230,29 +223,22 @@ void SnippetsW::onSetDoc(Rcl::Doc doc, std::shared_ptr<DocSequence> source)
source->getTerms(hdata);
ostringstream oss;
oss <<
"<html><head>"
"<meta http-equiv=\"content-type\" "
"content=\"text/html; charset=utf-8\">";
oss << "<html><head>"
"<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">";
oss << "<style type=\"text/css\">\nbody,table,select,input {\n";
oss << "color: " + qs2utf8s(prefs.fontcolor) + ";\n";
oss << "}\n</style>\n";
oss << qs2utf8s(prefs.darkreslistheadertext) << qs2utf8s(prefs.reslistheadertext);
oss <<
"</head>"
"<body>"
"<table class=\"snippets\">"
;
oss << "</head><body><table class=\"snippets\">";
g_hiliter.set_inputhtml(false);
bool nomatch = true;
for (const auto& snippet : vpabs) {
if (snippet.page == -1) {
oss << "<tr><td colspan=\"2\">" <<
snippet.snippet << "</td></tr>" << endl;
oss << "<tr><td colspan=\"2\">" << snippet.snippet << "</td></tr>" << "\n";
continue;
}
list<string> lr;
@ -263,13 +249,12 @@ void SnippetsW::onSetDoc(Rcl::Doc doc, std::shared_ptr<DocSequence> source)
nomatch = false;
oss << "<tr><td>";
if (snippet.page > 0) {
oss << "<a href=\"http://h/P" << snippet.page << "T" <<
snippet.term << "\">"
<< "P.&nbsp;" << snippet.page << "</a>";
oss << "<a href=\"http://h/P" << snippet.page << "T" << snippet.term << "\">" <<
"P.&nbsp;" << snippet.page << "</a>";
}
oss << "</td><td>" << lr.front().c_str() << "</td></tr>" << endl;
oss << "</td><td>" << lr.front().c_str() << "</td></tr>" << "\n";
}
oss << "</table>" << endl;
oss << "</table>" << "\n";
if (nomatch) {
oss.str("<html><head></head><body>\n");
oss << qs2utf8s(tr("<p>Sorry, no exact match was found within limits. "
@ -278,12 +263,12 @@ void SnippetsW::onSetDoc(Rcl::Doc doc, std::shared_ptr<DocSequence> source)
}
oss << "\n</body></html>";
#if defined(USING_WEBKIT) || defined(USING_WEBENGINE)
browser->setHtml(QString::fromUtf8(oss.str().c_str()));
browser->setHtml(u8s2qs(oss.str()));
#else
browser->clear();
browser->append(".");
browser->clear();
browser->insertHtml(QString::fromUtf8(oss.str().c_str()));
browser->insertHtml(u8s2qs(oss.str()));
browser->moveCursor (QTextCursor::Start);
browser->ensureCursorVisible();
#endif
@ -354,8 +339,7 @@ void SnippetsW::onLinkClicked(const QUrl &url)
string term;
if (termpos != string::npos)
term = ascurl.substr(termpos+1);
emit startNativeViewer(m_doc, page,
QString::fromUtf8(term.c_str()));
emit startNativeViewer(m_doc, page, u8s2qs(term));
return;
}
}

View File

@ -111,6 +111,9 @@ public:
virtual int getFirstMatchPage(Rcl::Doc&, std::string&) {
return -1;
}
    /** Return the line number of the first match for the term inside the
     * document text. Base implementation: no line information available,
     * point to the top of the document (line 1). Override where the
     * sequence can compute a real line number. */
    virtual int getFirstMatchLine(const Rcl::Doc&, const std::string&) {
        return 1;
    }
/** Get duplicates. */
virtual bool docDups(const Rcl::Doc&, std::vector<Rcl::Doc>&) {
return false;

View File

@ -126,6 +126,17 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term)
return -1;
}
// Return the 1-based line number of the first match for term inside the
// document text, by delegating to the query object. Returns 1 (top of
// document) when the line cannot be determined.
int DocSequenceDb::getFirstMatchLine(const Rcl::Doc &doc, const string& term)
{
    std::unique_lock<std::mutex> locker(o_dblock);
    if (!setQuery()) {
        // Was 'return false': wrong type for an int line number (would yield
        // line 0). Use the same "no information" default as the other paths.
        return 1;
    }
    if (m_q->whatDb()) {
        return m_q->getFirstMatchLine(doc, term);
    }
    return 1;
}
list<string> DocSequenceDb::expand(Rcl::Doc &doc)
{
std::unique_lock<std::mutex> locker(o_dblock);

View File

@ -43,6 +43,7 @@ public:
virtual bool getAbstract(Rcl::Doc &doc, std::vector<std::string>&) override;
virtual int getFirstMatchPage(Rcl::Doc&, std::string& term) override;
virtual int getFirstMatchLine(const Rcl::Doc&, const std::string& term) override;
virtual bool docDups(const Rcl::Doc& doc, std::vector<Rcl::Doc>& dups)
override;
virtual std::string getDescription() override;

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2004 J.F.Dockes
/* Copyright (C) 2004-2021 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -60,8 +60,7 @@ public:
* @param in raw text out of internfile.
* @param out rich text output, divided in chunks (to help our caller
* avoid inserting half tags into textedit which doesnt like it)
* @param in hdata terms and groups to be highlighted. These are
* lowercase and unaccented.
* @param in hdata terms and groups to be highlighted. See utils/hldata.h
* @param chunksize max size of chunks in output list
*/
virtual bool plaintorich(const std::string &in, std::list<std::string> &out,

View File

@ -141,11 +141,9 @@ public:
// add/update fragment definition.
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
LOGDEB1("takeword: [" << term << "] bytepos: "<<bts<<":"<<bte<<endl);
// Limit time taken with monster documents. The resulting
// abstract will be incorrect or inexistent, but this is
// better than taking forever (the default cutoff value comes
// from the snippetMaxPosWalk configuration parameter, and is
// 10E6)
// Limit time taken with monster documents. The resulting abstract will be incorrect or
// inexistent, but this is better than taking forever (the default cutoff value comes from
// the snippetMaxPosWalk configuration parameter, and is 10E6)
if (maxtermcount && termcount++ > maxtermcount) {
LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<<
maxtermcount << endl);
@ -154,8 +152,7 @@ public:
}
// Also limit the number of fragments (just in case safety)
if (m_fragments.size() > maxtermcount / 100) {
LOGINF("Rclabsfromtext: stopping because maxfragments reached: "<<
maxtermcount/100 << endl);
LOGINF("Rclabsfromtext: stopping: max fragments count: " << maxtermcount/100 << "\n");
retflags |= ABSRES_TRUNC;
return false;
}
@ -193,8 +190,7 @@ public:
m_curterm = term;
m_curtermcoef = coef;
} else {
LOGDEB2("Extending current fragment: " << m_remainingWords <<
" -> " << m_ctxwords << endl);
LOGDEB2("Extending current fragment: "<<m_remainingWords<<" -> "<<m_ctxwords<< "\n");
m_extcount++;
#ifdef COMPUTE_HLZONES
if (m_prevwordhit) {
@ -215,9 +211,8 @@ public:
m_curfragcoef += coef;
m_remainingWords = m_ctxwords + 1;
if (m_extcount > 5) {
// Limit expansion of contiguous fragments (this is to
// avoid common terms in search causing long
// heavyweight meaningless fragments. Also, limit length).
// Limit expansion of contiguous fragments (this is to avoid common terms in search
// causing long heavyweight meaningless fragments. Also, limit length).
m_remainingWords = 1;
m_extcount = 0;
}
@ -247,18 +242,14 @@ public:
LOGDEB1("FRAGMENT: from byte " << m_curfrag.first <<
" to byte " << m_curfrag.second << endl);
LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr(
m_curfrag.first, m_curfrag.second-m_curfrag.first)
<< "]\n");
// We used to not push weak fragments if we had a lot
// already. This can cause problems if the fragments
// we drop are actually group fragments (which have
// not got their boost yet). The right cut value is
// difficult to determine, because the absolute values
// of the coefs depend on many things (index size,
// etc.) The old test was if (m_totalcoef < 5.0 ||
// m_curfragcoef >= 1.0) We now just avoid creating a
// monster by testing the current fragments count at
// the top of the function
m_curfrag.first, m_curfrag.second-m_curfrag.first) << "]\n");
// We used to not push weak fragments if we had a lot already. This can cause
// problems if the fragments we drop are actually group fragments (which have not
// got their boost yet). The right cut value is difficult to determine, because the
// absolute values of the coefs depend on many things (index size, etc.) The old
// test was if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) We now just avoid
// creating a monster by testing the current fragments count at the top of the
// function
m_fragments.push_back(MatchFragment(m_curfrag.first,
m_curfrag.second,
m_curfragcoef,
@ -298,8 +289,7 @@ public:
m_curtermcoef = 0.0;
}
LOGDEB("TextSplitABS: stored total " << m_fragments.size() <<
" fragments" << endl);
LOGDEB("TextSplitABS: stored total " << m_fragments.size() << " fragments" << endl);
vector<GroupMatchEntry> tboffs;
// Look for matches to PHRASE and NEAR term groups and finalize
@ -340,9 +330,8 @@ public:
}
auto fragit = m_fragments.begin();
for (const auto& grpmatch : tboffs) {
LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first <<
"-" << grpmatch.offs.second << " curfrag " <<
fragit->start << "-" << fragit->stop << endl);
LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << "-" <<
grpmatch.offs.second<<" curfrag "<<fragit->start<<"-"<<fragit->stop<<"\n");
while (fragit->stop < grpmatch.offs.first) {
fragit++;
if (fragit == m_fragments.end()) {
@ -417,21 +406,19 @@ int Query::Native::abstractFromText(
bool sortbypage
)
{
(void)chron;
PRETEND_USE(chron);
LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n");
string rawtext;
if (!ndb->getRawText(docid, rawtext)) {
LOGDEB0("abstractFromText: can't fetch text\n");
return ABSRES_ERROR;
}
LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " <<
chron.millis() << "mS\n");
LOGABS("abstractFromText: got raw text: size "<<rawtext.size()<<" "<<chron.millis()<<"mS\n");
#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
(defined(RAWTEXT_IN_DATA))
#if 0 && XAPIAN_AT_LEAST(1,3,5)
// Tryout the Xapian internal method.
string snippet = xmset.snippet(rawtext);
LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
string snippet = xmset.snippet(rawtext, 60);
std::cerr << "XAPIAN SNIPPET: [" << snippet << "] END SNIPPET\n";
#endif
// We need the q coefs for individual terms
@ -452,8 +439,7 @@ int Query::Native::abstractFromText(
}
LOGABS("abstractFromText: getterms: " << chron.millis() << "mS\n");
TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords,
TextSplit::TXTS_NONE,
TextSplitABS splitter(rawtext, matchTerms, hld, wordcoefs, ctxwords, TextSplit::TXTS_NONE,
m_q->m_snipMaxPosWalk);
splitter.text_to_words(rawtext);
LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n");
@ -484,8 +470,7 @@ int Query::Native::abstractFromText(
// main term and the page positions.
unsigned int count = 0;
for (const auto& entry : result) {
string frag(
fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
string frag(fixfrag(rawtext.substr(entry.start, entry.stop - entry.start)));
#ifdef COMPUTE_HLZONES
// This would need to be modified to take tag parameters
@ -506,8 +491,7 @@ int Query::Native::abstractFromText(
if (page < 0)
page = 0;
}
LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef <<
": " << frag << endl);
LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef << ": " << frag << endl);
vabs.push_back(Snippet(page, frag).setTerm(entry.term));
if (count++ >= maxtotaloccs)
break;
@ -515,4 +499,45 @@ int Query::Native::abstractFromText(
return ABSRES_OK | splitter.getretflags();
}
class TermLineSplitter : public TextSplit {
public:
TermLineSplitter(const std::string& term)
: TextSplit(TextSplit::TXTS_NOSPANS), m_term(term) {
}
bool takeword(const std::string& _term, int, int, int) override {
std::string term;
if (o_index_stripchars) {
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
LOGINFO("PlainToRich::takeword: unac failed for [" << term << "]\n");
return true;
}
}
if (term == m_term) {
return false;
}
return true;
}
void newline(int) override {
m_line++;
}
int getline() {
return m_line;
}
private:
int m_line{1};
std::string m_term;
};
int Query::getFirstMatchLine(const Doc &doc, const std::string& term)
{
int line = 1;
TermLineSplitter splitter(term);
bool ret = splitter.text_to_words(doc.text);
// The splitter takeword() breaks by returning false as soon as the term is found
if (ret == false) {
line = splitter.getline();
}
return line;
}
}

View File

@ -254,7 +254,7 @@ double Query::Native::qualityTerms(Xapian::docid docid,
}
// Return page number for first match of "significant" term.
// Choose most interesting term and return the page number for its first match
int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
{
LOGDEB("Query::Native::getFirstMatchPage\n");
@ -286,9 +286,7 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
qualityTerms(docid, terms, byQ);
for (auto mit = byQ.rbegin(); mit != byQ.rend(); mit++) {
for (vector<string>::const_iterator qit = mit->second.begin();
qit != mit->second.end(); qit++) {
string qterm = *qit;
for (const auto& qterm : mit->second) {
Xapian::PositionIterator pos;
string emptys;
try {
@ -619,9 +617,8 @@ int Query::Native::abstractFromIndex(
// possibly retried by our caller.
//
// @param[out] vabs the abstract is returned as a vector of snippets.
int Query::Native::makeAbstract(Xapian::docid docid,
vector<Snippet>& vabs,
int imaxoccs, int ictxwords, bool sortbypage)
int Query::Native::makeAbstract(
Xapian::docid docid, vector<Snippet>& vabs, int imaxoccs, int ictxwords, bool sortbypage)
{
chron.restart();
LOGDEB("makeAbstract: docid " << docid << " imaxoccs " <<

View File

@ -96,10 +96,13 @@ const string pathelt_prefix = "XP";
static const string udi_prefix("Q");
static const string parent_prefix("F");
// Special terms to mark begin/end of field (for anchored searches), and
// page breaks
// Special terms to mark begin/end of field (for anchored searches).
string start_of_field_term;
string end_of_field_term;
// Special term for page breaks. Note that we use a complicated mechanism for multiple page
// breaks at the same position, when it would have been probably simpler to use XXPG/n terms
// instead (did not try to implement though). A change would force users to reindex.
const string page_break_term = "XXPG/";
// Special term to mark documents with children.
@ -1846,16 +1849,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
}
}
// If empty pages (multiple break at same pos) were recorded, save
// them (this is because we have no way to record them in the
// Xapian list
// If empty pages (multiple break at same pos) were recorded, save them (this is
// because we have no way to record them in the Xapian list)
if (!tpidx.m_pageincrvec.empty()) {
ostringstream multibreaks;
for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
if (i != 0)
multibreaks << ",";
multibreaks << tpidx.m_pageincrvec[i].first << "," <<
tpidx.m_pageincrvec[i].second;
multibreaks << tpidx.m_pageincrvec[i].first << "," << tpidx.m_pageincrvec[i].second;
}
RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
}

View File

@ -360,7 +360,6 @@ int Query::getFirstMatchPage(const Doc &doc, string& term)
return m_reason.empty() ? pagenum : -1;
}
// Mset size
// Note: times for retrieving (multiple times) all docs from a sample
// 25k docs db (q: mime:*)
@ -511,8 +510,7 @@ vector<string> Query::expand(const Doc &doc)
Xapian::ESet eset = m_nq->xenquire->get_eset(20, rset, false);
LOGDEB("ESet terms:\n");
// We filter out the special terms
for (Xapian::ESetIterator it = eset.begin();
it != eset.end(); it++) {
for (Xapian::ESetIterator it = eset.begin(); it != eset.end(); it++) {
LOGDEB(" [" << (*it) << "]\n");
if ((*it).empty() || has_prefix(*it))
continue;

View File

@ -115,10 +115,18 @@ public:
// Returned as a vector of pair<page,snippet> page is 0 if unknown
int makeDocAbstract(const Doc &doc, std::vector<Snippet>& abst,
int maxoccs= -1, int ctxwords= -1,bool sortbypage=false);
/** Retrieve page number for first match for "significant" query term
* @param term returns the chosen term */
/** Choose most interesting term and return the page number for its first match
* @param term returns the chosen term
* @return page number or -1 if term not found or other issue
*/
int getFirstMatchPage(const Doc &doc, std::string& term);
/** Compute line number for first match of term. Only works if doc.text has text.
     * This uses a text split. Both this and the above getFirstMatchPage() could be done and saved
* while we compute the abstracts, quite a lot of waste here. */
int getFirstMatchLine(const Doc &doc, const std::string& term);
/** Retrieve a reference to the searchData we are using */
std::shared_ptr<SearchData> getSD() {
return m_sd;

View File

@ -225,6 +225,7 @@ text/x-lua = internal
text/x-mail = internal
text/x-man = exec rclman;maxseconds=30
text/x-orgmode = execm rclorgmode.py
text/x-orgmode-sub = internal text/plain
text/x-perl = internal text/plain
text/x-purple-html-log = internal text/html
text/x-purple-log = exec rclpurple
@ -359,6 +360,7 @@ text/x-java = source
text/x-lua = source
text/x-mail = message
text/x-man = document
text/x-orgmode = document
text/x-perl = source
text/x-php = source
text/x-purple-html-log = pidgin
@ -443,6 +445,7 @@ text = \
text/x-ini \
text/x-java \
text/x-man \
text/x-orgmode \
text/x-perl \
text/x-php \
text/x-python \

View File

@ -214,12 +214,13 @@ nomd5types = rclaudio
# files.</brief><descr>We need to decompress these in a
# temporary directory for identification, which can be wasteful in some
# cases. Limit the waste. Negative means no limit. 0 results in no
# processing of any compressed file. Default 50 MB.</descr></var>
# processing of any compressed file. Default 100 MB.</descr></var>
compressedfilemaxkbs = 100000
# <var name="textfilemaxmbs" type="int"><brief>Size limit for text
# files.</brief><descr>Mostly for skipping monster
# logs. Default 20 MB.</descr></var>
# <var name="textfilemaxmbs" type="int">
# <brief>Size limit for text files.</brief>
# <descr>Mostly for skipping monster logs. Default 20 MB. Use a value of -1 to
# disable.</descr></var>
textfilemaxmbs = 20
# <var name="indexallfilenames" type="bool"><brief>Index the file names of

View File

@ -619,11 +619,17 @@ static const string& thumbnailsdir()
return thumbnailsd;
}
// Place for 1024x1024 files
static const string thmbdirxxlarge = "xx-large";
// Place for 512x512 files
static const string thmbdirxlarge = "x-large";
// Place for 256x256 files
static const string thmbdirlarge = "large";
// 128x128
static const string thmbdirnormal = "normal";
static const vector<string> thmbdirs{thmbdirxxlarge, thmbdirxlarge, thmbdirlarge, thmbdirnormal};
static void thumbname(const string& url, string& name)
{
string digest;
@ -635,26 +641,47 @@ static void thumbname(const string& url, string& name)
bool thumbPathForUrl(const string& url, int size, string& path)
{
string name;
string name, path128, path256, path512, path1024;
thumbname(url, name);
if (size <= 128) {
path = path_cat(thumbnailsdir(), thmbdirnormal);
path = path_cat(path, name);
path128 = path;
} else if (size <= 256) {
path = path_cat(thumbnailsdir(), thmbdirlarge);
path = path_cat(path, name);
path256 = path;
} else if (size <= 512) {
path = path_cat(thumbnailsdir(), thmbdirxlarge);
path = path_cat(path, name);
path512 = path;
} else {
path = path_cat(thumbnailsdir(), thmbdirxxlarge);
path = path_cat(path, name);
path1024 = path;
}
if (access(path.c_str(), R_OK) == 0) {
return true;
}
// Not found in requested size. Try to find any size and return it. Let the client scale.
for (const auto& tdir : thmbdirs) {
path = path_cat(thumbnailsdir(), tdir);
path = path_cat(path, name);
if (access(path.c_str(), R_OK) == 0) {
return true;
}
}
path = path_cat(thumbnailsdir(), thmbdirlarge);
path = path_cat(path, name);
if (access(path.c_str(), R_OK) == 0) {
return true;
}
// File does not exist. Path corresponds to the large version at this point,
// fix it if needed.
// File does not exist. Return appropriate path anyway.
if (size <= 128) {
path = path_cat(path_home(), thmbdirnormal);
path = path_cat(path, name);
path = path128;
} else if (size <= 256) {
path = path256;
} else if (size <= 512) {
path = path512;
} else {
path = path1024;
}
return false;
}

View File

@ -318,7 +318,7 @@ template <class T> void stringsToCSV(const T& tokens, string& s, char sep)
s.append(1, sep);
}
// Remove last separator.
if (s.size())
if (!s.empty())
s.pop_back();
}
@ -951,7 +951,7 @@ bool parsedateinterval(const string& s, DateInterval *dip)
return false;
}
vector<string>::const_iterator it = vs.begin();
auto it = vs.cbegin();
if (*it == "P" || *it == "p") {
it++;
if (!parseperiod(it, vs.end(), &p1)) {
@ -1221,7 +1221,7 @@ std::string SimpleRegexp::simpleSub(
const std::string& in, const std::string& repl)
{
if (!ok()) {
return std::string();
return {};
}
int err;
@ -1256,7 +1256,7 @@ bool SimpleRegexp::simpleMatch(const string& val) const
string SimpleRegexp::getMatch(const string& val, int i) const
{
if (i > m->nmatch) {
return string();
return {};
}
return val.substr(m->matches[i].rm_so,
m->matches[i].rm_eo - m->matches[i].rm_so);

View File

@ -187,6 +187,7 @@ text/x-csv = internal text/plain
text/x-fictionbook = internal xsltproc fb2.xsl
text/x-ini = internal text/plain
text/x-mail = internal
text/x-orgmode = execm python rclorgmode.py
text/x-perl = internal text/plain
text/x-python = execm python rclpython.py
text/x-shellscript = internal text/plain
@ -291,6 +292,7 @@ text/x-html-sidux-man = sidux-book
text/x-ini = txt
text/x-mail = message
text/x-man = document
text/x-orgmode = document
text/x-perl = source
text/x-purple-html-log = pidgin
text/x-purple-log = pidgin
@ -359,6 +361,7 @@ text = \
text/x-html-sidux-man \
text/x-ini \
text/x-man \
text/x-orgmode \
text/x-perl \
text/x-python \
text/x-shellscript