diff --git a/packaging/debian/buildppa.sh b/packaging/debian/buildppa.sh index 1e330c10..5b59a6ef 100644 --- a/packaging/debian/buildppa.sh +++ b/packaging/debian/buildppa.sh @@ -13,7 +13,7 @@ PPA_KEYID=7808CE96D38B9201 -RCLVERS=1.31.5 +RCLVERS=1.31.6 SCOPEVERS=1.20.2.4 GSSPVERS=1.1.1 PPAVERS=1 @@ -85,7 +85,7 @@ done ### KIO. -series="bionic focal groovy hirsute impish" +#series="bionic focal hirsute impish" series= debdir=debiankio diff --git a/packaging/debian/debian/changelog b/packaging/debian/debian/changelog index 5de457f6..b09b3137 100644 --- a/packaging/debian/debian/changelog +++ b/packaging/debian/debian/changelog @@ -1,3 +1,9 @@ +recoll (1.31.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low + + * Almost no change: translation files update. + + -- Jean-Francois Dockes Sat, 20 Dec 2021 09:25:00 +0100 + recoll (1.31.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low * Fix ennoying bug in tesseract OCR temporary files cleanup. diff --git a/packaging/debian/debiankio/changelog b/packaging/debian/debiankio/changelog index 10b3fd00..3960caad 100644 --- a/packaging/debian/debiankio/changelog +++ b/packaging/debian/debiankio/changelog @@ -1,3 +1,9 @@ +kio-recoll (1.31.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low + + * Follow + + -- Jean-Francois Dockes Sat, 20 Dec 2021 09:25:00 +0100 + kio-recoll (1.31.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low * Follow recoll version diff --git a/src/RECOLL-VERSION.txt b/src/RECOLL-VERSION.txt index 490b76e4..deade24a 100644 --- a/src/RECOLL-VERSION.txt +++ b/src/RECOLL-VERSION.txt @@ -1 +1 @@ -1.31.5 +1.31.6 diff --git a/src/common/autoconfig-mac.h b/src/common/autoconfig-mac.h index 71d49a01..4d452f7a 100644 --- a/src/common/autoconfig-mac.h +++ b/src/common/autoconfig-mac.h @@ -125,7 +125,7 @@ #define PACKAGE_NAME "Recoll" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Recoll 1.31.5" +#define PACKAGE_STRING "Recoll 1.31.6" /* Define to the one symbol short name of this package. 
*/ #define PACKAGE_TARNAME "recoll" @@ -134,7 +134,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.31.5" +#define PACKAGE_VERSION "1.31.6" /* putenv parameter is const */ /* #undef PUTENV_ARG_CONST */ diff --git a/src/common/autoconfig-win.h b/src/common/autoconfig-win.h index b7e8f475..d1852cf8 100644 --- a/src/common/autoconfig-win.h +++ b/src/common/autoconfig-win.h @@ -118,7 +118,7 @@ #define PACKAGE_NAME "Recoll" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Recoll 1.31.5" +#define PACKAGE_STRING "Recoll 1.31.6" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "recoll" @@ -127,7 +127,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.31.5" +#define PACKAGE_VERSION "1.31.6" /* putenv parameter is const */ /* #undef PUTENV_ARG_CONST */ diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 49b234c0..be61c917 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -596,6 +596,7 @@ bool TextSplit::text_to_words(const string &in) clearsplitstate(); bool pagepending = false; + bool nlpending = false; bool softhyphenpending = false; // Running count of non-alphanum chars. Reset when we see one; @@ -705,6 +706,10 @@ bool TextSplit::text_to_words(const string &in) pagepending = false; newpage(m_wordpos); } + if (nlpending) { + nlpending = false; + newline(m_wordpos); + } break; case WILD: @@ -745,6 +750,12 @@ bool TextSplit::text_to_words(const string &in) break; } } else { + // Note about dangling hyphens: we always strip '-' found before whitespace, + // even before a newline, then generate two terms, before and after the line + // break. We have no way to know if '-' is there because a word was broken by + // justification or if it was part of an actual compound word (would need a + // dictionary to check). 
As soft-hyphen *should* be used if the '-' is not part + // of the text. if (nextc == -1 || isvisiblewhite(nextc)) { goto SPACE; } @@ -844,19 +855,10 @@ bool TextSplit::text_to_words(const string &in) break; case '\n': + nlpending = true; + /* FALLTHROUGH */ case '\r': - if (m_span.length() && *m_span.rbegin() == '-') { - // if '-' is the last char before end of line, we - // strip it. We have no way to know if this is added - // because of the line split or if it was part of an - // actual compound word (would need a dictionary to - // check). As soft-hyphen *should* be used if the '-' - // is not part of the text, it is better to properly - // process a real compound word, and produce wrong - // output from wrong text. The word-emitting routine - // will strip the trailing '-'. - goto SPACE; - } else if (softhyphenpending) { + if (softhyphenpending) { // Don't reset soft-hyphen continue; } else { diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 0821ee04..c09e867f 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -73,6 +73,9 @@ public: * just don't know about pages. */ virtual void newpage(int /*pos*/) {} + /** Called when we encounter newline \n 0x0a. Override to use the event. */ + virtual void newline(int /*pos*/) {} + // Static utility functions: /** Count words in string, as the splitter would generate them */ diff --git a/src/doc/man/recoll.conf.5 b/src/doc/man/recoll.conf.5 index 58e3adb4..ff3390a7 100644 --- a/src/doc/man/recoll.conf.5 +++ b/src/doc/man/recoll.conf.5 @@ -613,8 +613,7 @@ location before copy, to allow path translation computations. For example if a dataset originally indexed as '/home/me/mydata/config' has been mounted to '/media/me/mydata', and the GUI is running from a copied configuration, orgidxconfdir would be '/home/me/mydata/config', and -curidxconfdir (as set in the copied configuration) would be -'/media/me/mydata/config'. 
+curidxconfdir (as set in the copied configuration) would be '/media/me/mydata/config'. .TP .BI "idxrundir = "dfn Indexing process current directory. The input diff --git a/src/doc/user/recoll.conf.xml b/src/doc/user/recoll.conf.xml index de6cca4c..7fbaec38 100644 --- a/src/doc/user/recoll.conf.xml +++ b/src/doc/user/recoll.conf.xml @@ -170,13 +170,12 @@ listing either MIME types (e.g. audio/mpeg) or handler names files. We need to decompress these in a temporary directory for identification, which can be wasteful in some cases. Limit the waste. Negative means no limit. 0 results in no -processing of any compressed file. Default 50 MB. +processing of any compressed file. Default 100 MB. textfilemaxmbs -Size limit for text -files. Mostly for skipping monster -logs. Default 20 MB. +Size limit for text files. Mostly for skipping monster logs. Default 20 MB. Use a value of -1 to +disable. indexallfilenames diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index 7cee7dcb..e679932c 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -10,7 +10,7 @@ + "Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.3 or any later version published by the Free Software Foundation; with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. A copy of the license can be found at the following location: GNU web site. This document introduces full text search notions and describes the installation and use of the Recoll application. This version describes Recoll 1.31."> @@ -53,7 +53,7 @@ alink="#0000FF"> and describes the installation and use of the Recoll application. This version describes Recoll 1.29.

+ "application">Recoll 1.31.

@@ -443,7 +443,7 @@ alink="#0000FF">

This document introduces full text search notions and describes the installation and use of the Recoll application. It is updated for - Recoll 1.29.

+ Recoll 1.31.

Recoll was for a long time dedicated to Unix-like systems. It was only lately (2015) ported to MS-Windows. @@ -9169,14 +9169,15 @@ hasextract = False identification, which can be wasteful in some cases. Limit the waste. Negative means no limit. 0 results in no processing of any compressed - file. Default 50 MB.

+ file. Default 100 MB.

textfilemaxmbs

Size limit for text files. Mostly for skipping - monster logs. Default 20 MB.

+ monster logs. Default 20 MB. Use a value of -1 to + disable.

%p. Page index. Only significant for a subset of document types, currently only PDF, - Postscript and DVI files. Can be used to start the - editor at the right page for a match or - snippet.

+ Postscript and DVI files. If it is set, a + significant term will be chosen in the query, and + %p will be substituted with the first page where + the term appears. Can be used to start the editor + at the right page for a match or snippet.

+ +
  • +

    %l. Line number. Only significant + for document types with relevant line breaks, + mostly text/plain and analogs. If it is set, a + significant term will be chosen in the query, and + %l will be substituted with the first line where + the term appears.

  • %s. Search term. The value will only - be set for documents with indexed page numbers (ie: - PDF). The value will be one of the matched search - terms. It would allow pre-setting the value in the - "Find" entry inside Evince for example, for easy - highlighting of the term.

    + be set for documents with indexed page or line + numbers and if %p or %l is also used. The value + will be one of the matched search terms. It would + allow pre-setting the value in the "Find" entry + inside Evince for example, for easy highlighting of + the term.

  • %u. Url.

    diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index 506616c0..cc2e81f7 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -5,7 +5,7 @@ Recoll"> http://www.recoll.org/pages/features.html"> - + Xapian"> Windows"> Unix-like systems"> @@ -7114,28 +7114,37 @@ other = rclcat:other (possibly a script) to be able to handle it. - %M - MIME type + + %MMIME type - %p - Page index. Only significant for a subset of document - types, currently only PDF, Postscript and DVI files. Can be - used to start the editor at the right page for a match or - snippet. + + %pPage index. Only significant for a subset of + document types, currently only PDF, Postscript and DVI files. If it is set, a + significant term will be chosen in the query, and %p will be substituted with the + first page where the term appears. Can be used to start the editor at the right page + for a match or snippet. - %s - Search term. The value will only be set for documents - with indexed page numbers (ie: PDF). The value will be one of - the matched search terms. It would allow pre-setting the - value in the "Find" entry inside Evince for example, for easy - highlighting of the term. + + %lLine number. Only significant for document + types with relevant line breaks, mostly text/plain and analogs. If it is set, a + significant term will be chosen in the query, and %l will be substituted with the + first line where the term appears. - %u - Url. + + %sSearch term. The value will only be set for + documents with indexed page or line numbers and if %p or %l is also used. The value + will be one of the matched search terms. It would allow pre-setting the value in the + "Find" entry inside Evince for example, for easy highlighting of the + term. + + + %uUrl. 
+ + In addition to the predefined values above, all strings like diff --git a/src/filters/rclorgmode.py b/src/filters/rclorgmode.py index c77ea6c5..cf23773e 100755 --- a/src/filters/rclorgmode.py +++ b/src/filters/rclorgmode.py @@ -23,7 +23,7 @@ class OrgModeExtractor: iseof = rclexecm.RclExecM.noteof if self.currentindex >= len(self.docs) -1: iseof = rclexecm.RclExecM.eofnext - self.em.setmimetype("text/plain") + self.em.setmimetype("text/x-orgmode-sub") try: self.em.setfield("title", docdata.splitlines()[0]) except: diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index c0e5af03..aec2d58b 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -59,8 +59,9 @@ bool runWebFilesMoverScript(RclConfig *config) static string downloadsdir; if (downloadsdir.empty()) { if (!config->getConfParam("webdownloadsdir", downloadsdir)) { - downloadsdir = path_tildexpand("~/Downloads"); + downloadsdir = "~/Downloads"; } + downloadsdir = path_tildexpand(downloadsdir); } vector cmdvec; config->pythonCmd("recoll-we-move-files.py", cmdvec); diff --git a/src/python/recoll/recoll/conftree.py b/src/python/recoll/recoll/conftree.py index 94cca934..0ad53eec 100644 --- a/src/python/recoll/recoll/conftree.py +++ b/src/python/recoll/recoll/conftree.py @@ -304,3 +304,14 @@ def stringsToString(vs): out.append(s) return " ".join(out) +def valToBool(s): + if not s: + return False + try: + val = int(s) + return val != 0 + except: + pass + if type(s) == type(b''): + s = s.decode("UTF-8") + return s[0] in "tTyY" diff --git a/src/qtgui/fragbuts.cpp b/src/qtgui/fragbuts.cpp index f7990ee0..1c66ce15 100644 --- a/src/qtgui/fragbuts.cpp +++ b/src/qtgui/fragbuts.cpp @@ -50,7 +50,7 @@ public: void startElement(const std::string &nm, const std::map&) override { - std::cerr << "startElement [" << nm << "]\n"; + //std::cerr << "startElement [" << nm << "]\n"; currentText.clear(); if (nm == "buttons") { radio = false; @@ -67,7 +67,7 @@ public: } } void endElement(const std::string& nm) 
override { - std::cerr << "endElement [" << nm << "]\n"; + //std::cerr << "endElement [" << nm << "]\n"; if (nm == "label") { label = u8s2qs(currentText); @@ -102,7 +102,7 @@ public: } } void characterData(const std::string &str) override { - std::cerr << "characterData [" << str << "]\n"; + //std::cerr << "characterData [" << str << "]\n"; currentText += str; } diff --git a/src/qtgui/rclm_view.cpp b/src/qtgui/rclm_view.cpp index d9a75b90..6aa43e00 100644 --- a/src/qtgui/rclm_view.cpp +++ b/src/qtgui/rclm_view.cpp @@ -34,6 +34,7 @@ #include "rclmain_w.h" #include "rclzg.h" #include "pathut.h" +#include "unacpp.h" using namespace std; @@ -42,7 +43,6 @@ static const vector browser_list{ "opera", "google-chrome", "chromium-browser", "palemoon", "iceweasel", "firefox", "konqueror", "epiphany"}; - // Start native viewer or preview for input Doc. This is used to allow // using recoll from another app (e.g. Unity Scope) to view embedded // result docs (docs with an ipath). . We act as a proxy to extract @@ -155,13 +155,27 @@ void RclMain::openWith(Rcl::Doc doc, string cmdspec) execViewer(subs, false, execname, lcmd, cmdspec, doc); } -void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term) +static bool pagenumNeeded(const std::string& cmd) { + return cmd.find("%p") != std::string::npos; +} +static bool linenumNeeded(const std::string& cmd) +{ + return cmd.find("%l") != std::string::npos; +} +static bool termNeeded(const std::string& cmd) +{ + return cmd.find("%s") != std::string::npos; +} + +void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString qterm) +{ + std::string term = qs2utf8s(qterm); string apptag; doc.getmeta(Rcl::Doc::keyapptg, &apptag); LOGDEB("RclMain::startNativeViewer: mtype [" << doc.mimetype << "] apptag [" << apptag << "] page " << pagenum << " term [" << - qs2utf8s(term) << "] url [" << doc.url << "] ipath [" << + term << "] url [" << doc.url << "] ipath [" << doc.ipath << "]\n"); // Look for appropriate viewer @@ -377,19 
+391,19 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term) // If we are not called with a page number (which would happen for a call // from the snippets window), see if we can compute a page number anyway. - if (pagenum == -1) { - pagenum = 1; - string lterm; - if (m_source) - pagenum = m_source->getFirstMatchPage(doc, lterm); + if (m_source && pagenum == -1 && (pagenumNeeded(cmd) || termNeeded(cmd)|| linenumNeeded(cmd))) { + pagenum = m_source->getFirstMatchPage(doc, term); if (pagenum == -1) pagenum = 1; - else // We get the match term used to compute the page - term = QString::fromUtf8(lterm.c_str()); } - char cpagenum[20]; - sprintf(cpagenum, "%d", pagenum); + int line = 1; + if (m_source && !term.empty() && linenumNeeded(cmd)) { + if (doc.text.empty()) { + rcldb->getDocRawText(doc); + } + line = m_source->getFirstMatchLine(doc, term); + } // Substitute %xx inside arguments string efftime; @@ -408,9 +422,10 @@ void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum, QString term) subs["f"] = fn; subs["F"] = fn; subs["i"] = FileInterner::getLastIpathElt(doc.ipath); + subs["l"] = ulltodecstr(line); subs["M"] = doc.mimetype; - subs["p"] = cpagenum; - subs["s"] = (const char*)term.toLocal8Bit(); + subs["p"] = ulltodecstr(pagenum); + subs["s"] = term; subs["U"] = url_encode(url); subs["u"] = url; // Let %(xx) access all metadata. 
diff --git a/src/qtgui/rclmain_w.h b/src/qtgui/rclmain_w.h index 05899cb4..1c00932d 100644 --- a/src/qtgui/rclmain_w.h +++ b/src/qtgui/rclmain_w.h @@ -140,8 +140,7 @@ public slots: virtual void showActionsSearch(); virtual void startPreview(int docnum, Rcl::Doc doc, int keymods); virtual void startPreview(Rcl::Doc); - virtual void startNativeViewer(Rcl::Doc, int pagenum = -1, - QString term = QString()); + virtual void startNativeViewer(Rcl::Doc, int pagenum = -1, QString term = QString()); virtual void openWith(Rcl::Doc, string); virtual void saveDocToFile(Rcl::Doc); virtual void previewNextInTab(Preview *, int sid, int docnum); diff --git a/src/qtgui/snippets_w.cpp b/src/qtgui/snippets_w.cpp index 0e67a1bc..54c90e47 100644 --- a/src/qtgui/snippets_w.cpp +++ b/src/qtgui/snippets_w.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2012 J.F.Dockes +/* Copyright (C) 2012-2021 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -67,8 +67,7 @@ using namespace std; class PlainToRichQtSnippets : public PlainToRich { public: virtual string startMatch(unsigned int) { - return string(""); + return string(""); } virtual string endMatch() { return string(""); @@ -82,12 +81,10 @@ void SnippetsW::init() QPushButton *searchButton = new QPushButton(tr("Search")); searchButton->setAutoDefault(false); buttonBox->addButton(searchButton, QDialogButtonBox::ActionRole); -// setWindowFlags(Qt::WindowStaysOnTopHint); searchFM->hide(); onNewShortcuts(); - connect(&SCBase::scBase(), SIGNAL(shortcutsChanged()), - this, SLOT(onNewShortcuts())); + connect(&SCBase::scBase(), SIGNAL(shortcutsChanged()), this, SLOT(onNewShortcuts())); QPushButton *closeButton = buttonBox->button(QDialogButtonBox::Close); if (closeButton) @@ -105,11 +102,9 @@ void SnippetsW::init() browserw = new QWebView(this); verticalLayout->insertWidget(0, browserw); 
browser->setUrl(QUrl(QString::fromUtf8("about:blank"))); - connect(browser, SIGNAL(linkClicked(const QUrl &)), - this, SLOT(onLinkClicked(const QUrl &))); + connect(browser, SIGNAL(linkClicked(const QUrl &)), this, SLOT(onLinkClicked(const QUrl &))); browser->page()->setLinkDelegationPolicy(QWebPage::DelegateAllLinks); - browser->page()->currentFrame()->setScrollBarPolicy(Qt::Horizontal, - Qt::ScrollBarAlwaysOff); + browser->page()->currentFrame()->setScrollBarPolicy(Qt::Horizontal, Qt::ScrollBarAlwaysOff); QWEBSETTINGS *ws = browser->page()->settings(); if (prefs.reslistfontfamily != "") { ws->setFontFamily(QWEBSETTINGS::StandardFont, prefs.reslistfontfamily); @@ -136,8 +131,7 @@ void SnippetsW::init() #else browserw = new QTextBrowser(this); verticalLayout->insertWidget(0, browserw); - connect(browser, SIGNAL(anchorClicked(const QUrl &)), - this, SLOT(onLinkClicked(const QUrl &))); + connect(browser, SIGNAL(anchorClicked(const QUrl &)), this, SLOT(onLinkClicked(const QUrl &))); browser->setReadOnly(true); browser->setUndoRedoEnabled(false); browser->setOpenLinks(false); @@ -183,8 +177,7 @@ void SnippetsW::createPopupMenu(const QPoint& pos) { QMenu *popup = new QMenu(this); if (m_sortingByPage) { - popup->addAction(tr("Sort By Relevance"), this, - SLOT(reloadByRelevance())); + popup->addAction(tr("Sort By Relevance"), this, SLOT(reloadByRelevance())); } else { popup->addAction(tr("Sort By Page"), this, SLOT(reloadByPage())); } @@ -230,29 +223,22 @@ void SnippetsW::onSetDoc(Rcl::Doc doc, std::shared_ptr source) source->getTerms(hdata); ostringstream oss; - oss << - "" - ""; + oss << "" + ""; oss << "\n"; oss << qs2utf8s(prefs.darkreslistheadertext) << qs2utf8s(prefs.reslistheadertext); - oss << - "" - "" - "" - ; + oss << "
    "; g_hiliter.set_inputhtml(false); bool nomatch = true; for (const auto& snippet : vpabs) { if (snippet.page == -1) { - oss << "" << endl; + oss << "" << "\n"; continue; } list lr; @@ -263,13 +249,12 @@ void SnippetsW::onSetDoc(Rcl::Doc doc, std::shared_ptr source) nomatch = false; oss << "" << endl; + oss << "" << "\n"; } - oss << "
    " << - snippet.snippet << "
    " << snippet.snippet << "
    "; if (snippet.page > 0) { - oss << "" - << "P. " << snippet.page << ""; + oss << "" << + "P. " << snippet.page << ""; } - oss << "" << lr.front().c_str() << "
    " << lr.front().c_str() << "
    " << endl; + oss << "" << "\n"; if (nomatch) { oss.str("\n"); oss << qs2utf8s(tr("

    Sorry, no exact match was found within limits. " @@ -278,12 +263,12 @@ void SnippetsW::onSetDoc(Rcl::Doc doc, std::shared_ptr source) } oss << "\n"; #if defined(USING_WEBKIT) || defined(USING_WEBENGINE) - browser->setHtml(QString::fromUtf8(oss.str().c_str())); + browser->setHtml(u8s2qs(oss.str())); #else browser->clear(); browser->append("."); browser->clear(); - browser->insertHtml(QString::fromUtf8(oss.str().c_str())); + browser->insertHtml(u8s2qs(oss.str())); browser->moveCursor (QTextCursor::Start); browser->ensureCursorVisible(); #endif @@ -354,8 +339,7 @@ void SnippetsW::onLinkClicked(const QUrl &url) string term; if (termpos != string::npos) term = ascurl.substr(termpos+1); - emit startNativeViewer(m_doc, page, - QString::fromUtf8(term.c_str())); + emit startNativeViewer(m_doc, page, u8s2qs(term)); return; } } diff --git a/src/query/docseq.h b/src/query/docseq.h index 4dd6f50f..650b9d89 100644 --- a/src/query/docseq.h +++ b/src/query/docseq.h @@ -111,6 +111,9 @@ public: virtual int getFirstMatchPage(Rcl::Doc&, std::string&) { return -1; } + virtual int getFirstMatchLine(const Rcl::Doc&, const std::string&) { + return 1; + } /** Get duplicates. 
*/ virtual bool docDups(const Rcl::Doc&, std::vector&) { return false; diff --git a/src/query/docseqdb.cpp b/src/query/docseqdb.cpp index fab028bd..df06c6a3 100644 --- a/src/query/docseqdb.cpp +++ b/src/query/docseqdb.cpp @@ -126,6 +126,17 @@ int DocSequenceDb::getFirstMatchPage(Rcl::Doc &doc, string& term) return -1; } +int DocSequenceDb::getFirstMatchLine(const Rcl::Doc &doc, const string& term) +{ + std::unique_lock locker(o_dblock); + if (!setQuery()) + return 1; /* query setup failed: report line 1, same default as the fallback below and the base class */ + if (m_q->whatDb()) { + return m_q->getFirstMatchLine(doc, term); + } + return 1; +} + list DocSequenceDb::expand(Rcl::Doc &doc) { std::unique_lock locker(o_dblock); diff --git a/src/query/docseqdb.h b/src/query/docseqdb.h index 69535d79..b77051b4 100644 --- a/src/query/docseqdb.h +++ b/src/query/docseqdb.h @@ -43,6 +43,7 @@ public: virtual bool getAbstract(Rcl::Doc &doc, std::vector&) override; virtual int getFirstMatchPage(Rcl::Doc&, std::string& term) override; + virtual int getFirstMatchLine(const Rcl::Doc&, const std::string& term) override; virtual bool docDups(const Rcl::Doc& doc, std::vector& dups) override; virtual std::string getDescription() override; diff --git a/src/query/plaintorich.h b/src/query/plaintorich.h index 9118ea5a..b86f649b 100644 --- a/src/query/plaintorich.h +++ b/src/query/plaintorich.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2004 J.F.Dockes +/* Copyright (C) 2004-2021 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -60,8 +60,7 @@ public: * @param in raw text out of internfile. * @param out rich text output, divided in chunks (to help our caller * avoid inserting half tags into textedit which doesnt like it) - * @param in hdata terms and groups to be highlighted. These are - * lowercase and unaccented. + * @param in hdata terms and groups to be highlighted.
See utils/hldata.h * @param chunksize max size of chunks in output list */ virtual bool plaintorich(const std::string &in, std::list &out, diff --git a/src/rcldb/rclabsfromtext.cpp b/src/rcldb/rclabsfromtext.cpp index 32783b1f..4195b1c4 100644 --- a/src/rcldb/rclabsfromtext.cpp +++ b/src/rcldb/rclabsfromtext.cpp @@ -141,11 +141,9 @@ public: // add/update fragment definition. virtual bool takeword(const std::string& term, int pos, int bts, int bte) { LOGDEB1("takeword: [" << term << "] bytepos: "< maxtermcount) { LOGINF("Rclabsfromtext: stopping because maxtermcount reached: "<< maxtermcount << endl); @@ -154,8 +152,7 @@ public: } // Also limit the number of fragments (just in case safety) if (m_fragments.size() > maxtermcount / 100) { - LOGINF("Rclabsfromtext: stopping because maxfragments reached: "<< - maxtermcount/100 << endl); + LOGINF("Rclabsfromtext: stopping: max fragments count: " << maxtermcount/100 << "\n"); retflags |= ABSRES_TRUNC; return false; } @@ -193,8 +190,7 @@ public: m_curterm = term; m_curtermcoef = coef; } else { - LOGDEB2("Extending current fragment: " << m_remainingWords << - " -> " << m_ctxwords << endl); + LOGDEB2("Extending current fragment: "< "< 5) { - // Limit expansion of contiguous fragments (this is to - // avoid common terms in search causing long - // heavyweight meaningless fragments. Also, limit length). + // Limit expansion of contiguous fragments (this is to avoid common terms in search + // causing long heavyweight meaningless fragments. Also, limit length). m_remainingWords = 1; m_extcount = 0; } @@ -247,18 +242,14 @@ public: LOGDEB1("FRAGMENT: from byte " << m_curfrag.first << " to byte " << m_curfrag.second << endl); LOGDEB1("FRAGMENT TEXT [" << m_rawtext.substr( - m_curfrag.first, m_curfrag.second-m_curfrag.first) - << "]\n"); - // We used to not push weak fragments if we had a lot - // already. This can cause problems if the fragments - // we drop are actually group fragments (which have - // not got their boost yet). 
The right cut value is - // difficult to determine, because the absolute values - // of the coefs depend on many things (index size, - // etc.) The old test was if (m_totalcoef < 5.0 || - // m_curfragcoef >= 1.0) We now just avoid creating a - // monster by testing the current fragments count at - // the top of the function + m_curfrag.first, m_curfrag.second-m_curfrag.first) << "]\n"); + // We used to not push weak fragments if we had a lot already. This can cause + // problems if the fragments we drop are actually group fragments (which have not + // got their boost yet). The right cut value is difficult to determine, because the + // absolute values of the coefs depend on many things (index size, etc.) The old + // test was if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) We now just avoid + // creating a monster by testing the current fragments count at the top of the + // function m_fragments.push_back(MatchFragment(m_curfrag.first, m_curfrag.second, m_curfragcoef, @@ -298,8 +289,7 @@ public: m_curtermcoef = 0.0; } - LOGDEB("TextSplitABS: stored total " << m_fragments.size() << - " fragments" << endl); + LOGDEB("TextSplitABS: stored total " << m_fragments.size() << " fragments" << endl); vector tboffs; // Look for matches to PHRASE and NEAR term groups and finalize @@ -340,9 +330,8 @@ public: } auto fragit = m_fragments.begin(); for (const auto& grpmatch : tboffs) { - LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << - "-" << grpmatch.offs.second << " curfrag " << - fragit->start << "-" << fragit->stop << endl); + LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first << "-" << + grpmatch.offs.second<<" curfrag "<start<<"-"<stop<<"\n"); while (fragit->stop < grpmatch.offs.first) { fragit++; if (fragit == m_fragments.end()) { @@ -417,21 +406,19 @@ int Query::Native::abstractFromText( bool sortbypage ) { - (void)chron; + PRETEND_USE(chron); LOGABS("abstractFromText: entry: " << chron.millis() << "mS\n"); string rawtext; if 
(!ndb->getRawText(docid, rawtext)) { LOGDEB0("abstractFromText: can't fetch text\n"); return ABSRES_ERROR; } - LOGABS("abstractFromText: got raw text: size " << rawtext.size() << " " << - chron.millis() << "mS\n"); + LOGABS("abstractFromText: got raw text: size "<m_snipMaxPosWalk); splitter.text_to_words(rawtext); LOGABS("abstractFromText: text_to_words: " << chron.millis() << "mS\n"); @@ -484,8 +470,7 @@ int Query::Native::abstractFromText( // main term and the page positions. unsigned int count = 0; for (const auto& entry : result) { - string frag( - fixfrag(rawtext.substr(entry.start, entry.stop - entry.start))); + string frag(fixfrag(rawtext.substr(entry.start, entry.stop - entry.start))); #ifdef COMPUTE_HLZONES // This would need to be modified to take tag parameters @@ -506,8 +491,7 @@ int Query::Native::abstractFromText( if (page < 0) page = 0; } - LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef << - ": " << frag << endl); + LOGDEB0("=== FRAGMENT: p. " << page << " Coef: " << entry.coef << ": " << frag << endl); vabs.push_back(Snippet(page, frag).setTerm(entry.term)); if (count++ >= maxtotaloccs) break; @@ -515,4 +499,45 @@ int Query::Native::abstractFromText( return ABSRES_OK | splitter.getretflags(); } +class TermLineSplitter : public TextSplit { +public: + TermLineSplitter(const std::string& term) + : TextSplit(TextSplit::TXTS_NOSPANS), m_term(term) { + } + bool takeword(const std::string& _term, int, int, int) override { + std::string term; /* word in the form it is compared to m_term */ + if (o_index_stripchars) { + if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { + LOGINFO("TermLineSplitter::takeword: unac failed for [" << _term << "]\n"); + return true; + } + } else { term = _term; } /* raw index: no unac/fold, compare the word unchanged */ + if (term == m_term) { + return false; + } + return true; + } + void newline(int) override { + m_line++; + } + int getline() { + return m_line; + } +private: + int m_line{1}; + std::string m_term; +}; + +int Query::getFirstMatchLine(const Doc &doc, const std::string& term) +{ + int line = 1; + TermLineSplitter
splitter(term); + bool ret = splitter.text_to_words(doc.text); + // The splitter takeword() breaks by returning false as soon as the term is found + if (ret == false) { + line = splitter.getline(); + } + return line; +} + } diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp index 311ef760..04811a11 100644 --- a/src/rcldb/rclabstract.cpp +++ b/src/rcldb/rclabstract.cpp @@ -254,7 +254,7 @@ double Query::Native::qualityTerms(Xapian::docid docid, } -// Return page number for first match of "significant" term. +// Choose most interesting term and return the page number for its first match int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term) { LOGDEB("Query::Native::getFirstMatchPage\n"); @@ -286,9 +286,7 @@ int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term) qualityTerms(docid, terms, byQ); for (auto mit = byQ.rbegin(); mit != byQ.rend(); mit++) { - for (vector::const_iterator qit = mit->second.begin(); - qit != mit->second.end(); qit++) { - string qterm = *qit; + for (const auto& qterm : mit->second) { Xapian::PositionIterator pos; string emptys; try { @@ -619,9 +617,8 @@ int Query::Native::abstractFromIndex( // possibly retried by our caller. // // @param[out] vabs the abstract is returned as a vector of snippets. 
-int Query::Native::makeAbstract(Xapian::docid docid, - vector& vabs, - int imaxoccs, int ictxwords, bool sortbypage) +int Query::Native::makeAbstract( + Xapian::docid docid, vector& vabs, int imaxoccs, int ictxwords, bool sortbypage) { chron.restart(); LOGDEB("makeAbstract: docid " << docid << " imaxoccs " << diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 557affcb..ceaec4e3 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -96,10 +96,13 @@ const string pathelt_prefix = "XP"; static const string udi_prefix("Q"); static const string parent_prefix("F"); -// Special terms to mark begin/end of field (for anchored searches), and -// page breaks +// Special terms to mark begin/end of field (for anchored searches). string start_of_field_term; string end_of_field_term; + +// Special term for page breaks. Note that we use a complicated mechanism for multiple page +// breaks at the same position, when it would have been probably simpler to use XXPG/n terms +// instead (did not try to implement though). A change would force users to reindex. const string page_break_term = "XXPG/"; // Special term to mark documents with children. 
@@ -1846,16 +1849,14 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) } } - // If empty pages (multiple break at same pos) were recorded, save - // them (this is because we have no way to record them in the - // Xapian list + // If empty pages (multiple break at same pos) were recorded, save them (this is + // because we have no way to record them in the Xapian list) if (!tpidx.m_pageincrvec.empty()) { ostringstream multibreaks; for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) { if (i != 0) multibreaks << ","; - multibreaks << tpidx.m_pageincrvec[i].first << "," << - tpidx.m_pageincrvec[i].second; + multibreaks << tpidx.m_pageincrvec[i].first << "," << tpidx.m_pageincrvec[i].second; } RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str()); } diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index 19b88f79..19b50c87 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -360,7 +360,6 @@ int Query::getFirstMatchPage(const Doc &doc, string& term) return m_reason.empty() ? 
pagenum : -1; } - // Mset size // Note: times for retrieving (multiple times)all docs from a sample // 25k docs db (q: mime:*) @@ -511,8 +510,7 @@ vector Query::expand(const Doc &doc) Xapian::ESet eset = m_nq->xenquire->get_eset(20, rset, false); LOGDEB("ESet terms:\n"); // We filter out the special terms - for (Xapian::ESetIterator it = eset.begin(); - it != eset.end(); it++) { + for (Xapian::ESetIterator it = eset.begin(); it != eset.end(); it++) { LOGDEB(" [" << (*it) << "]\n"); if ((*it).empty() || has_prefix(*it)) continue; diff --git a/src/rcldb/rclquery.h b/src/rcldb/rclquery.h index cade3650..fd8874d3 100644 --- a/src/rcldb/rclquery.h +++ b/src/rcldb/rclquery.h @@ -115,10 +115,18 @@ public: // Returned as a vector of pair page is 0 if unknown int makeDocAbstract(const Doc &doc, std::vector& abst, int maxoccs= -1, int ctxwords= -1,bool sortbypage=false); - /** Retrieve page number for first match for "significant" query term - * @param term returns the chosen term */ + + /** Choose most interesting term and return the page number for its first match + * @param term returns the chosen term + * @return page number or -1 if term not found or other issue + */ int getFirstMatchPage(const Doc &doc, std::string& term); + /** Compute line number for first match of term. Only works if doc.text has text. + * This uses a text split. Both this and the above getFirstMatchPage() could be done and saved + * while we compute the abstracts, quite a lot of waste here. 
*/ + int getFirstMatchLine(const Doc &doc, const std::string& term); + /** Retrieve a reference to the searchData we are using */ std::shared_ptr getSD() { return m_sd; diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index b213ade6..f41e5d06 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -225,6 +225,7 @@ text/x-lua = internal text/x-mail = internal text/x-man = exec rclman;maxseconds=30 text/x-orgmode = execm rclorgmode.py +text/x-orgmode-sub = internal text/plain text/x-perl = internal text/plain text/x-purple-html-log = internal text/html text/x-purple-log = exec rclpurple @@ -359,6 +360,7 @@ text/x-java = source text/x-lua = source text/x-mail = message text/x-man = document +text/x-orgmode = document text/x-perl = source text/x-php = source text/x-purple-html-log = pidgin @@ -443,6 +445,7 @@ text = \ text/x-ini \ text/x-java \ text/x-man \ + text/x-orgmode \ text/x-perl \ text/x-php \ text/x-python \ diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index 7a93c597..63f99a4c 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -214,12 +214,13 @@ nomd5types = rclaudio # files.We need to decompress these in a # temporary directory for identification, which can be wasteful in some # cases. Limit the waste. Negative means no limit. 0 results in no -# processing of any compressed file. Default 50 MB. +# processing of any compressed file. Default 100 MB. compressedfilemaxkbs = 100000 -# Size limit for text -# files.Mostly for skipping monster -# logs. Default 20 MB. +# +# Size limit for text files. +# Mostly for skipping monster logs. Default 20 MB. Use a value of -1 to +# disable. 
textfilemaxmbs = 20 # Index the file names of diff --git a/src/utils/rclutil.cpp b/src/utils/rclutil.cpp index 83962e56..d33f79bb 100644 --- a/src/utils/rclutil.cpp +++ b/src/utils/rclutil.cpp @@ -619,11 +619,17 @@ static const string& thumbnailsdir() return thumbnailsd; } +// Place for 1024x1024 files +static const string thmbdirxxlarge = "xx-large"; +// Place for 512x512 files +static const string thmbdirxlarge = "x-large"; // Place for 256x256 files static const string thmbdirlarge = "large"; // 128x128 static const string thmbdirnormal = "normal"; +static const vector thmbdirs{thmbdirxxlarge, thmbdirxlarge, thmbdirlarge, thmbdirnormal}; + static void thumbname(const string& url, string& name) { string digest; @@ -635,26 +641,47 @@ static void thumbname(const string& url, string& name) bool thumbPathForUrl(const string& url, int size, string& path) { - string name; + string name, path128, path256, path512, path1024; thumbname(url, name); if (size <= 128) { path = path_cat(thumbnailsdir(), thmbdirnormal); path = path_cat(path, name); + path128 = path; + } else if (size <= 256) { + path = path_cat(thumbnailsdir(), thmbdirlarge); + path = path_cat(path, name); + path256 = path; + } else if (size <= 512) { + path = path_cat(thumbnailsdir(), thmbdirxlarge); + path = path_cat(path, name); + path512 = path; + } else { + path = path_cat(thumbnailsdir(), thmbdirxxlarge); + path = path_cat(path, name); + path1024 = path; + } + if (access(path.c_str(), R_OK) == 0) { + return true; + } + + // Not found in requested size. Try to find any size and return it. Let the client scale. + for (const auto& tdir : thmbdirs) { + path = path_cat(thumbnailsdir(), tdir); + path = path_cat(path, name); if (access(path.c_str(), R_OK) == 0) { return true; } } - path = path_cat(thumbnailsdir(), thmbdirlarge); - path = path_cat(path, name); - if (access(path.c_str(), R_OK) == 0) { - return true; - } - // File does not exist. 
Path corresponds to the large version at this point, - // fix it if needed. + // File does not exist. Return appropriate path anyway. if (size <= 128) { - path = path_cat(path_home(), thmbdirnormal); - path = path_cat(path, name); + path = path128; + } else if (size <= 256) { + path = path256; + } else if (size <= 512) { + path = path512; + } else { + path = path1024; } return false; } diff --git a/src/utils/smallut.cpp b/src/utils/smallut.cpp index ab7f021f..4299962d 100644 --- a/src/utils/smallut.cpp +++ b/src/utils/smallut.cpp @@ -318,7 +318,7 @@ template void stringsToCSV(const T& tokens, string& s, char sep) s.append(1, sep); } // Remove last separator. - if (s.size()) + if (!s.empty()) s.pop_back(); } @@ -951,7 +951,7 @@ bool parsedateinterval(const string& s, DateInterval *dip) return false; } - vector::const_iterator it = vs.begin(); + auto it = vs.cbegin(); if (*it == "P" || *it == "p") { it++; if (!parseperiod(it, vs.end(), &p1)) { @@ -1221,7 +1221,7 @@ std::string SimpleRegexp::simpleSub( const std::string& in, const std::string& repl) { if (!ok()) { - return std::string(); + return {}; } int err; @@ -1256,7 +1256,7 @@ bool SimpleRegexp::simpleMatch(const string& val) const string SimpleRegexp::getMatch(const string& val, int i) const { if (i > m->nmatch) { - return string(); + return {}; } return val.substr(m->matches[i].rm_so, m->matches[i].rm_eo - m->matches[i].rm_so); diff --git a/src/windows/mimeconf b/src/windows/mimeconf index c48a69a1..6bc6545e 100644 --- a/src/windows/mimeconf +++ b/src/windows/mimeconf @@ -187,6 +187,7 @@ text/x-csv = internal text/plain text/x-fictionbook = internal xsltproc fb2.xsl text/x-ini = internal text/plain text/x-mail = internal +text/x-orgmode = execm python rclorgmode.py text/x-perl = internal text/plain text/x-python = execm python rclpython.py text/x-shellscript = internal text/plain @@ -291,6 +292,7 @@ text/x-html-sidux-man = sidux-book text/x-ini = txt text/x-mail = message text/x-man = document +text/x-orgmode 
= document text/x-perl = source text/x-purple-html-log = pidgin text/x-purple-log = pidgin @@ -359,6 +361,7 @@ text = \ text/x-html-sidux-man \ text/x-ini \ text/x-man \ + text/x-orgmode \ text/x-perl \ text/x-python \ text/x-shellscript