web page indexing doc and indents

This commit is contained in:
Jean-Francois Dockes 2021-03-11 10:00:11 +01:00
parent f57530e2a6
commit 5c7d0ff96d
5 changed files with 108 additions and 121 deletions

View File

@ -37,15 +37,15 @@ WebStore::WebStore(RclConfig *cnf)
int maxmbs = 40;
cnf->getConfParam("webcachemaxmbs", &maxmbs);
if ((m_cache = new CirCache(ccdir)) == 0) {
LOGERR("WebStore: cant create CirCache object\n" );
return;
LOGERR("WebStore: cant create CirCache object\n" );
return;
}
if (!m_cache->create(int64_t(maxmbs)*1000*1024, CirCache::CC_CRUNIQUE)) {
LOGERR("WebStore: cache file creation failed: " <<
LOGERR("WebStore: cache file creation failed: " <<
m_cache->getReason() << "\n");
delete m_cache;
m_cache = 0;
return;
delete m_cache;
m_cache = 0;
return;
}
}
@ -57,17 +57,17 @@ WebStore::~WebStore()
// Read document from cache. Return the metadata as an Rcl::Doc
// @param htt Web Hit Type
bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc,
string& data, string *htt)
string& data, string *htt)
{
string dict;
if (m_cache == 0) {
LOGERR("WebStore::getFromCache: cache is null\n");
return false;
LOGERR("WebStore::getFromCache: cache is null\n");
return false;
}
if (!m_cache->get(udi, dict, &data)) {
LOGDEB("WebStore::getFromCache: get failed\n");
return false;
LOGDEB("WebStore::getFromCache: get failed\n");
return false;
}
ConfSimple cf(dict, 1);
@ -89,4 +89,3 @@ bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc,
dotdoc.meta[Rcl::Doc::keyudi] = udi;
return true;
}

View File

@ -21,8 +21,8 @@
class RclConfig;
namespace Rcl {
class Db;
class Doc;
class Db;
class Doc;
}
class CirCache;

View File

@ -140,9 +140,9 @@ alink="#0000FF">
"#RCL.INDEXING.REMOVABLE">Removable
volumes</a></span></dt>
<dt><span class="sect1">2.5. <a href=
"#RCL.INDEXING.WEBQUEUE"><span class=
"#RCL.INDEXING.WebQUEUE"><span class=
"application">Unix</span>-like systems: indexing
visited WEB pages</a></span></dt>
visited Web pages</a></span></dt>
<dt><span class="sect1">2.6. <a href=
"#RCL.INDEXING.EXTATTR"><span class=
"application">Unix</span>-like systems: using extended
@ -423,7 +423,7 @@ alink="#0000FF">
<div class="list-of-tables">
<p><b>List of Tables</b></p>
<dl>
<dt>3.1. <a href="#idm1437">Keyboard shortcuts</a></dt>
<dt>3.1. <a href="#idm1438">Keyboard shortcuts</a></dt>
</dl>
</div>
<div class="chapter">
@ -720,7 +720,7 @@ alink="#0000FF">
<li class="listitem">
<p>A <a class="ulink" href=
"https://framagit.org/medoc92/recollwebui" target=
"_top">WEB interface</a>.</p>
"_top">Web interface</a>.</p>
</li>
</ul>
</div>
@ -1949,10 +1949,10 @@ recollindex -c "$confdir"
<div>
<div>
<h2 class="title" style="clear: both"><a name=
"RCL.INDEXING.WEBQUEUE" id=
"RCL.INDEXING.WEBQUEUE"></a>2.5.&nbsp;<span class=
"RCL.INDEXING.WebQUEUE" id=
"RCL.INDEXING.WebQUEUE"></a>2.5.&nbsp;<span class=
"application">Unix</span>-like systems: indexing
visited WEB pages</h2>
visited Web pages</h2>
</div>
</div>
</div>
@ -1964,57 +1964,48 @@ recollindex -c "$confdir"
"application">Beagle</span> indexer, then adapted to
<span class="application">Recoll</span> and the
<span class="application">Firefox</span> <span class=
"application">XUL</span> API. A new version of the addon
has been written to work with the <span class=
"application">WebExtensions</span> API, which is the only
one supported after <span class=
"application">Firefox</span> version 57.</p>
<p>The extension works by copying visited WEB pages to an
"application">XUL</span> API. The current version of the
extension, located in the <a class="ulink" href=
"https://addons.mozilla.org/en-US/firefox/addon/recoll-we/"
target="_top">Mozilla add-ons repository</a>, uses the
<span class="application">WebExtensions</span> API, and
works with current <span class="application">Firefox</span>
versions.</p>
<p>The extension works by copying visited Web pages to an
indexing queue directory, which <span class=
"application">Recoll</span> then processes, indexing the
data, storing it into a local cache, then removing the file
from the queue.</p>
<p>Because the WebExtensions API introduces more
constraints to what extensions can do, the new version
works with one more step: the files are first created in
the browser default downloads location (typically
<code class="filename">$HOME/Downloads</code> ), then moved
by a script in the old queue location. The script is
automatically executed by the <span class=
"application">Recoll</span> indexer versions 1.23.5 and
newer. It could conceivably be executed independently to
make the new browser extension compatible with an older
<span class="application">Recoll</span> version (the script
is named <span class=
"command"><strong>recoll-we-move-files.py</strong></span>).</p>
<div class="note" style=
"margin-left: 0.5in; margin-right: 0.5in;">
<h3 class="title">Note</h3>
<p>For the WebExtensions-based version to work, it is
necessary to set the <code class=
"literal">webdownloadsdir</code> value in the
configuration if it was changed from the default
<code class="filename">$HOME/Downloads</code> in the
browser preferences.</p>
</div>
<p>The visited WEB pages indexing feature can be enabled on
"application">Recoll</span> then processes, storing the
data into a local cache, then indexing it, then removing
the file from the queue.</p>
<p>The visited Web pages indexing feature can be enabled on
the <span class="application">Recoll</span> side from the
GUI <span class="guilabel">Index configuration</span>
panel, or by editing the configuration file (set
<code class="varname">processwebqueue</code> to 1).</p>
<p>A current pointer to the extension can be found, along
with up-to-date instructions, on the <a class="ulink" href=
<p>The <span class="application">Recoll</span> GUI has a
tool to list and edit the contents of the Web cache.
(<span class="guimenu">Tools</span><span class=
"guimenuitem">Webcache editor</span>)</p>
<p>You can find more details on Web indexing, its usage and
configuration in a <a class="ulink" href=
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory"
target="_top">Recoll wiki</a>.</p>
<p>A copy of the indexed WEB pages is retained by Recoll in
a local cache (from which previews can be fetched). The
cache size can be adjusted from the <span class=
"guilabel">Index configuration</span> / <span class=
"guilabel">Web history</span> panel. Once the maximum size
is reached, old pages are purged - both from the cache and
the index - to make room for new ones, so you need to
explicitly archive in some other place the pages that you
want to keep indefinitely.</p>
target="_top">Recoll 'Howto' entry</a>.</p>
<div class="note" style=
"margin-left: 0.5in; margin-right: 0.5in;">
<h3 class="title">The cache is not an archive</h3>
<p>A copy of the indexed Web pages is retained by Recoll
in a local cache (from which data is fetched for
previews, or when resetting the index). The cache has a
maximum size, which can be adjusted from the <span class=
"guilabel">Index configuration</span> / <span class=
"guilabel">Web history</span> panel (<code class=
"literal">webcachemaxmbs</code> parameter in <code class=
"filename">recoll.conf</code>). Once the maximum size is
reached, old pages are erased to make room for new ones.
The pages which you want to keep indefinitely need to be
explicitly archived elsewhere. Using a very high value
for the cache size can avoid data erasure, but see the
above 'Howto' page for more details and gotchas.</p>
</div>
</div>
<div class="sect1">
<div class="titlepage">
@ -3473,14 +3464,14 @@ fs.inotify.max_user_watches=32768
be able to find under <code class=
"filename">/usr/share/recoll/examples/fragbuts.xml</code>),
contains an example which filters the results from the
WEB history.</p>
Web history.</p>
<p>Here follows an example:</p>
<pre class="programlisting">
&lt;?xml version="1.0" encoding="UTF-8"?&gt;
&lt;fragbuts version="1.0"&gt;
&lt;radiobuttons&gt;
&lt;!-- Actually useful: toggle WEB queue results inclusion --&gt;
&lt;!-- Actually useful: toggle Web queue results inclusion --&gt;
&lt;fragbut&gt;
&lt;label&gt;Include Web Results&lt;/label&gt;
&lt;frag&gt;&lt;/frag&gt;
@ -3996,7 +3987,7 @@ fs.inotify.max_user_watches=32768
given context (e.g. within a preview window, within the
result table).</p>
<div class="table">
<a name="idm1437" id="idm1437"></a>
<a name="idm1438" id="idm1438"></a>
<p class="title"><b>Table&nbsp;3.1.&nbsp;Keyboard
shortcuts</b></p>
<div class="table-contents">
@ -7940,11 +7931,11 @@ hasextract = False
"application">FreeBSD</span> ports, etc.), or from some
type of "backports" repository providing versions newer
than the standard ones, or found on the <span class=
"application">Recoll</span> WEB site in some cases. The
"application">Recoll</span> Web site in some cases. The
most up-to-date information about Recoll packages can
usually be found on the <a class="ulink" href=
"http://www.recoll.org/pages/download.html" target=
"_top"><span class="application">Recoll</span> WEB site
"_top"><span class="application">Recoll</span> Web site
downloads page</a></p>
<p>The <span class="application">Windows</span> version of
Recoll comes in a self-contained setup file, there is

View File

@ -282,7 +282,7 @@
<ulink url="https://www.lesbonscomptes.com/recoll/pages/download.html">Search Provider</ulink>.
</para></listitem>
<listitem><para>A
<ulink url="https://framagit.org/medoc92/recollwebui">WEB interface</ulink>.
<ulink url="https://framagit.org/medoc92/recollwebui">Web interface</ulink>.
</para></listitem>
</itemizedlist>
</para>
@ -1257,56 +1257,51 @@ recollindex -c "$confdir"
</simplesect>
</sect1>
<sect1 id="RCL.INDEXING.WEBQUEUE">
<title>&LIN;: indexing visited WEB pages</title>
<sect1 id="RCL.INDEXING.WebQUEUE">
<title>&LIN;: indexing visited Web pages</title>
<para>With the help of a <application>Firefox</application>
extension, &RCL; can index the Internet pages that you visit. The
extension has a long history: it was initially designed for the
<application>Beagle</application> indexer, then adapted to &RCL; and
the <application>Firefox</application> <application>XUL</application>
API. A new version of the addon has been written to work with the
<application>WebExtensions</application> API, which is the only one
supported after <application>Firefox</application> version 57.</para>
extension, &RCL; can index the Internet pages that you visit. The
extension has a long history: it was initially designed for
the <application>Beagle</application> indexer, then adapted to
&RCL; and
the <application>Firefox</application> <application>XUL</application>
API. The current version of the extension, located in
the <ulink url="https://addons.mozilla.org/en-US/firefox/addon/recoll-we/">Mozilla
add-ons repository</ulink>, uses
the <application>WebExtensions</application> API, and works with
current <application>Firefox</application> versions.</para>
<para>The extension works by copying visited WEB pages to an indexing
queue directory, which &RCL; then processes, indexing the data,
storing it into a local cache, then removing the file from the
queue.</para>
<para>Because the WebExtensions API introduces more constraints to
what extensions can do, the new version works with one
more step: the files are first created in the browser default
downloads location (typically <filename>$HOME/Downloads</filename> ),
then moved by a script in the old queue location. The script is
automatically executed by the &RCL; indexer versions 1.23.5 and
newer. It could conceivably be executed independently to make the new
browser extension compatible with an older &RCL; version (the script
is named <command>recoll-we-move-files.py</command>).</para>
<note><para>For the WebExtensions-based version to work, it is
necessary to set the <literal>webdownloadsdir</literal> value in the
configuration if it was changed from the default
<filename>$HOME/Downloads</filename> in the browser
preferences.</para></note>
<para>The extension works by copying visited Web pages to an indexing
queue directory, which &RCL; then processes, storing the data into a
local cache, then indexing it, then removing the file from the
queue.</para>
<para>The visited WEB pages indexing feature can be enabled on the
&RCL; side from the GUI <guilabel>Index configuration</guilabel>
panel, or by editing the configuration file (set
<varname>processwebqueue</varname> to 1).</para>
<para>The visited Web pages indexing feature can be enabled on the
&RCL; side from the GUI <guilabel>Index configuration</guilabel>
panel, or by editing the configuration file (set
<varname>processwebqueue</varname> to 1).</para>
<para>A current pointer to the extension can be found, along with
up-to-date instructions, on the
<ulink url="&FAQS;IndexWebHistory">Recoll wiki</ulink>.</para>
<para>The &RCL; GUI has a tool to list and edit the contents of the
Web
cache. (<menuchoice><guimenu>Tools</guimenu><guimenuitem>Webcache
editor</guimenuitem></menuchoice>)</para>
<para>A copy of the indexed WEB pages is retained by Recoll in a
local cache (from which previews can be fetched). The cache size can
be adjusted from the <guilabel>Index configuration</guilabel> /
<guilabel>Web history</guilabel> panel. Once the maximum size
is reached, old pages are purged - both from the cache and the index
- to make room for new ones, so you need to explicitly archive in
some other place the pages that you want to keep
indefinitely.</para>
<para>You can find more details on Web indexing, its usage and configuration
in a <ulink url="&FAQS;IndexWebHistory">Recoll 'Howto' entry</ulink>.</para>
<note><title>The cache is not an archive</title><para>A copy of
the indexed Web pages is retained by Recoll in a local cache
(from which data is fetched for previews, or when resetting the
index). The cache has a maximum size, which can be adjusted from
the <guilabel>Index configuration</guilabel> / <guilabel>Web
history</guilabel> panel (<literal>webcachemaxmbs</literal>
parameter in <filename>recoll.conf</filename>). Once the maximum
size is reached, old pages are erased to make room for new ones.
The pages which you want to keep indefinitely need to be
explicitly archived elsewhere. Using a very high value for
the cache size can avoid data erasure, but see the above 'Howto'
page for more details and gotchas.</para></note>
</sect1>
@ -2475,7 +2470,7 @@ fs.inotify.max_user_watches=32768
file inside the configuration directory. The sample file
distributed with &RCL; (which you should be able to find under
<filename>/usr/share/recoll/examples/fragbuts.xml</filename>),
contains an example which filters the results from the WEB
contains an example which filters the results from the Web
history.</para>
@ -2485,7 +2480,7 @@ fs.inotify.max_user_watches=32768
<fragbuts version="1.0">
<radiobuttons>
<!-- Actually useful: toggle WEB queue results inclusion -->
<!-- Actually useful: toggle Web queue results inclusion -->
<fragbut>
<label>Include Web Results</label>
<frag></frag>
@ -6115,11 +6110,11 @@ hasextract = False
<application>Debian/Ubuntu apt</application>,
<application>FreeBSD</application> ports, etc.), or from some type
of "backports" repository providing versions newer than the standard
ones, or found on the &RCL; WEB site in some
ones, or found on the &RCL; Web site in some
cases. The most up-to-date information about Recoll packages can
usually be found on the
<ulink url="http://www.recoll.org/pages/download.html">
<application>Recoll</application> WEB site downloads
<application>Recoll</application> Web site downloads
page</ulink></para>
<para>The &WIN; version of Recoll comes in a self-contained setup

View File

@ -331,7 +331,7 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
{
ConfParamW *bparam = m_w->addParam(
idx, ConfTabsW::CFPT_BOOL, "processwebqueue",
tr("Process the WEB history queue"),
tr("Process the Web history queue"),
tr("Enables indexing Firefox visited pages.<br>"
"(you need also install the Firefox Recoll plugin)"));
ConfParamW *cparam = m_w->addParam(
@ -353,6 +353,8 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
"file (only waste space at the end)."
), -1, 1000*1000); // Max 1TB...
m_w->enableLink(bparam, cparam);
m_w->addBlurb(idx, tr("Note: old pages will be erased to make space for "
"new ones when the maximum size is reached"));
m_w->endOfList(idx);
return true;
}