web page indexing doc and indents

This commit is contained in:
Jean-Francois Dockes 2021-03-11 10:00:11 +01:00
parent f57530e2a6
commit 5c7d0ff96d
5 changed files with 108 additions and 121 deletions

View File

@ -37,15 +37,15 @@ WebStore::WebStore(RclConfig *cnf)
int maxmbs = 40; int maxmbs = 40;
cnf->getConfParam("webcachemaxmbs", &maxmbs); cnf->getConfParam("webcachemaxmbs", &maxmbs);
if ((m_cache = new CirCache(ccdir)) == 0) { if ((m_cache = new CirCache(ccdir)) == 0) {
LOGERR("WebStore: cant create CirCache object\n" ); LOGERR("WebStore: cant create CirCache object\n" );
return; return;
} }
if (!m_cache->create(int64_t(maxmbs)*1000*1024, CirCache::CC_CRUNIQUE)) { if (!m_cache->create(int64_t(maxmbs)*1000*1024, CirCache::CC_CRUNIQUE)) {
LOGERR("WebStore: cache file creation failed: " << LOGERR("WebStore: cache file creation failed: " <<
m_cache->getReason() << "\n"); m_cache->getReason() << "\n");
delete m_cache; delete m_cache;
m_cache = 0; m_cache = 0;
return; return;
} }
} }
@ -57,17 +57,17 @@ WebStore::~WebStore()
// Read document from cache. Return the metadata as an Rcl::Doc // Read document from cache. Return the metadata as an Rcl::Doc
// @param htt Web Hit Type // @param htt Web Hit Type
bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc, bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc,
string& data, string *htt) string& data, string *htt)
{ {
string dict; string dict;
if (m_cache == 0) { if (m_cache == 0) {
LOGERR("WebStore::getFromCache: cache is null\n"); LOGERR("WebStore::getFromCache: cache is null\n");
return false; return false;
} }
if (!m_cache->get(udi, dict, &data)) { if (!m_cache->get(udi, dict, &data)) {
LOGDEB("WebStore::getFromCache: get failed\n"); LOGDEB("WebStore::getFromCache: get failed\n");
return false; return false;
} }
ConfSimple cf(dict, 1); ConfSimple cf(dict, 1);
@ -89,4 +89,3 @@ bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc,
dotdoc.meta[Rcl::Doc::keyudi] = udi; dotdoc.meta[Rcl::Doc::keyudi] = udi;
return true; return true;
} }

View File

@ -21,8 +21,8 @@
class RclConfig; class RclConfig;
namespace Rcl { namespace Rcl {
class Db; class Db;
class Doc; class Doc;
} }
class CirCache; class CirCache;

View File

@ -140,9 +140,9 @@ alink="#0000FF">
"#RCL.INDEXING.REMOVABLE">Removable "#RCL.INDEXING.REMOVABLE">Removable
volumes</a></span></dt> volumes</a></span></dt>
<dt><span class="sect1">2.5. <a href= <dt><span class="sect1">2.5. <a href=
"#RCL.INDEXING.WEBQUEUE"><span class= "#RCL.INDEXING.WebQUEUE"><span class=
"application">Unix</span>-like systems: indexing "application">Unix</span>-like systems: indexing
visited WEB pages</a></span></dt> visited Web pages</a></span></dt>
<dt><span class="sect1">2.6. <a href= <dt><span class="sect1">2.6. <a href=
"#RCL.INDEXING.EXTATTR"><span class= "#RCL.INDEXING.EXTATTR"><span class=
"application">Unix</span>-like systems: using extended "application">Unix</span>-like systems: using extended
@ -423,7 +423,7 @@ alink="#0000FF">
<div class="list-of-tables"> <div class="list-of-tables">
<p><b>List of Tables</b></p> <p><b>List of Tables</b></p>
<dl> <dl>
<dt>3.1. <a href="#idm1437">Keyboard shortcuts</a></dt> <dt>3.1. <a href="#idm1438">Keyboard shortcuts</a></dt>
</dl> </dl>
</div> </div>
<div class="chapter"> <div class="chapter">
@ -720,7 +720,7 @@ alink="#0000FF">
<li class="listitem"> <li class="listitem">
<p>A <a class="ulink" href= <p>A <a class="ulink" href=
"https://framagit.org/medoc92/recollwebui" target= "https://framagit.org/medoc92/recollwebui" target=
"_top">WEB interface</a>.</p> "_top">Web interface</a>.</p>
</li> </li>
</ul> </ul>
</div> </div>
@ -1949,10 +1949,10 @@ recollindex -c "$confdir"
<div> <div>
<div> <div>
<h2 class="title" style="clear: both"><a name= <h2 class="title" style="clear: both"><a name=
"RCL.INDEXING.WEBQUEUE" id= "RCL.INDEXING.WebQUEUE" id=
"RCL.INDEXING.WEBQUEUE"></a>2.5.&nbsp;<span class= "RCL.INDEXING.WebQUEUE"></a>2.5.&nbsp;<span class=
"application">Unix</span>-like systems: indexing "application">Unix</span>-like systems: indexing
visited WEB pages</h2> visited Web pages</h2>
</div> </div>
</div> </div>
</div> </div>
@ -1964,57 +1964,48 @@ recollindex -c "$confdir"
"application">Beagle</span> indexer, then adapted to "application">Beagle</span> indexer, then adapted to
<span class="application">Recoll</span> and the <span class="application">Recoll</span> and the
<span class="application">Firefox</span> <span class= <span class="application">Firefox</span> <span class=
"application">XUL</span> API. A new version of the addon "application">XUL</span> API. The current version of the
has been written to work with the <span class= extension is located in the <a class="ulink" href=
"application">WebExtensions</span> API, which is the only "https://addons.mozilla.org/en-US/firefox/addon/recoll-we/"
one supported after <span class= target="_top">Mozilla add-ons repository</a>. It uses the
"application">Firefox</span> version 57.</p> <span class="application">WebExtensions</span> API, and
<p>The extension works by copying visited WEB pages to an works with current <span class="application">Firefox</span>
versions.</p>
<p>The extension works by copying visited Web pages to an
indexing queue directory, which <span class= indexing queue directory, which <span class=
"application">Recoll</span> then processes, indexing the "application">Recoll</span> then processes, storing the
data, storing it into a local cache, then removing the file data into a local cache, then indexing it, then removing
from the queue.</p> the file from the queue.</p>
<p>Because the WebExtensions API introduces more <p>The visited Web pages indexing feature can be enabled on
constraints to what extensions can do, the new version
works with one more step: the files are first created in
the browser default downloads location (typically
<code class="filename">$HOME/Downloads</code> ), then moved
by a script in the old queue location. The script is
automatically executed by the <span class=
"application">Recoll</span> indexer versions 1.23.5 and
newer. It could conceivably be executed independently to
make the new browser extension compatible with an older
<span class="application">Recoll</span> version (the script
is named <span class=
"command"><strong>recoll-we-move-files.py</strong></span>).</p>
<div class="note" style=
"margin-left: 0.5in; margin-right: 0.5in;">
<h3 class="title">Note</h3>
<p>For the WebExtensions-based version to work, it is
necessary to set the <code class=
"literal">webdownloadsdir</code> value in the
configuration if it was changed from the default
<code class="filename">$HOME/Downloads</code> in the
browser preferences.</p>
</div>
<p>The visited WEB pages indexing feature can be enabled on
the <span class="application">Recoll</span> side from the the <span class="application">Recoll</span> side from the
GUI <span class="guilabel">Index configuration</span> GUI <span class="guilabel">Index configuration</span>
panel, or by editing the configuration file (set panel, or by editing the configuration file (set
<code class="varname">processwebqueue</code> to 1).</p> <code class="varname">processwebqueue</code> to 1).</p>
<p>A current pointer to the extension can be found, along <p>The <span class="application">Recoll</span> GUI has a
with up-to-date instructions, on the <a class="ulink" href= tool to list and edit the contents of the Web cache.
(<span class="guimenu">Tools</span><span class=
"guimenuitem">Webcache editor</span>)</p>
<p>You can find more details on Web indexing, its usage and
configuration in a <a class="ulink" href=
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory" "https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory"
target="_top">Recoll wiki</a>.</p> target="_top">Recoll 'Howto' entry</a>.</p>
<p>A copy of the indexed WEB pages is retained by Recoll in <div class="note" style=
a local cache (from which previews can be fetched). The "margin-left: 0.5in; margin-right: 0.5in;">
cache size can be adjusted from the <span class= <h3 class="title">The cache is not an archive</h3>
"guilabel">Index configuration</span> / <span class= <p>A copy of the indexed Web pages is retained by Recoll
"guilabel">Web history</span> panel. Once the maximum size in a local cache (from which data is fetched for
is reached, old pages are purged - both from the cache and previews, or when resetting the index). The cache has a
the index - to make room for new ones, so you need to maximum size, which can be adjusted from the <span class=
explicitly archive in some other place the pages that you "guilabel">Index configuration</span> / <span class=
want to keep indefinitely.</p> "guilabel">Web history</span> panel (<code class=
"literal">webcachemaxmbs</code> parameter in <code class=
"filename">recoll.conf</code>). Once the maximum size is
reached, old pages are erased to make room for new ones.
The pages which you want to keep indefinitely need to be
explicitly archived elsewhere. Using a very high value
for the cache size can avoid data erasure, but see the
above 'Howto' page for more details and gotchas.</p>
</div>
</div> </div>
<div class="sect1"> <div class="sect1">
<div class="titlepage"> <div class="titlepage">
@ -3473,14 +3464,14 @@ fs.inotify.max_user_watches=32768
be able to find under <code class= be able to find under <code class=
"filename">/usr/share/recoll/examples/fragbuts.xml</code>), "filename">/usr/share/recoll/examples/fragbuts.xml</code>),
contains an example which filters the results from the contains an example which filters the results from the
WEB history.</p> Web history.</p>
<p>Here follows an example:</p> <p>Here follows an example:</p>
<pre class="programlisting"> <pre class="programlisting">
&lt;?xml version="1.0" encoding="UTF-8"?&gt; &lt;?xml version="1.0" encoding="UTF-8"?&gt;
&lt;fragbuts version="1.0"&gt; &lt;fragbuts version="1.0"&gt;
&lt;radiobuttons&gt; &lt;radiobuttons&gt;
&lt;!-- Actually useful: toggle WEB queue results inclusion --&gt; &lt;!-- Actually useful: toggle Web queue results inclusion --&gt;
&lt;fragbut&gt; &lt;fragbut&gt;
&lt;label&gt;Include Web Results&lt;/label&gt; &lt;label&gt;Include Web Results&lt;/label&gt;
&lt;frag&gt;&lt;/frag&gt; &lt;frag&gt;&lt;/frag&gt;
@ -3996,7 +3987,7 @@ fs.inotify.max_user_watches=32768
given context (e.g. within a preview window, within the given context (e.g. within a preview window, within the
result table).</p> result table).</p>
<div class="table"> <div class="table">
<a name="idm1437" id="idm1437"></a> <a name="idm1438" id="idm1438"></a>
<p class="title"><b>Table&nbsp;3.1.&nbsp;Keyboard <p class="title"><b>Table&nbsp;3.1.&nbsp;Keyboard
shortcuts</b></p> shortcuts</b></p>
<div class="table-contents"> <div class="table-contents">
@ -7940,11 +7931,11 @@ hasextract = False
"application">FreeBSD</span> ports, etc.), or from some "application">FreeBSD</span> ports, etc.), or from some
type of "backports" repository providing versions newer type of "backports" repository providing versions newer
than the standard ones, or found on the <span class= than the standard ones, or found on the <span class=
"application">Recoll</span> WEB site in some cases. The "application">Recoll</span> Web site in some cases. The
most up-to-date information about Recoll packages can most up-to-date information about Recoll packages can
usually be found on the <a class="ulink" href= usually be found on the <a class="ulink" href=
"http://www.recoll.org/pages/download.html" target= "http://www.recoll.org/pages/download.html" target=
"_top"><span class="application">Recoll</span> WEB site "_top"><span class="application">Recoll</span> Web site
downloads page</a></p> downloads page</a></p>
<p>The <span class="application">Windows</span> version of <p>The <span class="application">Windows</span> version of
Recoll comes in a self-contained setup file, there is Recoll comes in a self-contained setup file, there is

View File

@ -282,7 +282,7 @@
<ulink url="https://www.lesbonscomptes.com/recoll/pages/download.html">Search Provider</ulink>. <ulink url="https://www.lesbonscomptes.com/recoll/pages/download.html">Search Provider</ulink>.
</para></listitem> </para></listitem>
<listitem><para>A <listitem><para>A
<ulink url="https://framagit.org/medoc92/recollwebui">WEB interface</ulink>. <ulink url="https://framagit.org/medoc92/recollwebui">Web interface</ulink>.
</para></listitem> </para></listitem>
</itemizedlist> </itemizedlist>
</para> </para>
@ -1257,56 +1257,51 @@ recollindex -c "$confdir"
</simplesect> </simplesect>
</sect1> </sect1>
<sect1 id="RCL.INDEXING.WEBQUEUE"> <sect1 id="RCL.INDEXING.WebQUEUE">
<title>&LIN;: indexing visited WEB pages</title> <title>&LIN;: indexing visited Web pages</title>
<para>With the help of a <application>Firefox</application> <para>With the help of a <application>Firefox</application>
extension, &RCL; can index the Internet pages that you visit. The extension, &RCL; can index the Internet pages that you visit. The
extension has a long history: it was initially designed for the extension has a long history: it was initially designed for
<application>Beagle</application> indexer, then adapted to &RCL; and the <application>Beagle</application> indexer, then adapted to
the <application>Firefox</application> <application>XUL</application> &RCL; and
API. A new version of the addon has been written to work with the the <application>Firefox</application> <application>XUL</application>
<application>WebExtensions</application> API, which is the only one API. The current version of the extension is located in
supported after <application>Firefox</application> version 57.</para> the <ulink url="https://addons.mozilla.org/en-US/firefox/addon/recoll-we/">Mozilla
add-ons repository</ulink>. It uses
the <application>WebExtensions</application> API, and works with
current <application>Firefox</application> versions.</para>
<para>The extension works by copying visited WEB pages to an indexing <para>The extension works by copying visited Web pages to an indexing
queue directory, which &RCL; then processes, indexing the data, queue directory, which &RCL; then processes, storing the data into a
storing it into a local cache, then removing the file from the local cache, then indexing it, then removing the file from the
queue.</para> queue.</para>
<para>Because the WebExtensions API introduces more constraints to <para>The visited Web pages indexing feature can be enabled on the
what extensions can do, the new version works with one &RCL; side from the GUI <guilabel>Index configuration</guilabel>
more step: the files are first created in the browser default panel, or by editing the configuration file (set
downloads location (typically <filename>$HOME/Downloads</filename> ), <varname>processwebqueue</varname> to 1).</para>
then moved by a script in the old queue location. The script is
automatically executed by the &RCL; indexer versions 1.23.5 and
newer. It could conceivably be executed independently to make the new
browser extension compatible with an older &RCL; version (the script
is named <command>recoll-we-move-files.py</command>).</para>
<note><para>For the WebExtensions-based version to work, it is <para>The &RCL; GUI has a tool to list and edit the contents of the
necessary to set the <literal>webdownloadsdir</literal> value in the Web
configuration if it was changed from the default cache. (<menuchoice><guimenu>Tools</guimenu><guimenuitem>Webcache
<filename>$HOME/Downloads</filename> in the browser editor</guimenuitem></menuchoice>)</para>
preferences.</para></note>
<para>The visited WEB pages indexing feature can be enabled on the <para>You can find more details on Web indexing, its usage and configuration
&RCL; side from the GUI <guilabel>Index configuration</guilabel> in a <ulink url="&FAQS;IndexWebHistory">Recoll 'Howto' entry</ulink>.</para>
panel, or by editing the configuration file (set
<varname>processwebqueue</varname> to 1).</para>
<para>A current pointer to the extension can be found, along with <note><title>The cache is not an archive</title><para>A copy of
up-to-date instructions, on the the indexed Web pages is retained by Recoll in a local cache
<ulink url="&FAQS;IndexWebHistory">Recoll wiki</ulink>.</para> (from which data is fetched for previews, or when resetting the
index). The cache has a maximum size, which can be adjusted from
<para>A copy of the indexed WEB pages is retained by Recoll in a the <guilabel>Index configuration</guilabel> / <guilabel>Web
local cache (from which previews can be fetched). The cache size can history</guilabel> panel (<literal>webcachemaxmbs</literal>
be adjusted from the <guilabel>Index configuration</guilabel> / parameter in <filename>recoll.conf</filename>). Once the maximum
<guilabel>Web history</guilabel> panel. Once the maximum size size is reached, old pages are erased to make room for new ones.
is reached, old pages are purged - both from the cache and the index The pages which you want to keep indefinitely need to be
- to make room for new ones, so you need to explicitly archive in explicitly archived elsewhere. Using a very high value for
some other place the pages that you want to keep the cache size can avoid data erasure, but see the above 'Howto'
indefinitely.</para> page for more details and gotchas.</para></note>
</sect1> </sect1>
@ -2475,7 +2470,7 @@ fs.inotify.max_user_watches=32768
file inside the configuration directory. The sample file file inside the configuration directory. The sample file
distributed with &RCL; (which you should be able to find under distributed with &RCL; (which you should be able to find under
<filename>/usr/share/recoll/examples/fragbuts.xml</filename>), <filename>/usr/share/recoll/examples/fragbuts.xml</filename>),
contains an example which filters the results from the WEB contains an example which filters the results from the Web
history.</para> history.</para>
@ -2485,7 +2480,7 @@ fs.inotify.max_user_watches=32768
<fragbuts version="1.0"> <fragbuts version="1.0">
<radiobuttons> <radiobuttons>
<!-- Actually useful: toggle WEB queue results inclusion --> <!-- Actually useful: toggle Web queue results inclusion -->
<fragbut> <fragbut>
<label>Include Web Results</label> <label>Include Web Results</label>
<frag></frag> <frag></frag>
@ -6115,11 +6110,11 @@ hasextract = False
<application>Debian/Ubuntu apt</application>, <application>Debian/Ubuntu apt</application>,
<application>FreeBSD</application> ports, etc.), or from some type <application>FreeBSD</application> ports, etc.), or from some type
of "backports" repository providing versions newer than the standard of "backports" repository providing versions newer than the standard
ones, or found on the &RCL; WEB site in some ones, or found on the &RCL; Web site in some
cases. The most up-to-date information about Recoll packages can cases. The most up-to-date information about Recoll packages can
usually be found on the usually be found on the
<ulink url="http://www.recoll.org/pages/download.html"> <ulink url="http://www.recoll.org/pages/download.html">
<application>Recoll</application> WEB site downloads <application>Recoll</application> Web site downloads
page</ulink></para> page</ulink></para>
<para>The &WIN; version of Recoll comes in a self-contained setup <para>The &WIN; version of Recoll comes in a self-contained setup

View File

@ -331,7 +331,7 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
{ {
ConfParamW *bparam = m_w->addParam( ConfParamW *bparam = m_w->addParam(
idx, ConfTabsW::CFPT_BOOL, "processwebqueue", idx, ConfTabsW::CFPT_BOOL, "processwebqueue",
tr("Process the WEB history queue"), tr("Process the Web history queue"),
tr("Enables indexing Firefox visited pages.<br>" tr("Enables indexing Firefox visited pages.<br>"
"(you need also install the Firefox Recoll plugin)")); "(you need also install the Firefox Recoll plugin)"));
ConfParamW *cparam = m_w->addParam( ConfParamW *cparam = m_w->addParam(
@ -353,6 +353,8 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
"file (only waste space at the end)." "file (only waste space at the end)."
), -1, 1000*1000); // Max 1TB... ), -1, 1000*1000); // Max 1TB...
m_w->enableLink(bparam, cparam); m_w->enableLink(bparam, cparam);
m_w->addBlurb(idx, tr("Note: old pages will be erased to make space for "
"new ones when the maximum size is reached"));
m_w->endOfList(idx); m_w->endOfList(idx);
return true; return true;
} }