From 5c7d0ff96db8c5b8911d6db11752d59ec841db5c Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes

- the Firefox XUL API. A new version of the addon has been written to
- work with the WebExtensions API, which is the only one supported
- after Firefox version 57.
+ the Firefox XUL API. The current version of the extension, located
+ in the Mozilla add-ons repository, uses the WebExtensions API and
+ works with current Firefox versions.
-The extension works by copying visited WEB pages to an
- indexing queue directory, which Recoll then processes, indexing the
- data, storing it into a local cache, then removing the file
- from the queue.
-Because the WebExtensions API introduces more
- constraints to what extensions can do, the new version
- works with one more step: the files are first created in
- the browser default downloads location (typically
- $HOME/Downloads), then moved
- by a script in the old queue location. The script is
- automatically executed by the Recoll indexer versions 1.23.5 and
- newer. It could conceivably be executed independently to
- make the new browser extension compatible with an older
- Recoll version (the script
- is named recoll-we-move-files.py).
-For the WebExtensions-based version to work, it is
- necessary to set the webdownloadsdir value in the
- configuration if it was changed from the default
- $HOME/Downloads in the
- browser preferences.
-The visited WEB pages indexing feature can be enabled on
+The extension works by copying visited Web pages to an
+ indexing queue directory, which Recoll then processes:
+ storing the data into a local cache, indexing it, and
+ removing the file from the queue.
+The visited Web pages indexing feature can be enabled on
 the Recoll side from the
 GUI Index configuration
 panel, or by editing the configuration file (set
 processwebqueue to 1).
-A current pointer to the extension can be found, along
- with up-to-date instructions, on the Recoll wiki.
+The Recoll GUI has a tool to list and edit the contents
+ of the Web cache (Tools → Webcache editor).
+You can find more details on Web indexing, its usage and
+ configuration in a Recoll 'Howto' entry.
-A copy of the indexed WEB pages is retained by Recoll in
- a local cache (from which previews can be fetched). The
- cache size can be adjusted from the Index configuration /
- Web history panel. Once the maximum size
- is reached, old pages are purged - both from the cache and
- the index - to make room for new ones, so you need to
- explicitly archive in some other place the pages that you
- want to keep indefinitely.
+A copy of the indexed Web pages is retained by Recoll
+ in a local cache (from which data is fetched for
+ previews, or when resetting the index). The cache has a
+ maximum size, which can be adjusted from the Index
+ configuration / Web history panel (webcachemaxmbs
+ parameter in recoll.conf). Once the maximum size is
+ reached, old pages are erased to make room for new ones.
+ The pages which you want to keep indefinitely need to be
+ explicitly archived elsewhere. Using a very high value
+ for the cache size can avoid data erasure, but see the
+ above 'Howto' page for more details and gotchas.
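
For reference, the processwebqueue setting mentioned in the hunk above is a
plain name = value line in the Recoll configuration file; a minimal sketch
follows (the ~/.recoll/recoll.conf path assumes the default configuration
directory):

    # in ~/.recoll/recoll.conf (default configuration directory assumed)
    # Process the Web indexing queue filled by the browser extension
    processwebqueue = 1

With this set, the next recollindex run picks up the queued pages.
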
/usr/share/recoll/examples/fragbuts.xml),
contains an example which filters the results from the
- WEB history.
+ Web history.
Here follows an example:
<?xml version="1.0" encoding="UTF-8"?>
<fragbuts version="1.0">
<radiobuttons>
- <!-- Actually useful: toggle WEB queue results inclusion -->
+ <!-- Actually useful: toggle Web queue results inclusion -->
<fragbut>
<label>Include Web Results</label>
<frag></frag>
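
The sample above is cut short by the diff context. The complementary radio
button excludes the Web queue results; a sketch of how the rest of the group
plausibly looks is shown below. The -rclbes:BGL query fragment is an
assumption (Web queue documents are tagged with the BGL backend value), so
check the fragbuts.xml shipped with your version for the exact clause:

    </fragbut>
    <fragbut>
      <label>Exclude Web Results</label>
      <!-- assumed filter: exclude documents coming from the Web queue
           (rclbes field value BGL) -->
      <frag>-rclbes:BGL</frag>
    </fragbut>
  </radiobuttons>
</fragbuts>
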
@@ -3996,7 +3987,7 @@ fs.inotify.max_user_watches=32768
given context (e.g. within a preview window, within the
result table).
-
+
Table 3.1. Keyboard
shortcuts
@@ -7940,11 +7931,11 @@ hasextract = False
"application">FreeBSD ports, etc.), or from some
type of "backports" repository providing versions newer
than the standard ones, or found on the Recoll WEB site in some cases. The
+ "application">Recoll Web site in some cases. The
most up-to-date information about Recoll packages can
usually be found on the Recoll WEB site
+ "_top">Recoll Web site
downloads page
The Windows version of
Recoll comes in a self-contained setup file, there is
diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml
index 385ead62..9526a320 100644
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@@ -282,7 +282,7 @@
Search Provider .
A
- WEB interface .
+ Web interface.
@@ -1257,56 +1257,51 @@ recollindex -c "$confdir"
-
- &LIN;: indexing visited WEB pages
+
+ &LIN;: indexing visited Web pages
With the help of a Firefox
- extension, &RCL; can index the Internet pages that you visit. The
- extension has a long history: it was initially designed for the
- Beagle indexer, then adapted to &RCL; and
- the Firefox XUL
- API. A new version of the addon has been written to work with the
- WebExtensions API, which is the only one
- supported after Firefox version 57.
+ extension, &RCL; can index the Internet pages that you visit. The
+ extension has a long history: it was initially designed for
+ the Beagle indexer, then adapted to
+ &RCL; and
+ the Firefox XUL
+ API. The current version of the extension, located in
+ the Mozilla
+ add-ons repository, uses
+ the WebExtensions API and works with
+ current Firefox versions.
- The extension works by copying visited WEB pages to an indexing
- queue directory, which &RCL; then processes, indexing the data,
- storing it into a local cache, then removing the file from the
- queue.
-
- Because the WebExtensions API introduces more constraints to
- what extensions can do, the new version works with one
- more step: the files are first created in the browser default
- downloads location (typically $HOME/Downloads ),
- then moved by a script in the old queue location. The script is
- automatically executed by the &RCL; indexer versions 1.23.5 and
- newer. It could conceivably be executed independently to make the new
- browser extension compatible with an older &RCL; version (the script
- is named recoll-we-move-files.py ).
-
- For the WebExtensions-based version to work, it is
- necessary to set the webdownloadsdir value in the
- configuration if it was changed from the default
- $HOME/Downloads in the browser
- preferences.
+ The extension works by copying visited Web pages to an indexing
+ queue directory, which &RCL; then processes: storing the data into a
+ local cache, indexing it, and removing the file from the
+ queue.
- The visited WEB pages indexing feature can be enabled on the
- &RCL; side from the GUI Index configuration
- panel, or by editing the configuration file (set
- processwebqueue to 1).
+ The visited Web pages indexing feature can be enabled on the
+ &RCL; side from the GUI Index configuration
+ panel, or by editing the configuration file (set
+ processwebqueue to 1).
- A current pointer to the extension can be found, along with
- up-to-date instructions, on the
- Recoll wiki .
+ The &RCL; GUI has a tool to list and edit the contents of the
+ Web
+ cache (Tools → Webcache
+ editor).
- A copy of the indexed WEB pages is retained by Recoll in a
- local cache (from which previews can be fetched). The cache size can
- be adjusted from the Index configuration /
- Web history panel. Once the maximum size
- is reached, old pages are purged - both from the cache and the index
- - to make room for new ones, so you need to explicitly archive in
- some other place the pages that you want to keep
- indefinitely.
+ You can find more details on Web indexing, its usage and configuration
+ in a Recoll 'Howto' entry.
+
+ The cache is not an archive: a copy of
+ the indexed Web pages is retained by Recoll in a local cache
+ (from which data is fetched for previews, or when resetting the
+ index). The cache has a maximum size, which can be adjusted from
+ the Index configuration / Web
+ history panel (webcachemaxmbs
+ parameter in recoll.conf). Once the maximum
+ size is reached, old pages are erased to make room for new ones.
+ The pages which you want to keep indefinitely need to be
+ explicitly archived elsewhere. Using a very high value for
+ the cache size can avoid data erasure, but see the above 'Howto'
+ page for more details and gotchas.
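
The cache sizing described in the hunk above comes down to a single numeric
parameter in recoll.conf. A minimal sketch, with an illustrative value (not a
recommendation or a default):

    # Maximum size, in megabytes, of the Web page cache; once the limit
    # is reached the oldest entries are erased to make room
    # (250 is only an example value)
    webcachemaxmbs = 250

As the parameter help text in the C++ hunk further down suggests, reducing the
value later does not shrink an existing cache file (the extra space is only
wasted at the end).
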
@@ -2475,7 +2470,7 @@ fs.inotify.max_user_watches=32768
file inside the configuration directory. The sample file
distributed with &RCL; (which you should be able to find under
/usr/share/recoll/examples/fragbuts.xml ),
- contains an example which filters the results from the WEB
+ contains an example which filters the results from the Web
history.
@@ -2485,7 +2480,7 @@ fs.inotify.max_user_watches=32768
-
+
@@ -6115,11 +6110,11 @@ hasextract = False
Debian/Ubuntu apt ,
FreeBSD ports, etc.), or from some type
of "backports" repository providing versions newer than the standard
- ones, or found on the &RCL; WEB site in some
+ ones, or found on the &RCL; Web site in some
cases. The most up-to-date information about Recoll packages can
usually be found on the
- Recoll WEB site downloads
+ Recoll Web site downloads
page
The &WIN; version of Recoll comes in a self-contained setup
diff --git a/src/qtgui/confgui/confguiindex.cpp b/src/qtgui/confgui/confguiindex.cpp
index 4f9e7e57..a915b9a4 100644
--- a/src/qtgui/confgui/confguiindex.cpp
+++ b/src/qtgui/confgui/confguiindex.cpp
@@ -331,7 +331,7 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
{
ConfParamW *bparam = m_w->addParam(
idx, ConfTabsW::CFPT_BOOL, "processwebqueue",
- tr("Process the WEB history queue"),
+ tr("Process the Web history queue"),
tr("Enables indexing Firefox visited pages.
"
"(you need also install the Firefox Recoll plugin)"));
ConfParamW *cparam = m_w->addParam(
@@ -353,6 +353,8 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
"file (only waste space at the end)."
), -1, 1000*1000); // Max 1TB...
m_w->enableLink(bparam, cparam);
+ m_w->addBlurb(idx, tr("Note: old pages will be erased to make space for "
+ "new ones when the maximum size is reached"));
m_w->endOfList(idx);
return true;
}