web page indexing doc and indents

2021-03-11 10:00:11 +01:00 · 2021-03-11 10:00:11 +01:00 · 5c7d0ff96d
commit 5c7d0ff96d
parent f57530e2a6
5 changed files with 108 additions and 121 deletions
--- a/src/common/webstore.cpp
+++ b/src/common/webstore.cpp
@ -37,15 +37,15 @@ WebStore::WebStore(RclConfig *cnf)
    int maxmbs = 40;
    cnf->getConfParam("webcachemaxmbs", &maxmbs);
    if ((m_cache = new CirCache(ccdir)) == 0) {
-    LOGERR("WebStore: cant create CirCache object\n" );
-    return;
+        LOGERR("WebStore: cant create CirCache object\n" );
+        return;
    }
    if (!m_cache->create(int64_t(maxmbs)*1000*1024, CirCache::CC_CRUNIQUE)) {
-    LOGERR("WebStore: cache file creation failed: " <<
+        LOGERR("WebStore: cache file creation failed: " <<
               m_cache->getReason() << "\n");
-    delete m_cache;
-    m_cache = 0;
-    return;
+        delete m_cache;
+        m_cache = 0;
+        return;
    }
 }

@ -57,17 +57,17 @@ WebStore::~WebStore()
 // Read  document from cache. Return the metadata as an Rcl::Doc
 // @param htt Web Hit Type 
 bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc, 
-                    string& data, string *htt)
+                            string& data, string *htt)
 {
    string dict;

    if (m_cache == 0) {
-    LOGERR("WebStore::getFromCache: cache is null\n");
-    return false;
+        LOGERR("WebStore::getFromCache: cache is null\n");
+        return false;
    }
    if (!m_cache->get(udi, dict, &data)) {
-    LOGDEB("WebStore::getFromCache: get failed\n");
-    return false;
+        LOGDEB("WebStore::getFromCache: get failed\n");
+        return false;
    }

    ConfSimple cf(dict, 1);
@ -89,4 +89,3 @@ bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc,
    dotdoc.meta[Rcl::Doc::keyudi] = udi;
    return true;
 }
-
--- a/src/common/webstore.h
+++ b/src/common/webstore.h
@ -21,8 +21,8 @@

 class RclConfig;
 namespace Rcl {
-    class Db;
-    class Doc;
+class Db;
+class Doc;
 }
 class CirCache;

--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
@ -140,9 +140,9 @@ alink="#0000FF">
            "#RCL.INDEXING.REMOVABLE">Removable
            volumes</a></span></dt>
            <dt><span class="sect1">2.5. <a href=
-            "#RCL.INDEXING.WEBQUEUE"><span class=
+            "#RCL.INDEXING.WebQUEUE"><span class=
            "application">Unix</span>-like systems: indexing
-            visited WEB pages</a></span></dt>
+            visited Web pages</a></span></dt>
            <dt><span class="sect1">2.6. <a href=
            "#RCL.INDEXING.EXTATTR"><span class=
            "application">Unix</span>-like systems: using extended
@ -423,7 +423,7 @@ alink="#0000FF">
    <div class="list-of-tables">
      <p><b>List of Tables</b></p>
      <dl>
-        <dt>3.1. <a href="#idm1437">Keyboard shortcuts</a></dt>
+        <dt>3.1. <a href="#idm1438">Keyboard shortcuts</a></dt>
      </dl>
    </div>
    <div class="chapter">
@ -720,7 +720,7 @@ alink="#0000FF">
            <li class="listitem">
              <p>A <a class="ulink" href=
              "https://framagit.org/medoc92/recollwebui" target=
-              "_top">WEB interface</a>.</p>
+              "_top">Web interface</a>.</p>
            </li>
          </ul>
        </div>
@ -1949,10 +1949,10 @@ recollindex -c "$confdir"
          <div>
            <div>
              <h2 class="title" style="clear: both"><a name=
-              "RCL.INDEXING.WEBQUEUE" id=
-              "RCL.INDEXING.WEBQUEUE"></a>2.5.&nbsp;<span class=
+              "RCL.INDEXING.WebQUEUE" id=
+              "RCL.INDEXING.WebQUEUE"></a>2.5.&nbsp;<span class=
              "application">Unix</span>-like systems: indexing
-              visited WEB pages</h2>
+              visited Web pages</h2>
            </div>
          </div>
        </div>
@ -1964,57 +1964,48 @@ recollindex -c "$confdir"
        "application">Beagle</span> indexer, then adapted to
        <span class="application">Recoll</span> and the
        <span class="application">Firefox</span> <span class=
-        "application">XUL</span> API. A new version of the addon
-        has been written to work with the <span class=
-        "application">WebExtensions</span> API, which is the only
-        one supported after <span class=
-        "application">Firefox</span> version 57.</p>
-        <p>The extension works by copying visited WEB pages to an
+        "application">XUL</span> API. The current version of the
+        extension, located in the <a class="ulink" href=
+        "https://addons.mozilla.org/en-US/firefox/addon/recoll-we/"
+        target="_top">Mozilla add-ons repository</a>, uses the
+        <span class="application">WebExtensions</span> API, and
+        works with current <span class="application">Firefox</span>
+        versions.</p>
+        <p>The extension works by copying visited Web pages to an
        indexing queue directory, which <span class=
-        "application">Recoll</span> then processes, indexing the
-        data, storing it into a local cache, then removing the file
-        from the queue.</p>
-        <p>Because the WebExtensions API introduces more
-        constraints to what extensions can do, the new version
-        works with one more step: the files are first created in
-        the browser default downloads location (typically
-        <code class="filename">$HOME/Downloads</code> ), then moved
-        by a script in the old queue location. The script is
-        automatically executed by the <span class=
-        "application">Recoll</span> indexer versions 1.23.5 and
-        newer. It could conceivably be executed independently to
-        make the new browser extension compatible with an older
-        <span class="application">Recoll</span> version (the script
-        is named <span class=
-        "command"><strong>recoll-we-move-files.py</strong></span>).</p>
-        <div class="note" style=
-        "margin-left: 0.5in; margin-right: 0.5in;">
-          <h3 class="title">Note</h3>
-          <p>For the WebExtensions-based version to work, it is
-          necessary to set the <code class=
-          "literal">webdownloadsdir</code> value in the
-          configuration if it was changed from the default
-          <code class="filename">$HOME/Downloads</code> in the
-          browser preferences.</p>
-        </div>
-        <p>The visited WEB pages indexing feature can be enabled on
+        "application">Recoll</span> then processes, storing the
+        data into a local cache, then indexing it, then removing
+        the file from the queue.</p>
+        <p>The visited Web pages indexing feature can be enabled on
        the <span class="application">Recoll</span> side from the
        GUI <span class="guilabel">Index configuration</span>
        panel, or by editing the configuration file (set
        <code class="varname">processwebqueue</code> to 1).</p>
-        <p>A current pointer to the extension can be found, along
-        with up-to-date instructions, on the <a class="ulink" href=
+        <p>The <span class="application">Recoll</span> GUI has a
+        tool to list and edit the contents of the Web cache.
+        (<span class="guimenu">Tools</span> → <span class=
+        "guimenuitem">Webcache editor</span>)</p>
+        <p>You can find more details on Web indexing, its usage and
+        configuration in a <a class="ulink" href=
        "https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory"
-        target="_top">Recoll wiki</a>.</p>
-        <p>A copy of the indexed WEB pages is retained by Recoll in
-        a local cache (from which previews can be fetched). The
-        cache size can be adjusted from the <span class=
-        "guilabel">Index configuration</span> / <span class=
-        "guilabel">Web history</span> panel. Once the maximum size
-        is reached, old pages are purged - both from the cache and
-        the index - to make room for new ones, so you need to
-        explicitly archive in some other place the pages that you
-        want to keep indefinitely.</p>
+        target="_top">Recoll 'Howto' entry</a>.</p>
+        <div class="note" style=
+        "margin-left: 0.5in; margin-right: 0.5in;">
+          <h3 class="title">The cache is not an archive</h3>
+          <p>A copy of the indexed Web pages is retained by Recoll
+          in a local cache (from which data is fetched for
+          previews, or when resetting the index). The cache has a
+          maximum size, which can be adjusted from the <span class=
+          "guilabel">Index configuration</span> / <span class=
+          "guilabel">Web history</span> panel (<code class=
+          "literal">webcachemaxmbs</code> parameter in <code class=
+          "filename">recoll.conf</code>). Once the maximum size is
+          reached, old pages are erased to make room for new ones.
+          The pages which you want to keep indefinitely need to be
+          explicitly archived elsewhere. Using a very high value
+          for the cache size can avoid data erasure, but see the
+          above 'Howto' page for more details and gotchas.</p>
+        </div>
      </div>
      <div class="sect1">
        <div class="titlepage">
@ -3473,14 +3464,14 @@ fs.inotify.max_user_watches=32768
          be able to find under <code class=
          "filename">/usr/share/recoll/examples/fragbuts.xml</code>),
          contains an example which filters the results from the
-          WEB history.</p>
+          Web history.</p>
          <p>Here follows an example:</p>
          <pre class="programlisting">
 &lt;?xml version="1.0" encoding="UTF-8"?&gt;
 &lt;fragbuts version="1.0"&gt;

  &lt;radiobuttons&gt;
-    &lt;!-- Actually useful: toggle WEB queue results inclusion --&gt;
+    &lt;!-- Actually useful: toggle Web queue results inclusion --&gt;
    &lt;fragbut&gt;
      &lt;label&gt;Include Web Results&lt;/label&gt;
      &lt;frag&gt;&lt;/frag&gt;
@ -3996,7 +3987,7 @@ fs.inotify.max_user_watches=32768
          given context (e.g. within a preview window, within the
          result table).</p>
          <div class="table">
-            <a name="idm1437" id="idm1437"></a>
+            <a name="idm1438" id="idm1438"></a>
            <p class="title"><b>Table&nbsp;3.1.&nbsp;Keyboard
            shortcuts</b></p>
            <div class="table-contents">
@ -7940,11 +7931,11 @@ hasextract = False
        "application">FreeBSD</span> ports, etc.), or from some
        type of "backports" repository providing versions newer
        than the standard ones, or found on the <span class=
-        "application">Recoll</span> WEB site in some cases. The
+        "application">Recoll</span> Web site in some cases. The
        most up-to-date information about Recoll packages can
        usually be found on the <a class="ulink" href=
        "http://www.recoll.org/pages/download.html" target=
-        "_top"><span class="application">Recoll</span> WEB site
+        "_top"><span class="application">Recoll</span> Web site
        downloads page</a></p>
        <p>The <span class="application">Windows</span> version of
        Recoll comes in a self-contained setup file, there is
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@ -282,7 +282,7 @@
        <ulink url="https://www.lesbonscomptes.com/recoll/pages/download.html">Search Provider</ulink>.
        </para></listitem>
        <listitem><para>A
-        <ulink url="https://framagit.org/medoc92/recollwebui">WEB interface</ulink>.
+        <ulink url="https://framagit.org/medoc92/recollwebui">Web interface</ulink>.
        </para></listitem>
      </itemizedlist>
      </para>
@ -1257,56 +1257,51 @@ recollindex -c "$confdir"
  </simplesect>
  </sect1>

-    <sect1 id="RCL.INDEXING.WEBQUEUE">
-      <title>&LIN;: indexing visited WEB pages</title>
+    <sect1 id="RCL.INDEXING.WebQUEUE">
+      <title>&LIN;: indexing visited Web pages</title>

      <para>With the help of a <application>Firefox</application>
-      extension, &RCL; can index the Internet pages that you visit. The
-      extension has a long history: it was initially designed for the
-      <application>Beagle</application> indexer, then adapted to &RCL; and
-      the <application>Firefox</application> <application>XUL</application>
-      API. A new version of the addon has been written to work with the
-      <application>WebExtensions</application> API, which is the only one
-      supported after <application>Firefox</application> version 57.</para>
+        extension, &RCL; can index the Internet pages that you visit. The
+        extension has a long history: it was initially designed for
+        the <application>Beagle</application> indexer, then adapted to
+        &RCL; and
+        the <application>Firefox</application> <application>XUL</application>
+        API. The current version of the extension, located in
+        the <ulink url="https://addons.mozilla.org/en-US/firefox/addon/recoll-we/">Mozilla
+        add-ons repository</ulink>, uses
+        the <application>WebExtensions</application> API, and works with
+        current <application>Firefox</application> versions.</para>

-      <para>The extension works by copying visited WEB pages to an indexing
-      queue directory, which &RCL; then processes, indexing the data,
-      storing it into a local cache, then removing the file from the
-      queue.</para>
-
-      <para>Because the WebExtensions API introduces more constraints to
-      what extensions can do, the new version works with one
-      more step: the files are first created in the browser default
-      downloads location (typically <filename>$HOME/Downloads</filename> ),
-      then moved by a script in the old queue location. The script is
-      automatically executed by the &RCL; indexer versions 1.23.5 and
-      newer. It could conceivably be executed independently to make the new
-      browser extension compatible with an older &RCL; version (the script
-      is named <command>recoll-we-move-files.py</command>).</para>
-
-      <note><para>For the WebExtensions-based version to work, it is
-      necessary to set the <literal>webdownloadsdir</literal> value in the
-      configuration if it was changed from the default
-      <filename>$HOME/Downloads</filename> in the browser
-      preferences.</para></note>
+      <para>The extension works by copying visited Web pages to an indexing
+        queue directory, which &RCL; then processes, storing the data into a
+        local cache, then indexing it, then removing the file from the
+        queue.</para>
      
-      <para>The visited WEB pages indexing feature can be enabled on the
-      &RCL; side from the GUI <guilabel>Index configuration</guilabel>
-      panel, or by editing the configuration file (set
-      <varname>processwebqueue</varname> to 1).</para>
+      <para>The visited Web pages indexing feature can be enabled on the
+        &RCL; side from the GUI <guilabel>Index configuration</guilabel>
+        panel, or by editing the configuration file (set
+        <varname>processwebqueue</varname> to 1).</para>

-      <para>A current pointer to the extension can be found, along with
-      up-to-date instructions, on the
-      <ulink url="&FAQS;IndexWebHistory">Recoll wiki</ulink>.</para>
+      <para>The &RCL; GUI has a tool to list and edit the contents of the
+        Web
+        cache. (<menuchoice><guimenu>Tools</guimenu><guimenuitem>Webcache
+        editor</guimenuitem></menuchoice>)</para>

-      <para>A copy of the indexed WEB pages is retained by Recoll in a
-      local cache (from which previews can be fetched). The cache size can
-      be adjusted from the <guilabel>Index configuration</guilabel> /
-      <guilabel>Web history</guilabel> panel. Once the maximum size
-      is reached, old pages are purged - both from the cache and the index
-      - to make room for new ones, so you need to explicitly archive in
-      some other place the pages that you want to keep
-      indefinitely.</para> 
+      <para>You can find more details on Web indexing, its usage and configuration
+        in a <ulink url="&FAQS;IndexWebHistory">Recoll 'Howto' entry</ulink>.</para>
+
+      <note><title>The cache is not an archive</title><para>A copy of
+          the indexed Web pages is retained by Recoll in a local cache
+          (from which data is fetched for previews, or when resetting the
+          index). The cache has a maximum size, which can be adjusted from
+          the <guilabel>Index configuration</guilabel> / <guilabel>Web
+          history</guilabel> panel (<literal>webcachemaxmbs</literal>
+          parameter in <filename>recoll.conf</filename>). Once the maximum
+          size is reached, old pages are erased to make room for new ones.
+          The pages which you want to keep indefinitely need to be
+          explicitly archived elsewhere. Using a very high value for
+          the cache size can avoid data erasure, but see the above 'Howto'
+          page for more details and gotchas.</para></note>

    </sect1>

@ -2475,7 +2470,7 @@ fs.inotify.max_user_watches=32768
        file inside the configuration directory. The sample file
        distributed with &RCL; (which you should be able to find under
        <filename>/usr/share/recoll/examples/fragbuts.xml</filename>),
-        contains an example which filters the results from the WEB
+        contains an example which filters the results from the Web
        history.</para>


@ -2485,7 +2480,7 @@ fs.inotify.max_user_watches=32768
 <fragbuts version="1.0">

  <radiobuttons>
-    <!-- Actually useful: toggle WEB queue results inclusion -->
+    <!-- Actually useful: toggle Web queue results inclusion -->
    <fragbut>
      <label>Include Web Results</label>
      <frag></frag>
@ -6115,11 +6110,11 @@ hasextract = False
      <application>Debian/Ubuntu apt</application>,
      <application>FreeBSD</application> ports, etc.), or from some type
      of "backports" repository providing versions newer than the standard
-      ones, or found on the &RCL; WEB site in some
+      ones, or found on the &RCL; Web site in some
      cases. The most up-to-date information about Recoll packages can
      usually be found on the
      <ulink url="http://www.recoll.org/pages/download.html">
-        <application>Recoll</application> WEB site downloads
+        <application>Recoll</application> Web site downloads
      page</ulink></para> 

      <para>The &WIN; version of Recoll comes in a self-contained setup
--- a/src/qtgui/confgui/confguiindex.cpp
+++ b/src/qtgui/confgui/confguiindex.cpp
@ -331,7 +331,7 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
 {
    ConfParamW *bparam = m_w->addParam(
        idx, ConfTabsW::CFPT_BOOL, "processwebqueue",
-        tr("Process the WEB history queue"),
+        tr("Process the Web history queue"),
        tr("Enables indexing Firefox visited pages.<br>"
           "(you need also install the Firefox Recoll plugin)"));
    ConfParamW *cparam = m_w->addParam(
@ -353,6 +353,8 @@ bool ConfIndexW::setupWebHistoryPanel(int idx)
           "file (only waste space at the end)."
            ), -1, 1000*1000); // Max 1TB...
    m_w->enableLink(bparam, cparam);
+    m_w->addBlurb(idx, tr("Note: old pages will be erased to make space for "
+                          "new ones when the maximum size is reached"));
    m_w->endOfList(idx);
    return true;
 }