Add and interface a script to move the files generated by the WebExtensions new browser extension into the web input queue

2017-11-24 15:30:27 +01:00 · 2017-11-24 15:30:27 +01:00 · 5afe1aa631
commit 5afe1aa631
parent c123b17f19
12 changed files with 205 additions and 53 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -636,6 +636,7 @@ filters/rclxml.py \
 filters/rclxmp.py \
 filters/rclxslt.py \
 filters/rclzip \
 filters/recoll-we-move-files.py \
 filters/ppt-dump.py \
 filters/xls-dump.py \
 filters/xlsxmltocsv.py \
@ -645,6 +646,7 @@ python/recoll/recoll/rclconfig.py
 install-data-hook: 
 	(cd $(DESTDIR)/$(filterdir); \
 	chmod a+x rcl* ppt-dump.py xls-dump.py xlsxmltocsv.py hotrecoll.py; \
 	chmod a+x recoll-we-move-files.py; \
 	chmod 0644 msodump.zip rclexecm.py rcllatinstops.zip rclconfig.py) 
 if MAKEUSERDOC
--- a/src/VERSION
+++ b/src/VERSION
@ -1 +1 @@
-1.23.4
+1.23.5
--- a/src/doc/user/Makefile
+++ b/src/doc/user/Makefile
@ -39,7 +39,7 @@ index.html: usermanual.xml
 usermanual.pdf: usermanual.xml
 	dblatex $<
-UTILBUILDS=/home/dockes/projets/builds/medocutils/
+UTILBUILDS=/home/dockes/tmp/builds/medocutils/
 recoll-conf-xml:
 	$(UTILBUILDS)/confxml --docbook \
        --idprefix=RCL.INSTALL.CONFIG.RECOLLCONF  \
--- a/src/doc/user/recoll.conf.xml
+++ b/src/doc/user/recoll.conf.xml
@ -83,7 +83,7 @@ be ignored inside zip archives. This is used directly by
 the zip handler, and has a function similar to skippedNames, but works
 independantly. Can be redefined for subdirectories. Supported by recoll
 1.20 and newer. See
-https://www.lesbonscomptes.com/recoll/faqsandhowtos/Filtering%20out%20Zip%20archive%20members
+https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html
 </para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.FOLLOWLINKS">
 <term><varname>followLinks</varname></term>
@ -362,9 +362,17 @@ Default: 40 MB.
 Reducing the size will not physically truncate the file.</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.WEBQUEUEDIR">
 <term><varname>webqueuedir</varname></term>
-<listitem><para>The path to the Web indexing queue. This is
+<listitem><para>The path to the Web indexing queue. This used to be
-hard-coded in the plugin as ~/.recollweb/ToIndex so there should be no
+hard-coded in the old plugin as ~/.recollweb/ToIndex so there would be no
-need or possibility to change it.</para></listitem></varlistentry>
+need or possibility to change it, but the WebExtensions plugin now downloads
 the files to the user Downloads directory, and a script moves them to
 webqueuedir. The script reads this value from the config so it has become
 possible to change it.</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.WEBDOWNLOADSDIR">
 <term><varname>webdownloadsdir</varname></term>
 <listitem><para>The path to browser downloads directory. This is
 where the new browser add-on extension has to create the files. They are
 then moved by a script to webqueuedir.</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ASPELLDICDIR">
 <term><varname>aspellDicDir</varname></term>
 <listitem><para>Aspell dictionary storage directory location. The
--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
@ -131,8 +131,8 @@ alink="#0000FF">
              </dl>
            </dd>
            <dt><span class="sect1">2.4. <a href=
-            "#RCL.INDEXING.WEBQUEUE">Indexing WEB pages you
+            "#RCL.INDEXING.WEBQUEUE">Indexing the WEB pages which
-            wisit</a></span></dt>
+            you visit.</a></span></dt>
            <dt><span class="sect1">2.5. <a href=
            "#RCL.INDEXING.EXTATTR">Extended attributes
            data</a></span></dt>
@ -1505,27 +1505,56 @@ thrQSizes = -1 -1 -1
            <div>
              <h2 class="title" style="clear: both"><a name=
              "RCL.INDEXING.WEBQUEUE" id=
-              "RCL.INDEXING.WEBQUEUE"></a>2.4.<2E>Indexing WEB pages
+              "RCL.INDEXING.WEBQUEUE"></a>2.4.&nbsp;Indexing the WEB
-              you wisit</h2>
+              pages which you visit.</h2>
            </div>
          </div>
        </div>
        <p>With the help of a <span class=
        "application">Firefox</span> extension, <span class=
        "application">Recoll</span> can index the Internet pages
-        that you visit. The extension was initially designed for
+        that you visit. The extension has a long history: it was
-        the <span class="application">Beagle</span> indexer, but it
+        initially designed for the <span class=
-        has recently be renamed and better adapted to <span class=
+        "application">Beagle</span> indexer, then adapted to
-        "application">Recoll</span>.</p>
+        <span class="application">Recoll</span> and the
        <span class="application">Firefox</span> <span class=
        "application">XUL</span> API. A new version of the addon
        has been written to work with the <span class=
        "application">WebExtensions</span> API, which is the only
        one supported after <span class=
        "application">Firefox</span> version 57.</p>
        <p>The extension works by copying visited WEB pages to an
        indexing queue directory, which <span class=
        "application">Recoll</span> then processes, indexing the
        data, storing it into a local cache, then removing the file
        from the queue.</p>
-        <p>This feature can be enabled in the GUI <span class=
+        <p>Because the WebExtensions API introduces more
-        "guilabel">Index configuration</span> panel, or by editing
+        constraints to what extensions can do, the new version
-        the configuration file (set <code class=
+        works with one more step: the files are first created in
-        "varname">processwebqueue</code> to 1).</p>
+        the browser default downloads location (typically
        <code class="filename">$HOME/Downloads</code> ), then moved
         by a script to the old queue location. The script is
        automatically executed by the <span class=
        "application">Recoll</span> indexer versions 1.23.5 and
         newer. It could conceivably be executed independently to
        make the new browser extension compatible with an older
        <span class="application">Recoll</span> version (the script
        is named <span class=
        "command"><strong>recoll-we-move-files.py</strong></span>).</p>
        <div class="note" style=
        "margin-left: 0.5in; margin-right: 0.5in;">
          <h3 class="title">Note</h3>
          <p>For the WebExtensions-based version to work, it is
          necessary to set the <code class=
          "literal">webdownloadsdir</code> value in the
          configuration if it was changed from the default
          <code class="filename">$HOME/Downloads</code> in the
          browser preferences.</p>
        </div>
        <p>The visited WEB pages indexing feature can be enabled in
        the GUI <span class="guilabel">Index configuration</span>
        panel, or by editing the configuration file (set
        <code class="varname">processwebqueue</code> to 1).</p>
        <p>A current pointer to the extension can be found, along
        with up-to-date instructions, on the <a class="ulink" href=
        "https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory"
@ -1570,8 +1599,10 @@ thrQSizes = -1 -1 -1
              the file MIME type.</p>
            </dd>
            <dt><span class="term">charset</span></dt>
-            <dd>If set, this defines the file character set (mostly
+            <dd>
-            useful for plain text files).</dd>
+              <p>If set, this defines the file character set
              (mostly useful for plain text files).</p>
            </dd>
          </dl>
        </div>
        <p>By default, other attributes are handled as <span class=
@ -7854,7 +7885,7 @@ thesame = "some string with spaces"
                function similar to skippedNames, but works
                independantly. Can be redefined for subdirectories.
                Supported by recoll 1.20 and newer. See
-                https://www.lesbonscomptes.com/recoll/faqsandhowtos/Filtering%20out%20Zip%20archive%20members</p>
+                https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html</p>
              </dd>
              <dt>
                <a name="RCL.INSTALL.CONFIG.RECOLLCONF.FOLLOWLINKS"
@ -8370,10 +8401,25 @@ thesame = "some string with spaces"
                "RCL.INSTALL.CONFIG.RECOLLCONF.WEBQUEUEDIR"></a><span class="term"><code class="varname">webqueuedir</code></span>
              </dt>
              <dd>
-                <p>The path to the Web indexing queue. This is
+                <p>The path to the Web indexing queue. This used to
-                hard-coded in the plugin as ~/.recollweb/ToIndex so
+                be hard-coded in the old plugin as
-                there should be no need or possibility to change
+                ~/.recollweb/ToIndex so there would be no need or
-                it.</p>
+                possibility to change it, but the WebExtensions
                plugin now downloads the files to the user
                Downloads directory, and a script moves them to
                webqueuedir. The script reads this value from the
                config so it has become possible to change it.</p>
              </dd>
              <dt>
                <a name=
                "RCL.INSTALL.CONFIG.RECOLLCONF.WEBDOWNLOADSDIR" id=
                "RCL.INSTALL.CONFIG.RECOLLCONF.WEBDOWNLOADSDIR"></a><span class="term"><code class="varname">webdownloadsdir</code></span>
              </dt>
              <dd>
                <p>The path to browser downloads directory. This is
                where the new browser add-on extension has to
                create the files. They are then moved by a script
                to webqueuedir.</p>
              </dd>
              <dt>
                <a name=
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
@ -965,27 +965,46 @@ thrQSizes = -1 -1 -1
    </sect1>
    <sect1 id="RCL.INDEXING.WEBQUEUE">
-      <title>Indexing WEB pages you wisit</title>
+      <title>Indexing the WEB pages which you visit.</title>
      <para>With the help of a <application>Firefox</application>
      extension, &RCL; can index the Internet pages that you visit. The
-      extension was initially designed for the
+      extension has a long history: it was initially designed for the
-      <application>Beagle</application> indexer, but it has recently be
+      <application>Beagle</application> indexer, then adapted to &RCL; and
-      renamed and better adapted to &RCL;.</para>
+      the <application>Firefox</application> <application>XUL</application>
      API. A new version of the addon has been written to work with the
      <application>WebExtensions</application> API, which is the only one
      supported after <application>Firefox</application> version 57.</para>
      <para>The extension works by copying visited WEB pages to an indexing
-         queue directory, which &RCL; then processes, indexing the data,
+      queue directory, which &RCL; then processes, indexing the data,
-         storing it into a local cache, then removing the file from the
+      storing it into a local cache, then removing the file from the
-         queue.</para>
+      queue.</para>
-      <para>This feature can be enabled in the GUI 
+      <para>Because the WebExtensions API introduces more constraints to
-        <guilabel>Index configuration</guilabel>
+      what extensions can do, the new version works with one
-	panel, or by editing the configuration file (set
+      more step: the files are first created in the browser default
-	<varname>processwebqueue</varname> to 1).</para>
+      downloads location (typically <filename>$HOME/Downloads</filename> ),
       then moved by a script to the old queue location. The script is
      automatically executed by the &RCL; indexer versions 1.23.5 and
       newer. It could conceivably be executed independently to make the new
      browser extension compatible with an older &RCL; version (the script
      is named <command>recoll-we-move-files.py</command>).</para>
      <note><para>For the WebExtensions-based version to work, it is
      necessary to set the <literal>webdownloadsdir</literal> value in the
      configuration if it was changed from the default
      <filename>$HOME/Downloads</filename> in the browser
      preferences.</para></note>
      <para>The visited WEB pages indexing feature can be enabled in the
      GUI <guilabel>Index configuration</guilabel> panel, or by editing the
      configuration file (set <varname>processwebqueue</varname> to
      1).</para> 
      <para>A current pointer to the extension can be found, along with
-        up-to-date instructions, on the
+      up-to-date instructions, on the
-        <ulink url="&FAQS;IndexWebHistory">Recoll wiki</ulink>.</para>
+      <ulink url="&FAQS;IndexWebHistory">Recoll wiki</ulink>.</para>
      <para>A copy of the indexed WEB pages is retained by Recoll in a
      local cache (from which previews can be fetched). The cache size can
@ -1020,8 +1039,8 @@ thrQSizes = -1 -1 -1
          </varlistentry>
          <varlistentry>
            <term>charset</term>
-            <listitem>If set, this defines the file character set
+            <listitem><para>If set, this defines the file character set
-              (mostly useful for plain text files).</listitem>
+              (mostly useful for plain text files).</para></listitem>
          </varlistentry>
        </variablelist>
      </para>
--- a/src/filters/recoll-we-move-files.py
+++ b/src/filters/recoll-we-move-files.py
@ -42,8 +42,10 @@ try:
 except:
    import rclconfig
 verbosity = 0
 def logdeb(s):
-    print("%s"%s, file=sys.stderr)
+    if verbosity >= 4:
        print("%s"%s, file=sys.stderr)
 # Downloaded instances of the same page are suffixed with (nn) by the
 # browser.  We are passed a list of (hash, instancenum, filename)
@ -94,8 +96,12 @@ def usage():
    print("Usage: recoll-we-move-files.py [<downloaddir>]", file=sys.stderr)
    sys.exit(1)
-# Source dir is parameter, else default Downloads directory
+config = rclconfig.RclConfig()
-downloadsdir = os.path.expanduser("~/Downloads")
+
 # Source dir is parameter, else from config else default Downloads directory
 downloadsdir = config.getConfParam("webdownloadsdir")
 if not downloadsdir:
    downloadsdir = os.path.expanduser("~/Downloads")
 if len(sys.argv) == 2:
    mydir = sys.argv[1]
 elif len(sys.argv) == 1:
@ -106,12 +112,13 @@ if not os.path.isdir(mydir):
    usage()
 # Get target webqueue recoll directory from recoll configuration
 config = rclconfig.RclConfig()
 webqueuedir = config.getConfParam("webqueuedir")
 if not webqueuedir:
    webqueuedir = "~/.recollweb/ToIndex"
 webqueuedir = os.path.expanduser(webqueuedir)
-logdeb("webqueuedir is %s" % webqueuedir)
+os.makedirs(webqueuedir, exist_ok = True)
 # logdeb("webqueuedir is %s" % webqueuedir)
 # Get the lists of all files created by the browser addon
 mfiles, cfiles = list_all_files(mydir)
@ -130,7 +137,9 @@ cfiles = delete_previous_instances(cfiles, downloadsdir)
 for hash in cfiles.keys():
    if hash in mfiles.keys():
        newname = "firefox-recoll-web-"+hash
-        shutil.move(cfiles[hash], os.path.join(webqueuedir, newname))
+        shutil.move(os.path.join(downloadsdir, cfiles[hash]),
-        shutil.move(mfiles[hash], os.path.join(webqueuedir, "." + newname))
+                    os.path.join(webqueuedir, newname))
        shutil.move(os.path.join(downloadsdir, mfiles[hash]),
                    os.path.join(webqueuedir, "." + newname))
--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@ -125,6 +125,7 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun, int flags)
    }
 #ifndef DISABLE_WEB_INDEXER
    if (m_doweb && (typestorun & IxTWebQueue)) {
        runWebFilesMoverScript(m_config);
        deleteZ(m_webindexer);
        m_webindexer = new BeagleQueueIndexer(m_config, &m_db, m_updater);
        if (!m_webindexer || !m_webindexer->index()) {
--- a/src/index/rclmonprc.cpp
+++ b/src/index/rclmonprc.cpp
@ -464,15 +464,22 @@ bool startMonitor(RclConfig *conf, int opts)
    bool timedout;
    time_t lastauxtime = time(0);
    time_t lastixtime = lastauxtime;
    time_t lastmovetime = 0;
    bool didsomething = false;
    list<string> modified;
    list<string> deleted;
    ;
    // Set a relatively short timeout for better monitoring of exit requests
    while (true) {
        time_t now = time(0);
        if (now - lastmovetime > ixinterval) {
            lastmovetime = now;
            runWebFilesMoverScript(conf);
        }
        {
            // Wait for event or timeout.
            // Set a relatively short timeout for better monitoring of
            // exit requests.
            std::unique_lock<std::mutex> lock = rclEQ.wait(2, &timedout);
            // x11IsAlive() can't be called from ok() because both
@ -525,9 +532,9 @@ bool startMonitor(RclConfig *conf, int opts)
            }
        }
        now = time(0);
 	// Process. We don't do this every time but let the lists accumulate
        // a little, this saves processing. Start at once if list is big.
        time_t now = time(0);
        if (expeditedIndexingRequested(conf) ||
 	    (now - lastixtime > ixinterval) || 
 	    (deleted.size() + modified.size() > 20)) {
@ -553,8 +560,9 @@ bool startMonitor(RclConfig *conf, int opts)
        }
 	// Recreate the auxiliary dbs every hour at most.
-	if (didsomething && time(0) - lastauxtime > auxinterval) {
+        now = time(0);
-	    lastauxtime = time(0);
+	if (didsomething && now - lastauxtime > auxinterval) {
 	    lastauxtime = now;
 	    didsomething = false;
 	    if (!createAuxDbs(conf)) {
 		// We used to bail out on error here. Not anymore,
--- a/src/index/recollindex.cpp
+++ b/src/index/recollindex.cpp
@ -337,6 +337,48 @@ static bool checktopdirs(RclConfig *config, vector<string>& nonexist)
    return true;
 }
 /* Run the recoll-we-move-files.py helper script if the browser downloads
    directory changed since the previous call.

    The downloads directory is taken from the "webdownloadsdir"
    configuration parameter (default ~/Downloads) and is only used here
    to watch the directory modification time: the script is executed
    without arguments.

    Returns false if the script can't be found or if it exits with a
    non-zero status, true otherwise (including when the directory did not
    change and nothing was run). */
 bool runWebFilesMoverScript(RclConfig *config)
 {
     static string downloadsdir;
     if (downloadsdir.empty()) {
         /* The parameter name must be the one documented in recoll.conf
            and read by the script: webdownloadsdir. */
         if (!config->getConfParam("webdownloadsdir", downloadsdir) ||
             downloadsdir.empty()) {
             downloadsdir = "~/Downloads";
         }
         /* The configured value may be tilde-relative too (the sample
            configuration uses ~/Downloads): expand in all cases so that
            stat() below gets a real path. */
         downloadsdir = path_tildexpand(downloadsdir);
     }
     static string cmdpath;
     if (cmdpath.empty()) {
         cmdpath = config->findFilter("recoll-we-move-files.py");
         if (cmdpath.empty()) {
             LOGERR("runWFMoverScript: recoll-we-move-files.py not found\n");
             return false;
         }
     }

     /* Arrange to not actually run the script if the directory did not change */
     static time_t dirmtime;
     time_t ndirmtime = 0;
     struct stat st;
     if (::stat(downloadsdir.c_str(), &st) == 0) {
         ndirmtime = st.st_mtime;
     }
     /* If stat fails, presumably Downloads does not exist or is not
        accessible, dirmtime and ndirmtime stay at 0, and we never
        execute the script, which is the right thing. */
     if (dirmtime != ndirmtime) {
         /* The script is going to change the directory, so updating
            dirmtime before it runs means that we are going to execute
            it one time too many (it will run without doing anything),
            but we can't set the mtime to after the run in case files
            are created during the run. */
         dirmtime = ndirmtime;
         ExecCmd cmd;
         int status = cmd.doexec(cmdpath, {});
         return status == 0;
     }
     return true;
 }
 static const char *thisprog;
 static const char usage [] =
--- a/src/index/recollindex.h
+++ b/src/index/recollindex.h
@ -26,6 +26,16 @@ extern bool indexfiles(RclConfig *config, std::list<std::string> &filenames);
 extern bool purgefiles(RclConfig *config, std::list<std::string> &filenames);
 extern bool createAuxDbs(RclConfig *config);
 /** 
 * Helper method for executing the recoll-we (new WebExtensions plugin) helper
 * script. This moves files from the browser download directory (the only
 * place where the browser accepts to create them) to the web queue
 * directory. This keeps the C++ code compatible with both the old and
 * the new addon.
 * The script is executed before a batch pass, or from time to time in
 * the monitor, if web processing is enabled.
 *
 * @return false if the script could not be found or exited with a non-zero
 *   status, true otherwise (including when the script did not need to run).
 */
 extern bool runWebFilesMoverScript(RclConfig *);
 extern int stopindexing;
 // Try to explain what went wrong...
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
@ -438,6 +438,13 @@ webcachemaxmbs = 40
 # possible to change it.</descr></var>
 #webqueuedir = ~/.recollweb/ToIndex
 # <var name="webdownloadsdir" type="fn">
 #
 # <brief>The path to browser downloads directory.</brief><descr>This is
 # where the new browser add-on extension has to create the files. They are
 # then moved by a script to webqueuedir.</descr></var>
 #webdownloadsdir = ~/Downloads
 # <var name="aspellDicDir" type="dfn">
 #
 # <brief>Aspell dictionary storage directory location.</brief> <descr>The