Add and interface a script to move the files generated by the WebExtensions new browser extension into the web input queue

This commit is contained in:
Jean-Francois Dockes 2017-11-24 15:30:27 +01:00
parent c123b17f19
commit 5afe1aa631
12 changed files with 205 additions and 53 deletions

View File

@ -636,6 +636,7 @@ filters/rclxml.py \
filters/rclxmp.py \ filters/rclxmp.py \
filters/rclxslt.py \ filters/rclxslt.py \
filters/rclzip \ filters/rclzip \
filters/recoll-we-move-files.py \
filters/ppt-dump.py \ filters/ppt-dump.py \
filters/xls-dump.py \ filters/xls-dump.py \
filters/xlsxmltocsv.py \ filters/xlsxmltocsv.py \
@ -645,6 +646,7 @@ python/recoll/recoll/rclconfig.py
install-data-hook: install-data-hook:
(cd $(DESTDIR)/$(filterdir); \ (cd $(DESTDIR)/$(filterdir); \
chmod a+x rcl* ppt-dump.py xls-dump.py xlsxmltocsv.py hotrecoll.py; \ chmod a+x rcl* ppt-dump.py xls-dump.py xlsxmltocsv.py hotrecoll.py; \
chmod a+x recoll-we-move-files.py; \
chmod 0644 msodump.zip rclexecm.py rcllatinstops.zip rclconfig.py) chmod 0644 msodump.zip rclexecm.py rcllatinstops.zip rclconfig.py)
if MAKEUSERDOC if MAKEUSERDOC

View File

@ -1 +1 @@
1.23.4 1.23.5

View File

@ -39,7 +39,7 @@ index.html: usermanual.xml
usermanual.pdf: usermanual.xml usermanual.pdf: usermanual.xml
dblatex $< dblatex $<
UTILBUILDS=/home/dockes/projets/builds/medocutils/ UTILBUILDS=/home/dockes/tmp/builds/medocutils/
recoll-conf-xml: recoll-conf-xml:
$(UTILBUILDS)/confxml --docbook \ $(UTILBUILDS)/confxml --docbook \
--idprefix=RCL.INSTALL.CONFIG.RECOLLCONF \ --idprefix=RCL.INSTALL.CONFIG.RECOLLCONF \

View File

@ -83,7 +83,7 @@ be ignored inside zip archives. This is used directly by
the zip handler, and has a function similar to skippedNames, but works the zip handler, and has a function similar to skippedNames, but works
independantly. Can be redefined for subdirectories. Supported by recoll independantly. Can be redefined for subdirectories. Supported by recoll
1.20 and newer. See 1.20 and newer. See
https://www.lesbonscomptes.com/recoll/faqsandhowtos/Filtering%20out%20Zip%20archive%20members https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html
</para></listitem></varlistentry> </para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.FOLLOWLINKS"> <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.FOLLOWLINKS">
<term><varname>followLinks</varname></term> <term><varname>followLinks</varname></term>
@ -362,9 +362,17 @@ Default: 40 MB.
Reducing the size will not physically truncate the file.</para></listitem></varlistentry> Reducing the size will not physically truncate the file.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.WEBQUEUEDIR"> <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.WEBQUEUEDIR">
<term><varname>webqueuedir</varname></term> <term><varname>webqueuedir</varname></term>
<listitem><para>The path to the Web indexing queue. This is <listitem><para>The path to the Web indexing queue. This used to be
hard-coded in the plugin as ~/.recollweb/ToIndex so there should be no hard-coded in the old plugin as ~/.recollweb/ToIndex so there would be no
need or possibility to change it.</para></listitem></varlistentry> need or possibility to change it, but the WebExtensions plugin now downloads
the files to the user Downloads directory, and a script moves them to
webqueuedir. The script reads this value from the config so it has become
possible to change it.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.WEBDOWNLOADSDIR">
<term><varname>webdownloadsdir</varname></term>
<listitem><para>The path to the browser downloads directory. This is
where the new browser add-on extension has to create the files. They are
then moved by a script to webqueuedir.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ASPELLDICDIR"> <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ASPELLDICDIR">
<term><varname>aspellDicDir</varname></term> <term><varname>aspellDicDir</varname></term>
<listitem><para>Aspell dictionary storage directory location. The <listitem><para>Aspell dictionary storage directory location. The

View File

@ -131,8 +131,8 @@ alink="#0000FF">
</dl> </dl>
</dd> </dd>
<dt><span class="sect1">2.4. <a href= <dt><span class="sect1">2.4. <a href=
"#RCL.INDEXING.WEBQUEUE">Indexing WEB pages you "#RCL.INDEXING.WEBQUEUE">Indexing the WEB pages which
wisit</a></span></dt> you wisit.</a></span></dt>
<dt><span class="sect1">2.5. <a href= <dt><span class="sect1">2.5. <a href=
"#RCL.INDEXING.EXTATTR">Extended attributes "#RCL.INDEXING.EXTATTR">Extended attributes
data</a></span></dt> data</a></span></dt>
@ -1505,27 +1505,56 @@ thrQSizes = -1 -1 -1
<div> <div>
<h2 class="title" style="clear: both"><a name= <h2 class="title" style="clear: both"><a name=
"RCL.INDEXING.WEBQUEUE" id= "RCL.INDEXING.WEBQUEUE" id=
"RCL.INDEXING.WEBQUEUE"></a>2.4.<2E>Indexing WEB pages "RCL.INDEXING.WEBQUEUE"></a>2.4.<2E>Indexing the WEB
you wisit</h2> pages which you wisit.</h2>
</div> </div>
</div> </div>
</div> </div>
<p>With the help of a <span class= <p>With the help of a <span class=
"application">Firefox</span> extension, <span class= "application">Firefox</span> extension, <span class=
"application">Recoll</span> can index the Internet pages "application">Recoll</span> can index the Internet pages
that you visit. The extension was initially designed for that you visit. The extension has a long history: it was
the <span class="application">Beagle</span> indexer, but it initially designed for the <span class=
has recently be renamed and better adapted to <span class= "application">Beagle</span> indexer, then adapted to
"application">Recoll</span>.</p> <span class="application">Recoll</span> and the
<span class="application">Firefox</span> <span class=
"application">XUL</span> API. A new version of the addon
has been written to work with the <span class=
"application">WebExtensions</span> API, which is the only
one supported after <span class=
"application">Firefox</span> version 57.</p>
<p>The extension works by copying visited WEB pages to an <p>The extension works by copying visited WEB pages to an
indexing queue directory, which <span class= indexing queue directory, which <span class=
"application">Recoll</span> then processes, indexing the "application">Recoll</span> then processes, indexing the
data, storing it into a local cache, then removing the file data, storing it into a local cache, then removing the file
from the queue.</p> from the queue.</p>
<p>This feature can be enabled in the GUI <span class= <p>Because the WebExtensions API introduces more
"guilabel">Index configuration</span> panel, or by editing constraints to what extensions can do, the new version
the configuration file (set <code class= works with one more step: the files are first created in
"varname">processwebqueue</code> to 1).</p> the browser default downloads location (typically
<code class="filename">$HOME/Downloads</code> ), then moved
by a script to the old queue location. The script is
automatically executed by the <span class=
"application">Recoll</span> indexer versions 1.23.5 and
newer. It could conceivably be executed independently to
make the new browser extension compatible with an older
<span class="application">Recoll</span> version (the script
is named <span class=
"command"><strong>recoll-we-move-files.py</strong></span>).</p>
<div class="note" style=
"margin-left: 0.5in; margin-right: 0.5in;">
<h3 class="title">Note</h3>
<p>For the WebExtensions-based version to work, it is
necessary to set the <code class=
"literal">webdownloadsdir</code> value in the
configuration if it was changed from the default
<code class="filename">$HOME/Downloads</code> in the
browser preferences.</p>
</div>
<p>The visited WEB pages indexing feature can be enabled in
the GUI <span class="guilabel">Index configuration</span>
panel, or by editing the configuration file (set
<code class="varname">processwebqueue</code> to 1).</p>
<p>A current pointer to the extension can be found, along <p>A current pointer to the extension can be found, along
with up-to-date instructions, on the <a class="ulink" href= with up-to-date instructions, on the <a class="ulink" href=
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory" "https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory"
@ -1570,8 +1599,10 @@ thrQSizes = -1 -1 -1
the file MIME type.</p> the file MIME type.</p>
</dd> </dd>
<dt><span class="term">charset</span></dt> <dt><span class="term">charset</span></dt>
<dd>If set, this defines the file character set (mostly <dd>
useful for plain text files).</dd> <p>If set, this defines the file character set
(mostly useful for plain text files).</p>
</dd>
</dl> </dl>
</div> </div>
<p>By default, other attributes are handled as <span class= <p>By default, other attributes are handled as <span class=
@ -7854,7 +7885,7 @@ thesame = "some string with spaces"
function similar to skippedNames, but works function similar to skippedNames, but works
independantly. Can be redefined for subdirectories. independantly. Can be redefined for subdirectories.
Supported by recoll 1.20 and newer. See Supported by recoll 1.20 and newer. See
https://www.lesbonscomptes.com/recoll/faqsandhowtos/Filtering%20out%20Zip%20archive%20members</p> https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html</p>
</dd> </dd>
<dt> <dt>
<a name="RCL.INSTALL.CONFIG.RECOLLCONF.FOLLOWLINKS" <a name="RCL.INSTALL.CONFIG.RECOLLCONF.FOLLOWLINKS"
@ -8370,10 +8401,25 @@ thesame = "some string with spaces"
"RCL.INSTALL.CONFIG.RECOLLCONF.WEBQUEUEDIR"></a><span class="term"><code class="varname">webqueuedir</code></span> "RCL.INSTALL.CONFIG.RECOLLCONF.WEBQUEUEDIR"></a><span class="term"><code class="varname">webqueuedir</code></span>
</dt> </dt>
<dd> <dd>
<p>The path to the Web indexing queue. This is <p>The path to the Web indexing queue. This used to
hard-coded in the plugin as ~/.recollweb/ToIndex so be hard-coded in the old plugin as
there should be no need or possibility to change ~/.recollweb/ToIndex so there would be no need or
it.</p> possibility to change it, but the WebExtensions
plugin now downloads the files to the user
Downloads directory, and a script moves them to
webqueuedir. The script reads this value from the
config so it has become possible to change it.</p>
</dd>
<dt>
<a name=
"RCL.INSTALL.CONFIG.RECOLLCONF.WEBDOWNLOADSDIR" id=
"RCL.INSTALL.CONFIG.RECOLLCONF.WEBDOWNLOADSDIR"></a><span class="term"><code class="varname">webdownloadsdir</code></span>
</dt>
<dd>
<p>The path to the browser downloads directory. This is
where the new browser add-on extension has to
create the files. They are then moved by a script
to webqueuedir.</p>
</dd> </dd>
<dt> <dt>
<a name= <a name=

View File

@ -965,27 +965,46 @@ thrQSizes = -1 -1 -1
</sect1> </sect1>
<sect1 id="RCL.INDEXING.WEBQUEUE"> <sect1 id="RCL.INDEXING.WEBQUEUE">
<title>Indexing WEB pages you wisit</title> <title>Indexing the WEB pages which you wisit.</title>
<para>With the help of a <application>Firefox</application> <para>With the help of a <application>Firefox</application>
extension, &RCL; can index the Internet pages that you visit. The extension, &RCL; can index the Internet pages that you visit. The
extension was initially designed for the extension has a long history: it was initially designed for the
<application>Beagle</application> indexer, but it has recently be <application>Beagle</application> indexer, then adapted to &RCL; and
renamed and better adapted to &RCL;.</para> the <application>Firefox</application> <application>XUL</application>
API. A new version of the addon has been written to work with the
<application>WebExtensions</application> API, which is the only one
supported after <application>Firefox</application> version 57.</para>
<para>The extension works by copying visited WEB pages to an indexing <para>The extension works by copying visited WEB pages to an indexing
queue directory, which &RCL; then processes, indexing the data, queue directory, which &RCL; then processes, indexing the data,
storing it into a local cache, then removing the file from the storing it into a local cache, then removing the file from the
queue.</para> queue.</para>
<para>This feature can be enabled in the GUI <para>Because the WebExtensions API introduces more constraints to
<guilabel>Index configuration</guilabel> what extensions can do, the new version works with one
panel, or by editing the configuration file (set more step: the files are first created in the browser default
<varname>processwebqueue</varname> to 1).</para> downloads location (typically <filename>$HOME/Downloads</filename> ),
then moved by a script to the old queue location. The script is
automatically executed by the &RCL; indexer versions 1.23.5 and
newer. It could conceivably be executed independently to make the new
browser extension compatible with an older &RCL; version (the script
is named <command>recoll-we-move-files.py</command>).</para>
<note><para>For the WebExtensions-based version to work, it is
necessary to set the <literal>webdownloadsdir</literal> value in the
configuration if it was changed from the default
<filename>$HOME/Downloads</filename> in the browser
preferences.</para></note>
<para>The visited WEB pages indexing feature can be enabled in the
GUI <guilabel>Index configuration</guilabel> panel, or by editing the
configuration file (set <varname>processwebqueue</varname> to
1).</para>
<para>A current pointer to the extension can be found, along with <para>A current pointer to the extension can be found, along with
up-to-date instructions, on the up-to-date instructions, on the
<ulink url="&FAQS;IndexWebHistory">Recoll wiki</ulink>.</para> <ulink url="&FAQS;IndexWebHistory">Recoll wiki</ulink>.</para>
<para>A copy of the indexed WEB pages is retained by Recoll in a <para>A copy of the indexed WEB pages is retained by Recoll in a
local cache (from which previews can be fetched). The cache size can local cache (from which previews can be fetched). The cache size can
@ -1020,8 +1039,8 @@ thrQSizes = -1 -1 -1
</varlistentry> </varlistentry>
<varlistentry> <varlistentry>
<term>charset</term> <term>charset</term>
<listitem>If set, this defines the file character set <listitem><para>If set, this defines the file character set
(mostly useful for plain text files).</listitem> (mostly useful for plain text files).</para></listitem>
</varlistentry> </varlistentry>
</variablelist> </variablelist>
</para> </para>

View File

@ -42,8 +42,10 @@ try:
except: except:
import rclconfig import rclconfig
verbosity = 0
def logdeb(s): def logdeb(s):
print("%s"%s, file=sys.stderr) if verbosity >= 4:
print("%s"%s, file=sys.stderr)
# # wnloaded instances of the same page are suffixed with (nn) by the # # wnloaded instances of the same page are suffixed with (nn) by the
# browser. We are passed a list of (hash, instancenum, filename) # browser. We are passed a list of (hash, instancenum, filename)
@ -94,8 +96,12 @@ def usage():
print("Usage: recoll-we-move-files.py [<downloaddir>]", file=sys.stderr) print("Usage: recoll-we-move-files.py [<downloaddir>]", file=sys.stderr)
sys.exit(1) sys.exit(1)
# Source dir is parameter, else default Downloads directory config = rclconfig.RclConfig()
downloadsdir = os.path.expanduser("~/Downloads")
# Source dir is parameter, else from config else default Downloads directory
downloadsdir = config.getConfParam("webdownloadsdir")
if not downloadsdir:
downloadsdir = os.path.expanduser("~/Downloads")
if len(sys.argv) == 2: if len(sys.argv) == 2:
mydir = sys.argv[1] mydir = sys.argv[1]
elif len(sys.argv) == 1: elif len(sys.argv) == 1:
@ -106,12 +112,13 @@ if not os.path.isdir(mydir):
usage() usage()
# Get target webqueue recoll directory from recoll configuration # Get target webqueue recoll directory from recoll configuration
config = rclconfig.RclConfig()
webqueuedir = config.getConfParam("webqueuedir") webqueuedir = config.getConfParam("webqueuedir")
if not webqueuedir: if not webqueuedir:
webqueuedir = "~/.recollweb/ToIndex" webqueuedir = "~/.recollweb/ToIndex"
webqueuedir = os.path.expanduser(webqueuedir) webqueuedir = os.path.expanduser(webqueuedir)
logdeb("webqueuedir is %s" % webqueuedir) os.makedirs(webqueuedir, exist_ok = True)
# logdeb("webqueuedir is %s" % webqueuedir)
# Get the lists of all files created by the browser addon # Get the lists of all files created by the browser addon
mfiles, cfiles = list_all_files(mydir) mfiles, cfiles = list_all_files(mydir)
@ -130,7 +137,9 @@ cfiles = delete_previous_instances(cfiles, downloadsdir)
for hash in cfiles.keys(): for hash in cfiles.keys():
if hash in mfiles.keys(): if hash in mfiles.keys():
newname = "firefox-recoll-web-"+hash newname = "firefox-recoll-web-"+hash
shutil.move(cfiles[hash], os.path.join(webqueuedir, newname)) shutil.move(os.path.join(downloadsdir, cfiles[hash]),
shutil.move(mfiles[hash], os.path.join(webqueuedir, "." + newname)) os.path.join(webqueuedir, newname))
shutil.move(os.path.join(downloadsdir, mfiles[hash]),
os.path.join(webqueuedir, "." + newname))

View File

@ -125,6 +125,7 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun, int flags)
} }
#ifndef DISABLE_WEB_INDEXER #ifndef DISABLE_WEB_INDEXER
if (m_doweb && (typestorun & IxTWebQueue)) { if (m_doweb && (typestorun & IxTWebQueue)) {
runWebFilesMoverScript(m_config);
deleteZ(m_webindexer); deleteZ(m_webindexer);
m_webindexer = new BeagleQueueIndexer(m_config, &m_db, m_updater); m_webindexer = new BeagleQueueIndexer(m_config, &m_db, m_updater);
if (!m_webindexer || !m_webindexer->index()) { if (!m_webindexer || !m_webindexer->index()) {

View File

@ -464,15 +464,22 @@ bool startMonitor(RclConfig *conf, int opts)
bool timedout; bool timedout;
time_t lastauxtime = time(0); time_t lastauxtime = time(0);
time_t lastixtime = lastauxtime; time_t lastixtime = lastauxtime;
time_t lastmovetime = 0;
bool didsomething = false; bool didsomething = false;
list<string> modified; list<string> modified;
list<string> deleted; list<string> deleted;
;
// Set a relatively short timeout for better monitoring of exit requests
while (true) { while (true) {
time_t now = time(0);
if (now - lastmovetime > ixinterval) {
lastmovetime = now;
runWebFilesMoverScript(conf);
}
{ {
// Wait for event or timeout.
// Set a relatively short timeout for better monitoring of
// exit requests.
std::unique_lock<std::mutex> lock = rclEQ.wait(2, &timedout); std::unique_lock<std::mutex> lock = rclEQ.wait(2, &timedout);
// x11IsAlive() can't be called from ok() because both // x11IsAlive() can't be called from ok() because both
@ -525,9 +532,9 @@ bool startMonitor(RclConfig *conf, int opts)
} }
} }
now = time(0);
// Process. We don't do this every time but let the lists accumulate // Process. We don't do this every time but let the lists accumulate
// a little, this saves processing. Start at once if list is big. // a little, this saves processing. Start at once if list is big.
time_t now = time(0);
if (expeditedIndexingRequested(conf) || if (expeditedIndexingRequested(conf) ||
(now - lastixtime > ixinterval) || (now - lastixtime > ixinterval) ||
(deleted.size() + modified.size() > 20)) { (deleted.size() + modified.size() > 20)) {
@ -553,8 +560,9 @@ bool startMonitor(RclConfig *conf, int opts)
} }
// Recreate the auxiliary dbs every hour at most. // Recreate the auxiliary dbs every hour at most.
if (didsomething && time(0) - lastauxtime > auxinterval) { now = time(0);
lastauxtime = time(0); if (didsomething && now - lastauxtime > auxinterval) {
lastauxtime = now;
didsomething = false; didsomething = false;
if (!createAuxDbs(conf)) { if (!createAuxDbs(conf)) {
// We used to bail out on error here. Not anymore, // We used to bail out on error here. Not anymore,

View File

@ -337,6 +337,48 @@ static bool checktopdirs(RclConfig *config, vector<string>& nonexist)
return true; return true;
} }
/**
 * Run the recoll-we-move-files.py script, which moves the files created
 * by the WebExtensions browser add-on from the browser downloads
 * directory to the web queue directory.
 *
 * The script is only executed when the modification time of the
 * downloads directory differs from the one recorded on the previous
 * call, so calling this function frequently is cheap.
 *
 * @param config the recoll configuration, used to read the
 *        "webdownloadsdir" parameter and to locate the script.
 * @return false if the script could not be found or if it exited with a
 *         non-zero status, true otherwise (including when the script
 *         did not need to run).
 */
bool runWebFilesMoverScript(RclConfig *config)
{
    /* The downloads directory is computed once and cached. The parameter
       name has to be "webdownloadsdir": this is the name which is
       documented and which the script itself reads from the
       configuration. */
    static string downloadsdir;
    if (downloadsdir.empty()) {
        if (!config->getConfParam("webdownloadsdir", downloadsdir) ||
            downloadsdir.empty()) {
            downloadsdir = "~/Downloads";
        }
        /* Expand a leading tilde for the configured value too, else
           stat() would fail on a value like ~/Downloads */
        downloadsdir = path_tildexpand(downloadsdir);
    }

    /* The script path is also cached after the first successful lookup */
    static string cmdpath;
    if (cmdpath.empty()) {
        cmdpath = config->findFilter("recoll-we-move-files.py");
        if (cmdpath.empty()) {
            LOGERR("runWFMoverScript: recoll-we-move-files.py not found\n");
            return false;
        }
    }

    /* Arrange to not actually run the script if the directory did not
       change */
    static time_t dirmtime = 0;
    time_t ndirmtime = 0;
    struct stat st;
    if (::stat(downloadsdir.c_str(), &st) == 0) {
        ndirmtime = st.st_mtime;
    }
    /* If stat fails, presumably Downloads does not exist or is not
       accessible, dirmtime and ndirmtime stay at 0, and we never
       execute the script, which is the right thing. */
    if (dirmtime == ndirmtime) {
        return true;
    }

    /* The script is going to change the directory, so updating
       dirmtime before it runs means that we are going to execute
       it one time too many (it will run without doing anything),
       but we can't set the mtime to after the run in case files
       are created during the run. */
    dirmtime = ndirmtime;
    ExecCmd cmd;
    int status = cmd.doexec(cmdpath, {});
    return status == 0;
}
static const char *thisprog; static const char *thisprog;
static const char usage [] = static const char usage [] =

View File

@ -26,6 +26,16 @@ extern bool indexfiles(RclConfig *config, std::list<std::string> &filenames);
extern bool purgefiles(RclConfig *config, std::list<std::string> &filenames); extern bool purgefiles(RclConfig *config, std::list<std::string> &filenames);
extern bool createAuxDbs(RclConfig *config); extern bool createAuxDbs(RclConfig *config);
/**
 * Helper function for executing the recoll-we (new WebExtensions plugin)
 * helper script. This moves files from the browser download directory
 * (the only place where the browser accepts to create them) to the web
 * queue dir. This keeps the c++ code compatible with old and new addon.
 * The script is executed before a batch pass, or from time to time in
 * the monitor, if web processing is enabled.
 *
 * @return false if the script could not be found or exited with a
 *     non-zero status, else true.
 */
extern bool runWebFilesMoverScript(RclConfig *);
extern int stopindexing; extern int stopindexing;
// Try to explain what went wrong... // Try to explain what went wrong...

View File

@ -438,6 +438,13 @@ webcachemaxmbs = 40
# possible to change it.</descr></var> # possible to change it.</descr></var>
#webqueuedir = ~/.recollweb/ToIndex #webqueuedir = ~/.recollweb/ToIndex
# <var name="webdownloadsdir" type="fn">
#
# <brief>The path to the browser downloads directory.</brief><descr>This is
# where the new browser add-on extension has to create the files. They are
# then moved by a script to webqueuedir.</descr></var>
#webdownloadsdir = ~/Downloads
# <var name="aspellDicDir" type="dfn"> # <var name="aspellDicDir" type="dfn">
# #
# <brief>Aspell dictionary storage directory location.</brief> <descr>The # <brief>Aspell dictionary storage directory location.</brief> <descr>The