diff --git a/src/Makefile.am b/src/Makefile.am index bc33c113..0a5415ff 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -636,6 +636,7 @@ filters/rclxml.py \ filters/rclxmp.py \ filters/rclxslt.py \ filters/rclzip \ +filters/recoll-we-move-files.py \ filters/ppt-dump.py \ filters/xls-dump.py \ filters/xlsxmltocsv.py \ @@ -645,6 +646,7 @@ python/recoll/recoll/rclconfig.py install-data-hook: (cd $(DESTDIR)/$(filterdir); \ chmod a+x rcl* ppt-dump.py xls-dump.py xlsxmltocsv.py hotrecoll.py; \ + chmod a+x recoll-we-move-files.py; \ chmod 0644 msodump.zip rclexecm.py rcllatinstops.zip rclconfig.py) if MAKEUSERDOC diff --git a/src/VERSION b/src/VERSION index 27ddcc14..ca8ec414 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.23.4 +1.23.5 diff --git a/src/doc/user/Makefile b/src/doc/user/Makefile index a4ce4db3..079821d9 100644 --- a/src/doc/user/Makefile +++ b/src/doc/user/Makefile @@ -39,7 +39,7 @@ index.html: usermanual.xml usermanual.pdf: usermanual.xml dblatex $< -UTILBUILDS=/home/dockes/projets/builds/medocutils/ +UTILBUILDS=/home/dockes/tmp/builds/medocutils/ recoll-conf-xml: $(UTILBUILDS)/confxml --docbook \ --idprefix=RCL.INSTALL.CONFIG.RECOLLCONF \ diff --git a/src/doc/user/recoll.conf.xml b/src/doc/user/recoll.conf.xml index 80504303..c05a4453 100644 --- a/src/doc/user/recoll.conf.xml +++ b/src/doc/user/recoll.conf.xml @@ -83,7 +83,7 @@ be ignored inside zip archives. This is used directly by the zip handler, and has a function similar to skippedNames, but works independantly. Can be redefined for subdirectories. Supported by recoll 1.20 and newer. See -https://www.lesbonscomptes.com/recoll/faqsandhowtos/Filtering%20out%20Zip%20archive%20members +https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html followLinks @@ -362,9 +362,17 @@ Default: 40 MB. Reducing the size will not physically truncate the file. webqueuedir -The path to the Web indexing queue. 
This is -hard-coded in the plugin as ~/.recollweb/ToIndex so there should be no -need or possibility to change it. +The path to the Web indexing queue. This used to be +hard-coded in the old plugin as ~/.recollweb/ToIndex so there would be no +need or possibility to change it, but the WebExtensions plugin now downloads +the files to the user Downloads directory, and a script moves them to +webqueuedir. The script reads this value from the config so it has become +possible to change it. + +webdownloadsdir +The path to browser downloads directory. This is +where the new browser add-on extension has to create the files. They are +then moved by a script to webqueuedir. aspellDicDir Aspell dictionary storage directory location. The diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index 179d865a..d88b2a00 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -131,8 +131,8 @@ alink="#0000FF">
2.4. Indexing WEB pages you - wisit
+ "#RCL.INDEXING.WEBQUEUE">Indexing the WEB pages which + you wisit.
2.5. Extended attributes data
@@ -1505,27 +1505,56 @@ thrQSizes = -1 -1 -1

2.4.�Indexing WEB pages - you wisit

+ "RCL.INDEXING.WEBQUEUE">2.4.�Indexing the WEB + pages which you wisit.

With the help of a Firefox extension, Recoll can index the Internet pages - that you visit. The extension was initially designed for - the Beagle indexer, but it - has recently be renamed and better adapted to Recoll.

+ that you visit. The extension has a long history: it was + initially designed for the Beagle indexer, then adapted to + Recoll and the + Firefox XUL API. A new version of the addon + has been written to work with the WebExtensions API, which is the only + one supported after Firefox version 57.

The extension works by copying visited WEB pages to an indexing queue directory, which Recoll then processes, indexing the data, storing it into a local cache, then removing the file from the queue.

-

This feature can be enabled in the GUI Index configuration panel, or by editing - the configuration file (set processwebqueue to 1).

+

Because the WebExtensions API introduces more + constraints to what extensions can do, the new version + works with one more step: the files are first created in + the browser default downloads location (typically + $HOME/Downloads ), then moved + by a script to the old queue location. The script is + automatically executed by the Recoll indexer versions 1.23.5 and + newer. It could conceivably be executed independently to + make the new browser extension compatible with an older + Recoll version (the script + is named recoll-we-move-files.py).

+
+

Note

+

For the WebExtensions-based version to work, it is + necessary to set the webdownloadsdir value in the + configuration if it was changed from the default + $HOME/Downloads in the + browser preferences.

+
+

The visited WEB pages indexing feature can be enabled in + the GUI Index configuration + panel, or by editing the configuration file (set + processwebqueue to 1).

A current pointer to the extension can be found, along with up-to-date instructions, on the

charset
-
If set, this defines the file character set (mostly - useful for plain text files).
+
+

If set, this defines the file character set + (mostly useful for plain text files).

+

By default, other attributes are handled as + https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html

webqueuedir
-

The path to the Web indexing queue. This is - hard-coded in the plugin as ~/.recollweb/ToIndex so - there should be no need or possibility to change - it.

+

The path to the Web indexing queue. This used to + be hard-coded in the old plugin as + ~/.recollweb/ToIndex so there would be no need or + possibility to change it, but the WebExtensions + plugin now downloads the files to the user + Downloads directory, and a script moves them to + webqueuedir. The script reads this value from the + config so it has become possible to change it.

+
+
+ webdownloadsdir +
+
+

The path to browser downloads directory. This is + where the new browser add-on extension has to + create the files. They are then moved by a script + to webqueuedir.

- Indexing WEB pages you wisit + Indexing the WEB pages which you wisit. With the help of a Firefox extension, &RCL; can index the Internet pages that you visit. The - extension was initially designed for the - Beagle indexer, but it has recently be - renamed and better adapted to &RCL;. + extension has a long history: it was initially designed for the + Beagle indexer, then adapted to &RCL; and + the Firefox XUL + API. A new version of the addon has been written to work with the + WebExtensions API, which is the only one + supported after Firefox version 57. The extension works by copying visited WEB pages to an indexing - queue directory, which &RCL; then processes, indexing the data, - storing it into a local cache, then removing the file from the - queue. + queue directory, which &RCL; then processes, indexing the data, + storing it into a local cache, then removing the file from the + queue. - This feature can be enabled in the GUI - Index configuration - panel, or by editing the configuration file (set - processwebqueue to 1). + Because the WebExtensions API introduces more constraints to + what extensions can do, the new version works with one + more step: the files are first created in the browser default + downloads location (typically $HOME/Downloads ), + then moved by a script in the old queue location. The script is + automatically executed by the &RCL; indexer versions 1.23.5 and + newer. It could conceivably be executed independantly to make the new + browser extension compatible with an older &RCL; version (the script + is named recoll-we-move-files.py). + + For the WebExtensions-based version to work, it is + necessary to set the webdownloadsdir value in the + configuration if it was changed from the default + $HOME/Downloads in the browser + preferences. + + The visited WEB pages indexing feature can be enabled in the + GUI Index configuration panel, or by editing the + configuration file (set processwebqueue to + 1). 
A current pointer to the extension can be found, along with - up-to-date instructions, on the - Recoll wiki. + up-to-date instructions, on the + Recoll wiki. A copy of the indexed WEB pages is retained by Recoll in a local cache (from which previews can be fetched). The cache size can @@ -1020,8 +1039,8 @@ thrQSizes = -1 -1 -1 charset - If set, this defines the file character set - (mostly useful for plain text files). + If set, this defines the file character set + (mostly useful for plain text files). diff --git a/src/filters/recoll-we-move-files.py b/src/filters/recoll-we-move-files.py index 791c77fe..7668baf9 100755 --- a/src/filters/recoll-we-move-files.py +++ b/src/filters/recoll-we-move-files.py @@ -42,8 +42,10 @@ try: except: import rclconfig +verbosity = 0 def logdeb(s): - print("%s"%s, file=sys.stderr) + if verbosity >= 4: + print("%s"%s, file=sys.stderr) # # wnloaded instances of the same page are suffixed with (nn) by the # browser. We are passed a list of (hash, instancenum, filename) @@ -94,8 +96,12 @@ def usage(): print("Usage: recoll-we-move-files.py []", file=sys.stderr) sys.exit(1) -# Source dir is parameter, else default Downloads directory -downloadsdir = os.path.expanduser("~/Downloads") +config = rclconfig.RclConfig() + +# Source dir is parameter, else from config else default Downloads directory +downloadsdir = config.getConfParam("webdownloadsdir") +if not downloadsdir: + downloadsdir = os.path.expanduser("~/Downloads") if len(sys.argv) == 2: mydir = sys.argv[1] elif len(sys.argv) == 1: @@ -106,12 +112,13 @@ if not os.path.isdir(mydir): usage() # Get target webqueue recoll directory from recoll configuration -config = rclconfig.RclConfig() webqueuedir = config.getConfParam("webqueuedir") if not webqueuedir: webqueuedir = "~/.recollweb/ToIndex" webqueuedir = os.path.expanduser(webqueuedir) -logdeb("webqueuedir is %s" % webqueuedir) +os.makedirs(webqueuedir, exist_ok = True) + +# logdeb("webqueuedir is %s" % webqueuedir) # Get the lists of all 
files created by the browser addon mfiles, cfiles = list_all_files(mydir) @@ -130,7 +137,9 @@ cfiles = delete_previous_instances(cfiles, downloadsdir) for hash in cfiles.keys(): if hash in mfiles.keys(): newname = "firefox-recoll-web-"+hash - shutil.move(cfiles[hash], os.path.join(webqueuedir, newname)) - shutil.move(mfiles[hash], os.path.join(webqueuedir, "." + newname)) + shutil.move(os.path.join(downloadsdir, cfiles[hash]), + os.path.join(webqueuedir, newname)) + shutil.move(os.path.join(downloadsdir, mfiles[hash]), + os.path.join(webqueuedir, "." + newname)) diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index e5c9b082..099bbc27 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -125,6 +125,7 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun, int flags) } #ifndef DISABLE_WEB_INDEXER if (m_doweb && (typestorun & IxTWebQueue)) { + runWebFilesMoverScript(m_config); deleteZ(m_webindexer); m_webindexer = new BeagleQueueIndexer(m_config, &m_db, m_updater); if (!m_webindexer || !m_webindexer->index()) { diff --git a/src/index/rclmonprc.cpp b/src/index/rclmonprc.cpp index 833ed18b..41cf7589 100644 --- a/src/index/rclmonprc.cpp +++ b/src/index/rclmonprc.cpp @@ -464,15 +464,22 @@ bool startMonitor(RclConfig *conf, int opts) bool timedout; time_t lastauxtime = time(0); time_t lastixtime = lastauxtime; + time_t lastmovetime = 0; bool didsomething = false; list modified; list deleted; - ; - - // Set a relatively short timeout for better monitoring of exit requests while (true) { + time_t now = time(0); + if (now - lastmovetime > ixinterval) { + lastmovetime = now; + runWebFilesMoverScript(conf); + } + { + // Wait for event or timeout. + // Set a relatively short timeout for better monitoring of + // exit requests. std::unique_lock lock = rclEQ.wait(2, &timedout); // x11IsAlive() can't be called from ok() because both @@ -525,9 +532,9 @@ bool startMonitor(RclConfig *conf, int opts) } } + now = time(0); // Process. 
We don't do this every time but let the lists accumulate // a little, this saves processing. Start at once if list is big. - time_t now = time(0); if (expeditedIndexingRequested(conf) || (now - lastixtime > ixinterval) || (deleted.size() + modified.size() > 20)) { @@ -553,8 +560,9 @@ } // Recreate the auxiliary dbs every hour at most. - if (didsomething && time(0) - lastauxtime > auxinterval) { - lastauxtime = time(0); + now = time(0); + if (didsomething && now - lastauxtime > auxinterval) { + lastauxtime = now; didsomething = false; if (!createAuxDbs(conf)) { // We used to bail out on error here. Not anymore, diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index 2daa7d37..48034757 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -337,6 +337,48 @@ static bool checktopdirs(RclConfig *config, vector& nonexist) return true; } +bool runWebFilesMoverScript(RclConfig *config) +{ + static string downloadsdir; + if (downloadsdir.empty()) { + if (!config->getConfParam("webdownloadsdir", downloadsdir)) { + downloadsdir = path_tildexpand("~/Downloads"); + } + } + static string cmdpath; + if (cmdpath.empty()) { + cmdpath = config->findFilter("recoll-we-move-files.py"); + if (cmdpath.empty()) { + LOGERR("runWFMoverScript: recoll-we-move-files.py not found\n"); + return false; + } + } + + /* Arrange to not actually run the script if the directory did not change */ + static time_t dirmtime; + time_t ndirmtime = 0; + struct stat st; + if (::stat(downloadsdir.c_str(), &st) == 0) { + ndirmtime = st.st_mtime; + } + /* If stat fails, presumably Downloads does not exist or is not + accessible, dirmtime and ndirmtime stay at 0, and we never + execute the script, which is the right thing.
*/ + if (dirmtime != ndirmtime) { + /* The script is going to change the directory, so updating + dirmtime before it runs means that we are going to execute + it one time too many (it will run without doing anything), + but we can't set the mtime to after the run in case files + are created during the run. */ + dirmtime = ndirmtime; + ExecCmd cmd; + int status = cmd.doexec(cmdpath, {}); + return status == 0; + } + return true; +} + + static const char *thisprog; static const char usage [] = diff --git a/src/index/recollindex.h b/src/index/recollindex.h index ba92d7cc..6870c5ea 100644 --- a/src/index/recollindex.h +++ b/src/index/recollindex.h @@ -26,6 +26,16 @@ extern bool indexfiles(RclConfig *config, std::list &filenames); extern bool purgefiles(RclConfig *config, std::list &filenames); extern bool createAuxDbs(RclConfig *config); +/** + * Helper method for executing the recoll-we (new WebExtensions plugin) helper + * script. This moves files from the browser download directory (only + * place where the browser accepts to create them), to the web queue + * dir). This keeps the c++ code compatible with old and new addon. + * The script is executed before a batch pass, or from time to time in + * the monitor, if web processing is enabled. + */ +extern bool runWebFilesMoverScript(RclConfig *); + extern int stopindexing; // Try to explain what went wrong... diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index 40128f3c..73734265 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -438,6 +438,13 @@ webcachemaxmbs = 40 # possible to change it. #webqueuedir = ~/.recollweb/ToIndex +# +# +# The path to browser downloads directory.This is +# where the new browser add-on extension has to create the files. They are +# then moved by a script to webqueuedir. +#webdownloadsdir = ~/Downloads + # # # Aspell dictionary storage directory location. The