diff --git a/src/filters/recoll-we-move-files.py b/src/filters/recoll-we-move-files.py new file mode 100755 index 00000000..791c77fe --- /dev/null +++ b/src/filters/recoll-we-move-files.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +# Copyright (C) 2017 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +""" Monitor download directory for pages created by the recoll-we extension. + +The files (content file and metadata side file) are then renamed and +moved to the Recoll web queue directory. + +We recognize recoll files by their pattern: +recoll-we-[cm]-<hash>.rclwe, and the fact that both a c and an m file +must exist. While not absolutely foolproof, this should be quite robust. + +The script is normally executed by recollindex at appropriate times, +but it can also be run by hand. +""" + +import sys +import os +import re +try: + from hashlib import md5 as md5 +except: + import md5 +import shutil + +try: + from recoll import rclconfig +except: + import rclconfig + +def logdeb(s): + print("%s"%s, file=sys.stderr) + +# Downloaded instances of the same page are suffixed with (nn) by the +# browser. We are passed a list of (hash, instancenum, filename) +# triplets, sort it, and keep only the latest file. 
def delete_previous_instances(l, downloadsdir):
    """Keep the newest download of each page and delete the older ones.

    `l` is a list of (hash, instancenum, filename) entries. It is sorted
    in place so that, for each hash, the entry with the highest instance
    number comes first. That entry is kept; every other file with the
    same hash is removed from `downloadsdir`.

    Returns a dict mapping each hash to the (bare) name of the kept file.
    """
    # Tuple key: instance numbers compare numerically whatever their width.
    l.sort(key=lambda e: (e[0], e[1]), reverse=True)
    ret = {}
    for page_hash, _num, fn in l:
        if page_hash not in ret:
            # First entry seen for this hash: highest instance number.
            logdeb("Found %s" % fn)
            ret[page_hash] = fn
            continue
        logdeb("Deleting %s" % fn)
        try:
            os.unlink(os.path.join(downloadsdir, fn))
        except OSError as err:
            # A stale duplicate which can't be removed must not abort the
            # whole pass: report it and go on.
            logdeb("Could not delete %s: %s" % (fn, err))
    return ret


# File names created by the browser extension: 'm' or 'c' kind, hex hash,
# optional "(nn)" duplicate-download suffix.
fn_re = re.compile(r'recoll-we-([mc])-([0-9a-f]+)(\([0-9]+\))?\.rclwe')


def list_all_files(dir):
    """List the extension-created files found in directory `dir`.

    Returns a (mfiles, cfiles) pair of lists holding the 'm' and 'c' files
    respectively. Each element is a [hash, instancenum, filename] list,
    instancenum being 0 for a name without a "(nn)" suffix.
    """
    mfiles = []
    cfiles = []
    for fn in os.listdir(dir):
        # The whole name must match, so names with something appended
        # after ".rclwe" are ignored.
        mo = fn_re.fullmatch(fn)
        if not mo:
            continue
        kind, page_hash, suffix = mo.groups()
        num = int(suffix.strip("()")) if suffix else 0
        target = mfiles if kind == 'm' else cfiles
        target.append([page_hash, num, fn])
    return mfiles, cfiles


#######################
def usage():
    """Print the command line synopsis to stderr and exit with status 1."""
    print("Usage: recoll-we-move-files.py [<downloaddir>]", file=sys.stderr)
    sys.exit(1)


# Source dir is parameter, else default Downloads directory
downloadsdir = os.path.expanduser("~/Downloads")
if len(sys.argv) == 2:
    mydir = sys.argv[1]
elif len(sys.argv) == 1:
    mydir = downloadsdir
else:
    usage()
if not os.path.isdir(mydir):
    usage()

# Get target webqueue recoll directory from recoll configuration
config = rclconfig.RclConfig()
webqueuedir = config.getConfParam("webqueuedir")
if not webqueuedir:
    webqueuedir = "~/.recollweb/ToIndex"
webqueuedir = os.path.expanduser(webqueuedir)
logdeb("webqueuedir is %s" % webqueuedir)

# Get the lists of all files created by the browser addon
mfiles, cfiles = list_all_files(mydir)

# Only keep the last version. The duplicates live in the directory which
# was actually scanned (mydir), which is not necessarily the default one.
mfiles = delete_previous_instances(mfiles, mydir)
cfiles = delete_previous_instances(cfiles, mydir)

# The dicts hold bare file names: turn them into paths inside mydir, so
# that the moves below do not depend on the current working directory.
mfiles = {h: os.path.join(mydir, fn) for h, fn in mfiles.items()}
cfiles = {h: os.path.join(mydir, fn) for h, fn in cfiles.items()}

#logdeb("Mfiles: %s"% mfiles)
#logdeb("Cfiles: %s"% cfiles)

# Move files to webqueuedir target directory
# The webextensions plugin creates the metadata files first.
So it may +# happen that a data file is missing; unpaired files are kept for the next pass. +# The old plugin created the data first, so we move data then meta +for hash in cfiles.keys(): + if hash in mfiles.keys(): + newname = "firefox-recoll-web-"+hash + shutil.move(cfiles[hash], os.path.join(webqueuedir, newname)) + shutil.move(mfiles[hash], os.path.join(webqueuedir, "." + newname)) + + diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf index 44f28b65..40128f3c 100644 --- a/src/sampleconf/recoll.conf +++ b/src/sampleconf/recoll.conf @@ -430,9 +430,12 @@ webcachemaxmbs = 40 # # -# The path to the Web indexing queue.This is -# hard-coded in the plugin as ~/.recollweb/ToIndex so there should be no -# need or possibility to change it. +# The path to the Web indexing queue. This used to be +# hard-coded in the old plugin as ~/.recollweb/ToIndex so there would be no +# need or possibility to change it, but the WebExtensions plugin now downloads +# the files to the user's Downloads directory, and a script moves them to +# webqueuedir. The script reads this value from the config so it has become +# possible to change it. #webqueuedir = ~/.recollweb/ToIndex #