added adaptor script for new browser plugin
This commit is contained in:
parent
efcb4e0947
commit
cd44aa33e1
136
src/filters/recoll-we-move-files.py
Executable file
136
src/filters/recoll-we-move-files.py
Executable file
@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2017 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
""" Monitor download directory for pages created by the recoll-we extension.
|
||||
|
||||
The files (content file and metadata side file) are then renamed and
|
||||
moved to the Recoll web queue directory.
|
||||
|
||||
We recognize recoll files by their pattern:
|
||||
recoll-we-[cm]-<md5>.rclwe, and the fact that both a c and an m file
|
||||
must exist. While not absolutely foolproof, this should be quite robust.
|
||||
|
||||
The script is normally executed by recollindex at appropriate times,
|
||||
but it can also be run by hand.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
try:
|
||||
from hashlib import md5 as md5
|
||||
except:
|
||||
import md5
|
||||
import shutil
|
||||
|
||||
try:
|
||||
from recoll import rclconfig
|
||||
except:
|
||||
import rclconfig
|
||||
|
||||
def logdeb(s):
    """Write the debug message *s*, followed by a newline, to standard error."""
    message = "%s" % s
    sys.stderr.write(message + "\n")
|
||||
|
||||
# Downloaded instances of the same page are suffixed with (nn) by the
|
||||
# browser. We are passed a list of (hash, instancenum, filename)
|
||||
# triplets, sort it, and keep only the latest file.
|
||||
def delete_previous_instances(l, downloadsdir):
    """Keep the newest instance of each page file and delete the older ones.

    l is a list of (hash, instancenum, filename) triplets. For each hash,
    the file with the highest instance number is kept and every other
    file with the same hash is removed from downloadsdir.

    The input list is not modified. Errors raised by os.unlink() are not
    caught here.

    Returns a dict mapping each hash to the name of the file which was kept.
    """
    # Newest first inside each hash. The instance number is compared as an
    # integer: a zero-padded string key would misorder numbers >= 100000.
    ordered = sorted(l, key=lambda e: (e[0], e[1]), reverse=True)
    ret = {}
    for filehash, _num, fn in ordered:
        if filehash in ret:
            # A newer instance of this page was already seen: drop this one
            logdeb("Deleting %s" % fn)
            os.unlink(os.path.join(downloadsdir, fn))
        else:
            logdeb("Found %s" % fn)
            ret[filehash] = fn
    return ret
|
||||
|
||||
# Names of the files created by the browser extension:
#   recoll-we-<m|c>-<hex hash>[(<instance number>)].rclwe
# Raw string so that the regex escapes are not parsed as (invalid) Python
# string escapes.
fn_re = re.compile(r'''recoll-we-([mc])-([0-9a-f]+)(\([0-9]+\))?\.rclwe''')


def list_all_files(dir):
    """List the recoll-we files found in directory dir.

    Only names which match fn_re entirely are retained, so that names with
    extra trailing characters (e.g. a partial download named
    xxx.rclwe.part) are ignored.

    Returns a pair (mfiles, cfiles) of lists for the 'm' and 'c' files.
    Each element is a [hash, instancenum, filename] list, where
    instancenum is the integer found inside the (nn) suffix of the name,
    or 0 if the name has no such suffix.
    """
    mfiles = []
    cfiles = []
    for fn in os.listdir(dir):
        mo = fn_re.fullmatch(fn)
        if not mo:
            continue
        kind, filehash, suffix = mo.groups()
        # No (nn) suffix: this is the first downloaded instance
        num = int(suffix.strip("()")) if suffix else 0
        entry = [filehash, num, fn]
        if kind == 'm':
            mfiles.append(entry)
        else:
            cfiles.append(entry)
    return mfiles, cfiles
|
||||
|
||||
#######################
|
||||
def usage():
    """Write the command synopsis to standard error and exit with status 1."""
    synopsis = "Usage: recoll-we-move-files.py [<downloaddir>]"
    sys.stderr.write(synopsis + "\n")
    raise SystemExit(1)
|
||||
|
||||
# Source directory is the optional command line parameter, else the
# default Downloads directory.
downloadsdir = os.path.expanduser("~/Downloads")
if len(sys.argv) == 2:
    mydir = sys.argv[1]
elif len(sys.argv) == 1:
    mydir = downloadsdir
else:
    usage()
if not os.path.isdir(mydir):
    usage()

# Get target webqueue recoll directory from recoll configuration
config = rclconfig.RclConfig()
webqueuedir = config.getConfParam("webqueuedir")
if not webqueuedir:
    webqueuedir = "~/.recollweb/ToIndex"
webqueuedir = os.path.expanduser(webqueuedir)
logdeb("webqueuedir is %s" % webqueuedir)

# Get the lists of all files created by the browser addon
mfiles, cfiles = list_all_files(mydir)

# Only keep the last version. The old instances must be deleted from the
# directory which was actually scanned (mydir), which is not the default
# Downloads directory when a directory was given on the command line.
mfiles = delete_previous_instances(mfiles, mydir)
cfiles = delete_previous_instances(cfiles, mydir)

#logdeb("Mfiles: %s"% mfiles)
#logdeb("Cfiles: %s"% cfiles)

# Move files to webqueuedir target directory
# The webextensions plugin creates the metadata files first. So it may
# happen that a data file is missing, keep them for next pass.
# The old plugin created the data first, so we move data then meta
for pagehash in cfiles:
    if pagehash in mfiles:
        newname = "firefox-recoll-web-" + pagehash
        # The lists hold bare file names: prepend the source directory so
        # that the move does not depend on the current working directory.
        shutil.move(os.path.join(mydir, cfiles[pagehash]),
                    os.path.join(webqueuedir, newname))
        shutil.move(os.path.join(mydir, mfiles[pagehash]),
                    os.path.join(webqueuedir, "." + newname))
|
||||
|
||||
|
||||
@ -430,9 +430,12 @@ webcachemaxmbs = 40
|
||||
|
||||
# <var name="webqueuedir" type="fn">
|
||||
#
|
||||
# <brief>The path to the Web indexing queue.</brief><descr>This is
|
||||
# hard-coded in the plugin as ~/.recollweb/ToIndex so there should be no
|
||||
# need or possibility to change it.</descr></var>
|
||||
# <brief>The path to the Web indexing queue.</brief><descr>This used to be
|
||||
# hard-coded in the old plugin as ~/.recollweb/ToIndex so there would be no
|
||||
# need or possibility to change it, but the WebExtensions plugin now downloads
|
||||
# the files to the user Downloads directory, and a script moves them to
|
||||
# webqueuedir. The script reads this value from the config so it has become
|
||||
# possible to change it.</descr></var>
|
||||
#webqueuedir = ~/.recollweb/ToIndex
|
||||
|
||||
# <var name="aspellDicDir" type="dfn">
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user