Merge branch 'master' of https://opensourceprojects.eu/git/p/recoll1/code
This commit is contained in:
commit
beebb1028b
2
.gitignore
vendored
2
.gitignore
vendored
@ -36,6 +36,8 @@ src/depcomp
|
||||
src/doc/user/usermanual.pdf
|
||||
src/doc/user/webhelp/docs/*
|
||||
src/doc/user/webhelp/xincluded-profiled.xml
|
||||
src/filters/conftree.py
|
||||
src/filters/rclconfig.py
|
||||
src/filters/rclexecm.pyc
|
||||
src/filters/rcllatinclass.pyc
|
||||
src/install-sh
|
||||
|
||||
@ -6,9 +6,9 @@
|
||||
|
||||
PPA_KEYID=D38B9201
|
||||
|
||||
RCLVERS=1.23.8
|
||||
RCLVERS=1.24.1
|
||||
SCOPEVERS=1.20.2.4
|
||||
PPAVERS=1
|
||||
PPAVERS=2
|
||||
|
||||
#
|
||||
RCLSRC=/y/home/dockes/projets/fulltext/recoll/src
|
||||
|
||||
@ -1,3 +1,9 @@
|
||||
recoll (1.25.0pre0-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||
|
||||
* Not a release: 1.25 development and testing
|
||||
|
||||
-- Jean-Francois Dockes <jf@dockes.org> Wed, 13 Jun 2018 08:38:00 +0200
|
||||
|
||||
recoll (1.24.1-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||
|
||||
* New release 1.24.1.
|
||||
|
||||
@ -19,10 +19,10 @@ X-Python-Version: >= 2.7
|
||||
Vcs-Git: https://anonscm.debian.org/cgit/collab-maint/recoll.git
|
||||
Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/recoll.git
|
||||
Homepage: http://www.lesbonscomptes.com/recoll
|
||||
Standards-Version: 3.9.8
|
||||
Standards-Version: 4.1.4
|
||||
|
||||
Package: recoll
|
||||
Architecture: any
|
||||
Architecture: all
|
||||
Depends: recollgui, recollcmd, ${misc:Depends}
|
||||
Description: Personal full text search package with a Qt GUI
|
||||
This package is a personal full text search package based on a very strong
|
||||
@ -49,30 +49,40 @@ Description: Personal full text search package with a Qt GUI
|
||||
|
||||
Package: recollcmd
|
||||
Architecture: any
|
||||
Breaks: recoll (<< 1.23.4)
|
||||
Replaces: recoll (<< 1.23.4)
|
||||
Depends: python, ${misc:Depends}, ${shlibs:Depends}
|
||||
Recommends: python-recoll, aspell, xdg-utils, xsltproc,
|
||||
python-libxml2, python-libxslt1
|
||||
Breaks: recoll (<< 1.23.9)
|
||||
Replaces: recoll (<< 1.23.9)
|
||||
Depends: python, python3, ${misc:Depends}, ${shlibs:Depends}
|
||||
Recommends: aspell,
|
||||
python-future,
|
||||
python-libxml2,
|
||||
python-libxslt1,
|
||||
python-recoll,
|
||||
python3-recoll,
|
||||
xdg-utils,
|
||||
xsltproc
|
||||
Suggests: antiword,
|
||||
ghostscript,
|
||||
groff,
|
||||
libimage-exiftool-perl,
|
||||
libinotifytools0,
|
||||
poppler-utils,
|
||||
pstotext,
|
||||
python-chm,
|
||||
python-lzma,
|
||||
python-mido,
|
||||
python-mutagen,
|
||||
python-rarfile,
|
||||
unrtf,
|
||||
untex
|
||||
untex,
|
||||
wv
|
||||
Description: Command line programs for recoll
|
||||
This package supports indexing and command line querying.
|
||||
|
||||
Package: recollgui
|
||||
Architecture: any
|
||||
Breaks: recoll (<< 1.23.4)
|
||||
Replaces: recoll (<< 1.23.4)
|
||||
Depends: recollcmd (= ${binary:Version}),
|
||||
${misc:Depends},
|
||||
${shlibs:Depends}
|
||||
Breaks: recoll (<< 1.23.9)
|
||||
Replaces: recoll (<< 1.23.9)
|
||||
Depends: recollcmd (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
|
||||
Description: GUI program and elements for recoll
|
||||
Main recoll GUI for configuring, controlling and querying recoll indexes.
|
||||
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
usr/lib/python2*/*-packages/*.egg-info
|
||||
usr/lib/python2*/*-packages/Recoll*.egg-info
|
||||
usr/lib/python2*/*-packages/recoll/*
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
usr/lib/python3*/*-packages/*.egg-info
|
||||
usr/lib/python3*/*-packages/Recoll*.egg-info
|
||||
usr/lib/python3*/*-packages/recoll/*
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
usr/bin/recollindex
|
||||
usr/bin/recollq
|
||||
usr/bin/xadump
|
||||
usr/lib/recoll
|
||||
usr/lib/*/recoll
|
||||
usr/lib/python*/*-packages/recollchm/*
|
||||
usr/lib/python*/*-packages/recollchm-*/*
|
||||
usr/share/man
|
||||
usr/share/recoll/doc
|
||||
usr/share/recoll/examples
|
||||
|
||||
@ -1,96 +1,54 @@
|
||||
#!/usr/bin/make -f
|
||||
# See debhelper(7) (uncomment to enable)
|
||||
# output every command that modifies files on the build system.
|
||||
#DH_VERBOSE = 1
|
||||
|
||||
# Uncomment this to turn on verbose mode.
|
||||
#export DH_VERBOSE=1
|
||||
# see EXAMPLES in dpkg-buildflags(1) and read /usr/share/dpkg/*
|
||||
DPKG_EXPORT_BUILDFLAGS = 1
|
||||
include /usr/share/dpkg/default.mk
|
||||
|
||||
export DEB_BUILD_MAINT_OPTIONS = hardening=+all
|
||||
# see FEATURE AREAS in dpkg-buildflags(1)
|
||||
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
|
||||
|
||||
DEB_HOST_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)
|
||||
DEB_BUILD_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE)
|
||||
# see ENVIRONMENT in dpkg-buildflags(1)
|
||||
# package maintainers to append CFLAGS
|
||||
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
|
||||
# package maintainers to append LDFLAGS
|
||||
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
|
||||
|
||||
CPPFLAGS:=$(shell dpkg-buildflags --get CPPFLAGS)
|
||||
CFLAGS:=$(shell dpkg-buildflags --get CFLAGS) $(CPPFLAGS)
|
||||
CXXFLAGS:=$(shell dpkg-buildflags --get CXXFLAGS) $(CPPFLAGS)
|
||||
LDFLAGS:=$(shell dpkg-buildflags --get LDFLAGS)
|
||||
# main packaging script based on dh7 syntax
|
||||
%:
|
||||
dh $@ --parallel --with python2 --with python3 --with autotools-dev
|
||||
|
||||
override_dh_auto_configure:
|
||||
dh_auto_configure -- --enable-recollq --enable-xadump
|
||||
|
||||
build3vers := $(shell py3versions -sv)
|
||||
|
||||
#build qt5 UI
|
||||
export QT_SELECT := qt5
|
||||
|
||||
ifneq (,$(findstring noopt,$(DEB_BUILD_OPTIONS)))
|
||||
CFLAGS += -O0
|
||||
else
|
||||
CFLAGS += -O2
|
||||
endif
|
||||
|
||||
config.status: configure
|
||||
dh_testdir
|
||||
./configure CFLAGS="$(CFLAGS)" LDFLAGS="$(LDFLAGS)" \
|
||||
--host=$(DEB_HOST_GNU_TYPE) \
|
||||
--build=$(DEB_BUILD_GNU_TYPE) \
|
||||
--mandir=\$${prefix}/share/man \
|
||||
--prefix=/usr \
|
||||
--enable-recollq \
|
||||
--enable-xadump
|
||||
|
||||
build: build-arch build-indep
|
||||
build-arch: build-stamp
|
||||
build-indep: build-stamp
|
||||
build-stamp: config.status
|
||||
dh_testdir
|
||||
$(MAKE)
|
||||
touch $@
|
||||
|
||||
clean:
|
||||
dh_testdir
|
||||
dh_testroot
|
||||
rm -f build-stamp config.log
|
||||
[ ! -f Makefile ] || $(MAKE) distclean
|
||||
dh_clean Makefile
|
||||
|
||||
install:
|
||||
dh_testdir
|
||||
dh_testroot
|
||||
dh_prep
|
||||
dh_installdirs
|
||||
$(MAKE) STRIP=ls prefix=$(CURDIR)/debian/tmp/usr install
|
||||
# Executable fixes
|
||||
chmod +x $(CURDIR)/debian/tmp/usr/share/recoll/examples/rclmon.sh
|
||||
chmod -x $(CURDIR)/debian/tmp/usr/share/recoll/filters/rclexec1.py
|
||||
chmod -x $(CURDIR)/debian/tmp/usr/share/recoll/filters/rclxslt.py
|
||||
find $(CURDIR) -type f -name '*.la' -exec rm -f '{}' \;
|
||||
(cd python/recoll; python setup.py install \
|
||||
override_dh_auto_install:
|
||||
dh_auto_install
|
||||
(cd python/recoll; libdir=/usr/lib/$${DEB_BUILD_MULTIARCH} python \
|
||||
./setup.py install \
|
||||
--install-layout=deb \
|
||||
--prefix=$(CURDIR)/debian/tmp/usr )
|
||||
--prefix=/usr \
|
||||
--root=$(CURDIR)/debian/tmp/usr )
|
||||
set -e && for i in $(build3vers); do \
|
||||
(cd python/recoll; python$$i ./setup.py install \
|
||||
--install-layout=deb \
|
||||
--prefix=$(CURDIR)/debian/tmp/usr ) ; \
|
||||
(cd python/recoll; libdir=/usr/lib/$${DEB_BUILD_MULTIARCH} python$$i \
|
||||
./setup.py install \
|
||||
--install-layout=deb \
|
||||
--prefix=/usr \
|
||||
--root=$(CURDIR)/debian/tmp/ ) ; \
|
||||
done
|
||||
|
||||
binary-arch: build install
|
||||
dh_testdir
|
||||
dh_testroot
|
||||
dh_installchangelogs ChangeLog
|
||||
dh_installdocs README
|
||||
dh_installman
|
||||
dh_install --sourcedir=debian/tmp
|
||||
dh_makeshlibs
|
||||
dh_python2 -p python-recoll
|
||||
dh_python3 -p python3-recoll
|
||||
dh_link
|
||||
dh_strip
|
||||
dh_compress
|
||||
dh_fixperms
|
||||
dh_lintian
|
||||
dh_installdeb
|
||||
dh_shlibdeps
|
||||
dh_gencontrol
|
||||
dh_md5sums
|
||||
dh_builddeb
|
||||
|
||||
binary-indep: build install
|
||||
|
||||
binary: binary-indep binary-arch
|
||||
.PHONY: build build-arch build-indep clean binary-indep binary-arch binary install
|
||||
(cd python/pychm; python ./setup.py install \
|
||||
--install-layout=deb \
|
||||
--prefix=/usr \
|
||||
--root=$(CURDIR)/debian/tmp/ )
|
||||
set -e && for i in $(build3vers); do \
|
||||
(cd python/pychm; python$$i ./setup.py install \
|
||||
--install-layout=deb \
|
||||
--prefix=/usr \
|
||||
--root=$(CURDIR)/debian/tmp/ ) ; \
|
||||
done
|
||||
find $(CURDIR) -type f -name '*.la' -exec rm -f '{}' \;
|
||||
find $(CURDIR) -type f -name '*.pyc' -exec rm -f '{}' \;
|
||||
rm -rf $(CURDIR)/debian/tmp/usr/lib/python*/*/*/__pycache__
|
||||
|
||||
113
src/Makefile.am
113
src/Makefile.am
@ -58,8 +58,8 @@ bincimapmime/mime-printbody.cc \
|
||||
bincimapmime/mime-utils.h \
|
||||
bincimapmime/mime.cc \
|
||||
bincimapmime/mime.h \
|
||||
common/beaglequeuecache.cpp \
|
||||
common/beaglequeuecache.h \
|
||||
common/webstore.cpp \
|
||||
common/webstore.h \
|
||||
common/conf_post.h \
|
||||
common/cstr.cpp \
|
||||
common/cstr.h \
|
||||
@ -76,10 +76,10 @@ common/unacpp.h \
|
||||
common/uproplist.h \
|
||||
common/utf8fn.cpp \
|
||||
common/utf8fn.h \
|
||||
index/beaglequeue.cpp \
|
||||
index/beaglequeue.h \
|
||||
index/bglfetcher.cpp \
|
||||
index/bglfetcher.h \
|
||||
index/webqueue.cpp \
|
||||
index/webqueue.h \
|
||||
index/webqueuefetcher.cpp \
|
||||
index/webqueuefetcher.h \
|
||||
index/checkretryfailed.cpp \
|
||||
index/checkretryfailed.h \
|
||||
index/exefetcher.cpp \
|
||||
@ -236,7 +236,6 @@ utils/rclutil.h \
|
||||
utils/rclutil.cpp \
|
||||
utils/readfile.cpp \
|
||||
utils/readfile.h \
|
||||
utils/refcntr.h \
|
||||
utils/smallut.cpp \
|
||||
utils/smallut.h \
|
||||
utils/strmatcher.cpp \
|
||||
@ -507,11 +506,30 @@ qtgui/xmltosd.cpp \
|
||||
qtgui/xmltosd.h \
|
||||
\
|
||||
python/README.txt \
|
||||
python/recoll/Makefile.in \
|
||||
python/pychm/AUTHORS \
|
||||
python/pychm/COPYING \
|
||||
python/pychm/MANIFEST.in \
|
||||
python/pychm/README-RECOLL.txt \
|
||||
python/pychm/pychm.egg-info \
|
||||
python/pychm/pychm.egg-info/PKG-INFO \
|
||||
python/pychm/pychm.egg-info/SOURCES.txt \
|
||||
python/pychm/pychm.egg-info/dependency_links.txt \
|
||||
python/pychm/pychm.egg-info/top_level.txt \
|
||||
python/pychm/recollchm \
|
||||
python/pychm/recollchm/__init__.py \
|
||||
python/pychm/recollchm/__pycache__ \
|
||||
python/pychm/recollchm/chm.py \
|
||||
python/pychm/recollchm/chmlib.py \
|
||||
python/pychm/recollchm/extra.c \
|
||||
python/pychm/recollchm/swig_chm.c \
|
||||
python/pychm/recollchm/swig_chm.i \
|
||||
python/pychm/setup.py.in \
|
||||
python/recoll/Makefile \
|
||||
python/recoll/pyrclextract.cpp \
|
||||
python/recoll/pyrecoll.cpp \
|
||||
python/recoll/pyrecoll.h \
|
||||
python/recoll/recoll/__init__.py \
|
||||
python/recoll/recoll/conftree.py \
|
||||
python/recoll/recoll/rclconfig.py \
|
||||
python/recoll/setup.py.in \
|
||||
python/samples/docdups.py \
|
||||
@ -538,34 +556,67 @@ VERSION
|
||||
# php/00README.txt php/recoll/config.m4 php/recoll/make.sh
|
||||
# php/recoll/php_recoll.h php/recoll/recoll.cpp php/sample/shell.php
|
||||
|
||||
OPTSFORPYTHON = $(shell test -f /etc/debian_version && echo --install-layout=deb)
|
||||
|
||||
if MAKEPYTHON
|
||||
all-local: recollpython
|
||||
all-local:: recollpython
|
||||
install-exec-local:: recollpython-install
|
||||
clean-local:: recollpython-clean
|
||||
recollpython: librecoll.la
|
||||
${MAKE} -C python/recoll libdir=$(libdir)
|
||||
install-exec-local: recollpython-install
|
||||
(cd python/recoll; set -x; \
|
||||
for v in 2 3;do test -n "`which python$${v}`" && \
|
||||
libdir=$(libdir) python$${v} setup.py build; \
|
||||
done \
|
||||
)
|
||||
recollpython-install:
|
||||
(cd python/recoll; \
|
||||
if test -f /etc/debian_version ; then \
|
||||
OPTSFORPYTHON=--install-layout=deb; \
|
||||
fi; \
|
||||
set -x; \
|
||||
python setup.py install \
|
||||
--prefix=${prefix} --root=$${DESTDIR:-/} $${OPTSFORPYTHON})
|
||||
clean-local: recollpython-clean
|
||||
(cd python/recoll; set -x; \
|
||||
for v in 2 3;do test -n "`which python$${v}`" && \
|
||||
python$${v} setup.py install \
|
||||
--prefix=${prefix} --root=$${DESTDIR:-/} $(OPTSFORPYTHON); \
|
||||
done; \
|
||||
)
|
||||
recollpython-clean:
|
||||
rm -f python/recoll/*.pyc
|
||||
rm -rf python/pychm/build
|
||||
rm -rf python/pychm/recollchm.egg-info
|
||||
rm -rf python/pychm/setup.py
|
||||
rm -rf python/recoll/Recoll.egg-info
|
||||
rm -rf python/recoll/__pycache__
|
||||
rm -rf python/recoll/build
|
||||
endif
|
||||
|
||||
if MAKEPYTHONCHM
|
||||
all-local:: rclpychm
|
||||
install-exec-local:: rclpychm-install
|
||||
clean-local:: rclpychm-clean
|
||||
rclpychm:
|
||||
(cd python/pychm; set -x; \
|
||||
for v in 2 3;do \
|
||||
test -n "`which python$${v}`" && python$${v} setup.py build;\
|
||||
done \
|
||||
)
|
||||
rclpychm-install:
|
||||
(cd python/pychm; set -x; \
|
||||
for v in 2 3;do test -n "`which python$${v}`" && \
|
||||
python$${v} setup.py install \
|
||||
--prefix=${prefix} --root=$${DESTDIR:-/} $(OPTSFORPYTHON); \
|
||||
done \
|
||||
)
|
||||
rclpychm-clean:
|
||||
rm -rf python/pychm/build
|
||||
rm -rf python/pychm/dist/*
|
||||
endif
|
||||
|
||||
if MAKEQT
|
||||
all-local: recollqt
|
||||
all-local:: recollqt
|
||||
recollqt: librecoll.la
|
||||
(cd $(QTGUI); ${QMAKE} PREFIX=${prefix} recoll.pro)
|
||||
$(MAKE) -C $(QTGUI) LFLAGS="$(LDFLAGS)" prefix=$(prefix) \
|
||||
exec_prefix=$(exec_prefix) libdir=$(libdir)
|
||||
clean-local: recollqt-clean
|
||||
clean-local:: recollqt-clean
|
||||
recollqt-clean:
|
||||
-$(MAKE) -C $(QTGUI) clean
|
||||
install-exec-local: recollqt-install
|
||||
install-exec-local:: recollqt-install
|
||||
recollqt-install:
|
||||
$(MAKE) -C $(QTGUI) LFLAGS="$(LDFLAGS)" INSTALL_ROOT=$(DESTDIR) \
|
||||
prefix=$(prefix) exec_prefix=$(exec_prefix) libdir=$(libdir) \
|
||||
@ -588,9 +639,10 @@ filterdir = $(pkgdatadir)/filters
|
||||
filter_DATA = \
|
||||
desktop/hotrecoll.py \
|
||||
filters/rcl7z \
|
||||
filters/rclabw \
|
||||
filters/rclabw.py \
|
||||
filters/rclaptosidman \
|
||||
filters/rclaudio \
|
||||
filters/rclbasehandler.py \
|
||||
filters/rclbibtex.sh \
|
||||
filters/rclcheckneedretry.sh \
|
||||
filters/rclchm \
|
||||
@ -602,9 +654,10 @@ filters/rclepub \
|
||||
filters/rclepub1 \
|
||||
filters/rclexec1.py \
|
||||
filters/rclexecm.py \
|
||||
filters/rclfb2 \
|
||||
filters/rclfb2.py \
|
||||
filters/rclgaim \
|
||||
filters/rclgnm \
|
||||
filters/rclgenxslt.py \
|
||||
filters/rclgnm.py \
|
||||
filters/rclics \
|
||||
filters/rclimg \
|
||||
filters/rclimg.py \
|
||||
@ -617,17 +670,15 @@ filters/rcllyx \
|
||||
filters/rclman \
|
||||
filters/rclmidi.py \
|
||||
filters/rclpdf.py \
|
||||
filters/rclokulnote \
|
||||
filters/rclokulnote.py \
|
||||
filters/rclopxml.py \
|
||||
filters/rclppt.py \
|
||||
filters/rclps \
|
||||
filters/rclpurple \
|
||||
filters/rclpython \
|
||||
filters/rclrar \
|
||||
filters/rclrtf.py \
|
||||
filters/rclscribus \
|
||||
filters/rclshowinfo \
|
||||
filters/rclsiduxman \
|
||||
filters/rclsoff.py \
|
||||
filters/rclsoff-flat.py \
|
||||
filters/rclsvg.py \
|
||||
@ -637,7 +688,6 @@ filters/rcltext.py \
|
||||
filters/rcluncomp \
|
||||
filters/rcluncomp.py \
|
||||
filters/rclwar \
|
||||
filters/rclwpd \
|
||||
filters/rclxls.py \
|
||||
filters/rclxml.py \
|
||||
filters/rclxmp.py \
|
||||
@ -648,13 +698,16 @@ filters/ppt-dump.py \
|
||||
filters/xls-dump.py \
|
||||
filters/xlsxmltocsv.py \
|
||||
filters/msodump.zip \
|
||||
filters/recollepub.zip \
|
||||
python/recoll/recoll/conftree.py \
|
||||
python/recoll/recoll/rclconfig.py
|
||||
|
||||
install-data-hook:
|
||||
(cd $(DESTDIR)/$(filterdir); \
|
||||
chmod a+x rcl* ppt-dump.py xls-dump.py xlsxmltocsv.py hotrecoll.py; \
|
||||
chmod a+x recoll-we-move-files.py; \
|
||||
chmod 0644 msodump.zip rclexecm.py rcllatinstops.zip rclconfig.py rclmidi.py)
|
||||
chmod 0644 msodump.zip recollepub.zip rclexecm.py rcllatinstops.zip \
|
||||
rclconfig.py conftree.py rclmidi.py)
|
||||
|
||||
if MAKEUSERDOC
|
||||
rdocdir = $(pkgdatadir)/doc
|
||||
|
||||
@ -1 +1 @@
|
||||
1.24.1
|
||||
1.25.0pre0
|
||||
|
||||
@ -38,6 +38,7 @@
|
||||
#include <sstream>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "cstr.h"
|
||||
#include "pathut.h"
|
||||
@ -70,6 +71,12 @@ bool o_uptodate_test_use_mtime = false;
|
||||
string RclConfig::o_localecharset;
|
||||
string RclConfig::o_origcwd;
|
||||
|
||||
// We build this once. Used to ensure that the suffix used for a temp
|
||||
// file of a given MIME type is the FIRST one from the mimemap config
|
||||
// file. Previously it was the first in alphabetic (map) order, with
|
||||
// sometimes strange results.
|
||||
static unordered_map<string, string> mime_suffixes;
|
||||
|
||||
// Compute the difference of 1st to 2nd sets and return as plus/minus
|
||||
// sets. Some args are std::set and some others stringToString()
|
||||
// strings for convenience
|
||||
@ -316,6 +323,27 @@ RclConfig::RclConfig(const string *argcnf)
|
||||
m_reason = string("No or bad mimemap file in: ") + cnferrloc;
|
||||
return;
|
||||
}
|
||||
|
||||
// Maybe create the MIME to suffix association reverse map. Do it
|
||||
// in file order so that we can control what suffix is used when
|
||||
// there are several. This only uses the distributed file, not any
|
||||
// local customization (too complicated).
|
||||
if (mime_suffixes.empty()) {
|
||||
ConfSimple mm(
|
||||
path_cat(path_cat(m_datadir, "examples"), "mimemap").c_str());
|
||||
vector<ConfLine> order = mm.getlines();
|
||||
for (const auto& entry: order) {
|
||||
if (entry.m_kind == ConfLine::CFL_VAR) {
|
||||
LOGDEB1("CONFIG: " << entry.m_data << " -> " << entry.m_value <<
|
||||
endl);
|
||||
// Remember: insert() only does anything for new keys,
|
||||
// so we only have the first value in the map
|
||||
mime_suffixes.insert(
|
||||
pair<string,string>(entry.m_value, entry.m_data));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mimeconf = new ConfStack<ConfSimple>("mimeconf", m_cdirs, true);
|
||||
if (mimeconf == 0 || !mimeconf->ok()) {
|
||||
m_reason = string("No/bad mimeconf in: ") + cnferrloc;
|
||||
@ -753,14 +781,20 @@ string RclConfig::getMimeTypeFromSuffix(const string& suff) const
|
||||
|
||||
string RclConfig::getSuffixFromMimeType(const string &mt) const
|
||||
{
|
||||
string suffix;
|
||||
vector<string>sfs = mimemap->getNames(cstr_null);
|
||||
string mt1;
|
||||
for (vector<string>::const_iterator it = sfs.begin();
|
||||
it != sfs.end(); it++) {
|
||||
if (mimemap->get(*it, mt1, cstr_null))
|
||||
if (!stringicmp(mt, mt1))
|
||||
return *it;
|
||||
// First try from standard data, ensuring that we can control the value
|
||||
// from the order in the configuration file.
|
||||
auto rclsuff = mime_suffixes.find(mt);
|
||||
if (rclsuff != mime_suffixes.end()) {
|
||||
return rclsuff->second;
|
||||
}
|
||||
// Try again from local data. The map is in the wrong direction,
|
||||
// have to walk it.
|
||||
vector<string> sfs = mimemap->getNames(cstr_null);
|
||||
for (const auto& suff : sfs) {
|
||||
string mt1;
|
||||
if (mimemap->get(suff, mt1, cstr_null) && !stringicmp(mt, mt1)) {
|
||||
return suff;
|
||||
}
|
||||
}
|
||||
return cstr_null;
|
||||
}
|
||||
|
||||
@ -68,11 +68,11 @@ SynGroups::SynGroups()
|
||||
|
||||
bool SynGroups::setfile(const string& fn)
|
||||
{
|
||||
LOGDEB("SynGroups::setfile(" << (fn) << ")\n" );
|
||||
LOGDEB("SynGroups::setfile(" << fn << ")\n");
|
||||
if (!m) {
|
||||
m = new Internal;
|
||||
if (!m) {
|
||||
LOGERR("SynGroups:setfile:: new Internal failed: no mem ?\n" );
|
||||
LOGERR("SynGroups:setfile:: new Internal failed: no mem ?\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -86,7 +86,7 @@ bool SynGroups::setfile(const string& fn)
|
||||
ifstream input;
|
||||
input.open(fn.c_str(), ios::in);
|
||||
if (!input.is_open()) {
|
||||
LOGERR("SynGroups:setfile:: could not open " << (fn) << " errno " << (errno) << "\n" );
|
||||
LOGSYSERR("SynGroups:setfile", "open", fn);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -101,7 +101,7 @@ bool SynGroups::setfile(const string& fn)
|
||||
getline(input, cline);
|
||||
if (!input.good()) {
|
||||
if (input.bad()) {
|
||||
LOGERR("Syngroup::setfile(" << (fn) << "):Parse: input.bad()\n" );
|
||||
LOGERR("Syngroup::setfile(" << fn << "):Parse: input.bad()\n");
|
||||
return false;
|
||||
}
|
||||
// Must be eof ? But maybe we have a partial line which
|
||||
@ -142,23 +142,25 @@ bool SynGroups::setfile(const string& fn)
|
||||
|
||||
vector<string> words;
|
||||
if (!stringToStrings(line, words)) {
|
||||
LOGERR("SynGroups:setfile: " << (fn) << ": bad line " << (lnum) << ": " << (line) << "\n" );
|
||||
LOGERR("SynGroups:setfile: " << fn << ": bad line " << lnum <<
|
||||
": " << line << "\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (words.empty())
|
||||
continue;
|
||||
if (words.size() == 1) {
|
||||
LOGERR("Syngroup::setfile(" << (fn) << "):single term group at line " << (lnum) << " ??\n" );
|
||||
LOGERR("Syngroup::setfile(" << fn << "):single term group at line "
|
||||
<< lnum << " ??\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
m->groups.push_back(words);
|
||||
for (vector<string>::const_iterator it = words.begin();
|
||||
it != words.end(); it++) {
|
||||
m->terms[*it] = m->groups.size()-1;
|
||||
for (const auto& word : words) {
|
||||
m->terms[word] = m->groups.size()-1;
|
||||
}
|
||||
LOGDEB1("SynGroups::setfile: group: [" << (stringsToString(m->groups.back())) << "]\n" );
|
||||
LOGDEB1("SynGroups::setfile: group: [" <<
|
||||
stringsToString(m->groups.back()) << "]\n");
|
||||
}
|
||||
m->ok = true;
|
||||
return true;
|
||||
@ -170,16 +172,15 @@ vector<string> SynGroups::getgroup(const string& term)
|
||||
if (!ok())
|
||||
return ret;
|
||||
|
||||
std::unordered_map<string, unsigned int>::const_iterator it1 =
|
||||
m->terms.find(term);
|
||||
const auto it1 = m->terms.find(term);
|
||||
if (it1 == m->terms.end()) {
|
||||
LOGDEB1("SynGroups::getgroup: [" << (term) << "] not found in direct map\n" );
|
||||
LOGDEB1("SynGroups::getgroup: [" << term<<"] not found in direct map\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
unsigned int idx = it1->second;
|
||||
if (idx >= m->groups.size()) {
|
||||
LOGERR("SynGroups::getgroup: line index higher than line count !\n" );
|
||||
LOGERR("SynGroups::getgroup: line index higher than line count !\n");
|
||||
return ret;
|
||||
}
|
||||
return m->groups[idx];
|
||||
|
||||
@ -28,14 +28,17 @@ class SynGroups {
|
||||
public:
|
||||
SynGroups();
|
||||
~SynGroups();
|
||||
SynGroups(const SynGroups&) = delete;
|
||||
SynGroups& operator=(const SynGroups&) = delete;
|
||||
SynGroups(const SynGroups&&) = delete;
|
||||
SynGroups& operator=(const SynGroups&&) = delete;
|
||||
|
||||
bool setfile(const std::string& fname);
|
||||
std::vector<std::string> getgroup(const std::string& term);
|
||||
bool ok();
|
||||
private:
|
||||
class Internal;
|
||||
Internal *m;
|
||||
SynGroups(const SynGroups&);
|
||||
SynGroups& operator=(const SynGroups&);
|
||||
};
|
||||
|
||||
#endif /* _SYNGROUPS_H_INCLUDED_ */
|
||||
|
||||
@ -17,10 +17,11 @@
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include "webstore.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "cstr.h"
|
||||
#include "beaglequeuecache.h"
|
||||
#include "circache.h"
|
||||
#include "log.h"
|
||||
#include "rclconfig.h"
|
||||
@ -29,42 +30,43 @@
|
||||
|
||||
const string cstr_bgc_mimetype("mimetype");
|
||||
|
||||
BeagleQueueCache::BeagleQueueCache(RclConfig *cnf)
|
||||
WebStore::WebStore(RclConfig *cnf)
|
||||
{
|
||||
string ccdir = cnf->getWebcacheDir();
|
||||
|
||||
int maxmbs = 40;
|
||||
cnf->getConfParam("webcachemaxmbs", &maxmbs);
|
||||
if ((m_cache = new CirCache(ccdir)) == 0) {
|
||||
LOGERR("BeagleQueueCache: cant create CirCache object\n" );
|
||||
LOGERR("WebStore: cant create CirCache object\n" );
|
||||
return;
|
||||
}
|
||||
if (!m_cache->create(int64_t(maxmbs)*1000*1024, CirCache::CC_CRUNIQUE)) {
|
||||
LOGERR("BeagleQueueCache: cache file creation failed: " << (m_cache->getReason()) << "\n" );
|
||||
LOGERR("WebStore: cache file creation failed: " <<
|
||||
m_cache->getReason() << "\n");
|
||||
delete m_cache;
|
||||
m_cache = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
BeagleQueueCache::~BeagleQueueCache()
|
||||
WebStore::~WebStore()
|
||||
{
|
||||
delete m_cache;
|
||||
}
|
||||
|
||||
// Read document from cache. Return the metadata as an Rcl::Doc
|
||||
// @param htt Beagle Hit Type
|
||||
bool BeagleQueueCache::getFromCache(const string& udi, Rcl::Doc &dotdoc,
|
||||
// @param htt Web Hit Type
|
||||
bool WebStore::getFromCache(const string& udi, Rcl::Doc &dotdoc,
|
||||
string& data, string *htt)
|
||||
{
|
||||
string dict;
|
||||
|
||||
if (m_cache == 0) {
|
||||
LOGERR("BeagleQueueCache::getFromCache: cache is null\n" );
|
||||
LOGERR("WebStore::getFromCache: cache is null\n");
|
||||
return false;
|
||||
}
|
||||
if (!m_cache->get(udi, dict, &data)) {
|
||||
LOGDEB("BeagleQueueCache::getFromCache: get failed\n" );
|
||||
LOGDEB("WebStore::getFromCache: get failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -14,11 +14,10 @@
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _beaglequeuecache_h_included_
|
||||
#define _beaglequeuecache_h_included_
|
||||
#ifndef _webstore_h_included_
|
||||
#define _webstore_h_included_
|
||||
|
||||
#include <string>
|
||||
using std::string;
|
||||
|
||||
class RclConfig;
|
||||
namespace Rcl {
|
||||
@ -28,23 +27,24 @@ namespace Rcl {
|
||||
class CirCache;
|
||||
|
||||
/**
|
||||
* Manage the CirCache for the Beagle Queue indexer. Separated from the main
|
||||
* Manage the CirCache for the Web Queue indexer. Separated from the main
|
||||
* indexer code because it's also used for querying (getting the data for a
|
||||
* preview).
|
||||
*/
|
||||
class BeagleQueueCache {
|
||||
class WebStore {
|
||||
public:
|
||||
BeagleQueueCache(RclConfig *config);
|
||||
~BeagleQueueCache();
|
||||
WebStore(RclConfig *config);
|
||||
~WebStore();
|
||||
|
||||
bool getFromCache(const string& udi, Rcl::Doc &doc, string& data,
|
||||
string *hittype = 0);
|
||||
bool getFromCache(const std::string& udi, Rcl::Doc &doc, std::string& data,
|
||||
std::string *hittype = 0);
|
||||
// We could write proxies for all the circache ops, but why bother?
|
||||
CirCache *cc() {return m_cache;}
|
||||
|
||||
private:
|
||||
CirCache *m_cache;
|
||||
};
|
||||
extern const string cstr_bgc_mimetype;
|
||||
|
||||
#endif /* _beaglequeuecache_h_included_ */
|
||||
extern const std::string cstr_bgc_mimetype;
|
||||
|
||||
#endif /* _webstore_h_included_ */
|
||||
@ -252,6 +252,13 @@ fi
|
||||
|
||||
AM_CONDITIONAL(MAKEPYTHON, [test X$pythonEnabled = Xyes])
|
||||
|
||||
# Disable building the libchm python wrapper
|
||||
AC_ARG_ENABLE(python-chm, AC_HELP_STRING([--disable-python-chm],
|
||||
[Do not build the libchm Python wrapper.]),
|
||||
pythonChmEnabled=$enableval, pythonChmEnabled=yes)
|
||||
|
||||
AM_CONDITIONAL(MAKEPYTHONCHM, [test X$pythonChmEnabled = Xyes])
|
||||
|
||||
|
||||
AC_CHECK_FUNCS(mkdtemp)
|
||||
AC_CHECK_LIB([pthread], [pthread_create], [], [])
|
||||
@ -523,6 +530,6 @@ AC_SUBST(RCLLIBVERSION)
|
||||
AC_CONFIG_FILES(Makefile)
|
||||
AC_CONFIG_FILES(common/rclversion.h)
|
||||
AC_CONFIG_FILES(python/recoll/setup.py)
|
||||
AC_CONFIG_FILES(python/recoll/Makefile)
|
||||
AC_CONFIG_FILES(python/pychm/setup.py)
|
||||
|
||||
AC_OUTPUT
|
||||
|
||||
@ -1,4 +1,7 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/python2
|
||||
#
|
||||
# wnck does not have a python3 binding as far as I can see (or at
|
||||
# least it's not packaged by, e.g. Debian). So python2 only for now.
|
||||
#
|
||||
# This script should be linked to a keyboard shortcut. Under gnome,
|
||||
# you can do this from the main preferences menu, or directly execute
|
||||
|
||||
@ -1,8 +1,10 @@
|
||||
[Desktop Entry]
|
||||
Categories=Utility;Filesystem;Database;
|
||||
Categories=Qt;Utility;Filesystem;Database;
|
||||
Comment=Find documents by specifying search terms
|
||||
Comment[ru]=Поиск документов по заданным условиям
|
||||
Exec=recoll
|
||||
GenericName=Local Text Search
|
||||
GenericName[ru]=Локальный текстовый поиск
|
||||
Icon=recoll
|
||||
Name=Recoll
|
||||
Terminal=false
|
||||
|
||||
@ -10,7 +10,7 @@
|
||||
<link rel="stylesheet" type="text/css" href="docbook-xsl.css">
|
||||
<meta name="generator" content="DocBook XSL Stylesheets V1.79.1">
|
||||
<meta name="description" content=
|
||||
"Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.3 or any later version published by the Free Software Foundation; with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. A copy of the license can be found at the following location: GNU web site. This document introduces full text search notions and describes the installation and use of the Recoll application. This version describes Recoll 1.23.">
|
||||
"Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.3 or any later version published by the Free Software Foundation; with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. A copy of the license can be found at the following location: GNU web site. This document introduces full text search notions and describes the installation and use of the Recoll application. This version describes Recoll 1.24.">
|
||||
</head>
|
||||
<body bgcolor="white" text="black" link="#0000FF" vlink="#840084"
|
||||
alink="#0000FF">
|
||||
@ -53,7 +53,7 @@ alink="#0000FF">
|
||||
and describes the installation and use of the
|
||||
<span class="application">Recoll</span> application.
|
||||
This version describes <span class=
|
||||
"application">Recoll</span> 1.23.</p>
|
||||
"application">Recoll</span> 1.24.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@ -376,9 +376,15 @@ alink="#0000FF">
|
||||
<dt><span class="sect2">6.3.1. <a href=
|
||||
"#RCL.INSTALL.BUILDING.PREREQS">Prerequisites</a></span></dt>
|
||||
<dt><span class="sect2">6.3.2. <a href=
|
||||
"#RCL.INSTALL.BUILDING.BUILD">Building</a></span></dt>
|
||||
"#RCL.INSTALL.BUILDING.BUILDING">Building</a></span></dt>
|
||||
<dt><span class="sect2">6.3.3. <a href=
|
||||
"#RCL.INSTALL.BUILDING.INSTALL">Installation</a></span></dt>
|
||||
"#RCL.INSTALL.BUILDING.INSTALL">Installing</a></span></dt>
|
||||
<dt><span class="sect2">6.3.4. <a href=
|
||||
"#RCL.INSTALL.BUILDING.PYTHON">Python API
|
||||
package</a></span></dt>
|
||||
<dt><span class="sect2">6.3.5. <a href=
|
||||
"#RCL.INSTALL.BUILDING.SOLARIS">Building on
|
||||
Solaris</a></span></dt>
|
||||
</dl>
|
||||
</dd>
|
||||
<dt><span class="sect1">6.4. <a href=
|
||||
@ -428,7 +434,7 @@ alink="#0000FF">
|
||||
<p>This document introduces full text search notions and
|
||||
describes the installation and use of the <span class=
|
||||
"application">Recoll</span> application. It is updated for
|
||||
<span class="application">Recoll</span> 1.23.</p>
|
||||
<span class="application">Recoll</span> 1.24.</p>
|
||||
<p><span class="application">Recoll</span> was for a long
|
||||
time dedicated to Unix-like systems. It was only lately
|
||||
(2015) ported to <span class="application">MS-Windows</span>.
|
||||
@ -2128,8 +2134,8 @@ alink="#0000FF">
|
||||
grow quite big, depending on the log level.</p>
|
||||
<p>When building <span class="application">Recoll</span>,
|
||||
the real time indexing support can be customised during
|
||||
package <a class="link" href="#RCL.INSTALL.BUILDING.BUILD"
|
||||
title="6.3.2. Building">configuration</a> with the
|
||||
package <a class="link" href="#RCL.INSTALL.BUILDING" title=
|
||||
"6.3. Building from source">configuration</a> with the
|
||||
<code class="option">--with[out]-fam</code> or <code class=
|
||||
"option">--with[out]-inotify</code> options. The default is
|
||||
currently to include <span class=
|
||||
@ -6170,31 +6176,13 @@ recollindex -c "$confdir"
|
||||
here. A paragraph at the end of this section will explain
|
||||
a few differences and ways to write code compatible with
|
||||
both versions.</p>
|
||||
<p>The Python interface can be found in the source
|
||||
package, under <code class=
|
||||
"filename">python/recoll</code>.</p>
|
||||
<p>The <code class="filename">python/recoll/</code>
|
||||
directory contains the usual <code class=
|
||||
"filename">setup.py</code>. After configuring the main
|
||||
<span class="application">Recoll</span> code, you can use
|
||||
the script to build and install the Python module:</p>
|
||||
<pre class="screen">
|
||||
<strong class=
|
||||
"userinput"><code>cd recoll-xxx/python/recoll</code></strong>
|
||||
<strong class=
|
||||
"userinput"><code>python setup.py build</code></strong>
|
||||
<strong class=
|
||||
"userinput"><code>python setup.py install</code></strong>
|
||||
</pre>
|
||||
<p>As of <span class="application">Recoll</span> 1.19,
|
||||
the module can be compiled for Python3.</p>
|
||||
<p>The normal <span class="application">Recoll</span>
|
||||
installer installs the Python2 API along with the main
|
||||
code. The Python3 version must be explicitly built and
|
||||
installed.</p>
|
||||
<p>When installing from a repository, and depending on
|
||||
the distribution, the Python API can sometimes be found
|
||||
in a separate package.</p>
|
||||
<p>There is a good chance that your system repository has
|
||||
packages for the Recoll Python API, sometimes in a
|
||||
package separate from the main one (maybe named something
|
||||
like python-recoll). Else refer to the <a class="link"
|
||||
href="#RCL.INSTALL.BUILDING" title=
|
||||
"6.3. Building from source">Building from source
|
||||
chapter</a>.</p>
|
||||
<p>As an introduction, the following small sample will
|
||||
run a query and list the title and url for each of the
|
||||
results. It would work with <span class=
|
||||
@ -6216,6 +6204,13 @@ recollindex -c "$confdir"
|
||||
for doc in results:
|
||||
print(doc.url, doc.title)
|
||||
</pre>
|
||||
<p>You can also take a look at the source for the
|
||||
<a class="ulink" href=
|
||||
"https://github.com/koniu/recoll-webui" target=
|
||||
"_top">Recoll WebUI</a>, or the <a class="ulink" href=
|
||||
"https://opensourceprojects.eu/p/upmpdcli/code/ci/c8c8e75bd181ad9db2df14da05934e53ca867a06/tree/src/mediaserver/cdplugins/uprcl/uprclfolders.py"
|
||||
target="_top">upmpdcli local media server</a>, which are
|
||||
both based on the Python API.</p>
|
||||
</div>
|
||||
<div class="sect2">
|
||||
<div class="titlepage">
|
||||
@ -7575,8 +7570,8 @@ for i in range(nres):
|
||||
<div>
|
||||
<div>
|
||||
<h3 class="title"><a name=
|
||||
"RCL.INSTALL.BUILDING.BUILD" id=
|
||||
"RCL.INSTALL.BUILDING.BUILD"></a>6.3.2. Building</h3>
|
||||
"RCL.INSTALL.BUILDING.BUILDING" id=
|
||||
"RCL.INSTALL.BUILDING.BUILDING"></a>6.3.2. Building</h3>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@ -7718,7 +7713,7 @@ for i in range(nres):
|
||||
<strong class=
|
||||
"userinput"><code>(practices usual hardship-repelling invocations)</code></strong>
|
||||
</pre>
|
||||
<p>When building from source cloned from the BitBucket
|
||||
<p>When building from source cloned from the git
|
||||
repository, you also need to install <span class=
|
||||
"application">autoconf</span>, <span class=
|
||||
"application">automake</span>, and <span class=
|
||||
@ -7726,29 +7721,6 @@ for i in range(nres):
|
||||
<code class="literal">sh autogen.sh</code> in the top
|
||||
source directory before running <code class=
|
||||
"literal">configure</code>.</p>
|
||||
<div class="sect3">
|
||||
<div class="titlepage">
|
||||
<div>
|
||||
<div>
|
||||
<h4 class="title"><a name=
|
||||
"RCL.INSTALL.BUILDING.BUILD.SOLARIS" id=
|
||||
"RCL.INSTALL.BUILDING.BUILD.SOLARIS"></a>6.3.2.1. Building
|
||||
on Solaris</h4>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<p>We did not test building the GUI on Solaris for
|
||||
recent versions. You will need at least Qt 4.4. There
|
||||
are some hints on <a class="ulink" href=
|
||||
"http://www.recoll.org/download-1.14.html" target=
|
||||
"_top">an old web site page</a>, they may still be
|
||||
valid.</p>
|
||||
<p>Someone did test the 1.19 indexer and Python module
|
||||
build, they do work, with a few minor glitches. Be sure
|
||||
to use GNU <span class=
|
||||
"command"><strong>make</strong></span> and <span class=
|
||||
"command"><strong>install</strong></span>.</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="sect2">
|
||||
<div class="titlepage">
|
||||
@ -7756,7 +7728,7 @@ for i in range(nres):
|
||||
<div>
|
||||
<h3 class="title"><a name=
|
||||
"RCL.INSTALL.BUILDING.INSTALL" id=
|
||||
"RCL.INSTALL.BUILDING.INSTALL"></a>6.3.3. Installation</h3>
|
||||
"RCL.INSTALL.BUILDING.INSTALL"></a>6.3.3. Installing</h3>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@ -7769,6 +7741,66 @@ for i in range(nres):
|
||||
to <code class="filename"><em class=
|
||||
"replaceable"><code>prefix</code></em>/share/recoll</code>.</p>
|
||||
</div>
|
||||
<div class="sect2">
|
||||
<div class="titlepage">
|
||||
<div>
|
||||
<div>
|
||||
<h3 class="title"><a name=
|
||||
"RCL.INSTALL.BUILDING.PYTHON" id=
|
||||
"RCL.INSTALL.BUILDING.PYTHON"></a>6.3.4. Python
|
||||
API package</h3>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<p>The Python interface can be found in the source tree,
|
||||
under the <code class="filename">python/recoll</code>
|
||||
directory.</p>
|
||||
<p>As of <span class="application">Recoll</span> 1.19,
|
||||
the module can be compiled for Python3.</p>
|
||||
<p>The normal <span class="application">Recoll</span>
|
||||
build procedure (see above) installs the API package for
|
||||
the default system version (python) along with the main
|
||||
code. The package for other Python versions (e.g. python3
|
||||
if the system default is python2) must be explicitly
|
||||
built and installed.</p>
|
||||
<p>The <code class="filename">python/recoll/</code>
|
||||
directory contains the usual <code class=
|
||||
"filename">setup.py</code>. After configuring and
|
||||
building the main <span class="application">Recoll</span>
|
||||
code, you can use the script to build and install the
|
||||
Python module:</p>
|
||||
<pre class="screen">
|
||||
<strong class=
|
||||
"userinput"><code>cd recoll-xxx/python/recoll</code></strong>
|
||||
<strong class=
|
||||
"userinput"><code>pythonX setup.py build</code></strong>
|
||||
<strong class=
|
||||
"userinput"><code>sudo pythonX setup.py install</code></strong>
|
||||
</pre>
|
||||
</div>
|
||||
<div class="sect2">
|
||||
<div class="titlepage">
|
||||
<div>
|
||||
<div>
|
||||
<h3 class="title"><a name=
|
||||
"RCL.INSTALL.BUILDING.SOLARIS" id=
|
||||
"RCL.INSTALL.BUILDING.SOLARIS"></a>6.3.5. Building
|
||||
on Solaris</h3>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<p>We did not test building the GUI on Solaris for recent
|
||||
versions. You will need at least Qt 4.4. There are some
|
||||
hints on <a class="ulink" href=
|
||||
"http://www.recoll.org/download-1.14.html" target=
|
||||
"_top">an old web site page</a>, they may still be
|
||||
valid.</p>
|
||||
<p>Someone did test the 1.19 indexer and Python module
|
||||
build, they do work, with a few minor glitches. Be sure
|
||||
to use GNU <span class=
|
||||
"command"><strong>make</strong></span> and <span class=
|
||||
"command"><strong>install</strong></span>.</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="sect1">
|
||||
<div class="titlepage">
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
|
||||
<!ENTITY RCL "<application>Recoll</application>">
|
||||
<!ENTITY RCLAPPS "<ulink url='http://www.recoll.org/features.html#doctypes'>http://www.recoll.org/features.html</ulink>">
|
||||
<!ENTITY RCLVERSION "1.23">
|
||||
<!ENTITY RCLVERSION "1.24">
|
||||
<!ENTITY XAP "<application>Xapian</application>">
|
||||
<!ENTITY WIN "<application>Windows</application>">
|
||||
<!ENTITY FAQS "https://www.lesbonscomptes.com/recoll/faqsandhowtos/">
|
||||
@ -1470,7 +1470,7 @@
|
||||
|
||||
<para>When building &RCL;, the real time indexing support can be
|
||||
customised during package <link
|
||||
linkend="RCL.INSTALL.BUILDING.BUILD">configuration</link> with
|
||||
linkend="RCL.INSTALL.BUILDING">configuration</link> with
|
||||
the <option>--with[out]-fam</option> or
|
||||
<option>--with[out]-inotify</option> options. The default is
|
||||
currently to include <application>inotify</application>
|
||||
@ -4817,30 +4817,11 @@ recollindex -c "$confdir"
|
||||
paragraph at the end of this section will explain a few differences
|
||||
and ways to write code compatible with both versions.</para>
|
||||
|
||||
<para>The Python interface can be found in the source package,
|
||||
under <filename>python/recoll</filename>.</para>
|
||||
|
||||
<para>The <filename>python/recoll/</filename> directory
|
||||
contains the usual <filename>setup.py</filename>. After
|
||||
configuring the main &RCL; code, you can use the script to
|
||||
build and install the Python module:
|
||||
<screen>
|
||||
<userinput>cd recoll-xxx/python/recoll</userinput>
|
||||
<userinput>python setup.py build</userinput>
|
||||
<userinput>python setup.py install</userinput>
|
||||
</screen>
|
||||
</para>
|
||||
|
||||
<para>As of &RCL; 1.19, the module can be compiled for
|
||||
Python3.</para>
|
||||
|
||||
<para>The normal &RCL; installer installs the Python2
|
||||
API along with the main code. The Python3 version must be
|
||||
explicitly built and installed.</para>
|
||||
|
||||
<para>When installing from a repository, and depending on the
|
||||
distribution, the Python API can sometimes be found in a
|
||||
separate package.</para>
|
||||
<para>There is a good chance that your system repository has
|
||||
packages for the Recoll Python API, sometimes in a package separate
|
||||
from the main one (maybe named something like python-recoll). Else
|
||||
refer to the <link linkend="RCL.INSTALL.BUILDING">Building from
|
||||
source chapter</link>.</para>
|
||||
|
||||
<para>As an introduction, the following small sample will run a
|
||||
query and list the title and url for each of the results. It would
|
||||
@ -4863,6 +4844,11 @@ recollindex -c "$confdir"
|
||||
print(doc.url, doc.title)
|
||||
]]></programlisting>
|
||||
|
||||
<para>You can also take a look at the source for the <ulink
|
||||
url="https://github.com/koniu/recoll-webui">Recoll
|
||||
WebUI</ulink>, or the <ulink url="https://opensourceprojects.eu/p/upmpdcli/code/ci/c8c8e75bd181ad9db2df14da05934e53ca867a06/tree/src/mediaserver/cdplugins/uprcl/uprclfolders.py">upmpdcli local media server</ulink>, which are both
|
||||
based on the Python API.</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="RCL.PROGRAM.PYTHONAPI.ELEMENTS">
|
||||
@ -5894,7 +5880,7 @@ for i in range(nres):
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="RCL.INSTALL.BUILDING.BUILD">
|
||||
<sect2 id="RCL.INSTALL.BUILDING.BUILDING">
|
||||
<title>Building</title>
|
||||
|
||||
<para>&RCL; has been built on Linux, FreeBSD, Mac OS X, and Solaris,
|
||||
@ -6010,30 +5996,16 @@ for i in range(nres):
|
||||
<userinput>(practices usual hardship-repelling invocations)</userinput>
|
||||
</screen>
|
||||
|
||||
<para>When building from source cloned from the BitBucket repository,
|
||||
<para>When building from source cloned from the git repository,
|
||||
you also need to install <application>autoconf</application>,
|
||||
<application>automake</application>, and
|
||||
<application>libtool</application> and you must execute <literal>sh
|
||||
autogen.sh</literal> in the top source directory before running
|
||||
<literal>configure</literal>.</para>
|
||||
|
||||
<sect3 id="RCL.INSTALL.BUILDING.BUILD.SOLARIS">
|
||||
<title>Building on Solaris</title>
|
||||
|
||||
<para>We did not test building the GUI on Solaris for recent
|
||||
versions. You will need at least Qt 4.4. There are some hints
|
||||
on <ulink url="http://www.recoll.org/download-1.14.html">an old
|
||||
web site page</ulink>, they may still be valid.</para>
|
||||
|
||||
<para>Someone did test the 1.19 indexer and Python module build,
|
||||
they do work, with a few minor glitches. Be sure to use
|
||||
GNU <command>make</command> and <command>install</command>.</para>
|
||||
</sect3>
|
||||
|
||||
</sect2>
|
||||
|
||||
|
||||
<sect2 id="RCL.INSTALL.BUILDING.INSTALL">
|
||||
<title>Installation</title>
|
||||
<title>Installing</title>
|
||||
|
||||
<para>Use <userinput>make install</userinput>
|
||||
in the root
|
||||
@ -6045,6 +6017,48 @@ for i in range(nres):
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="RCL.INSTALL.BUILDING.PYTHON">
|
||||
<title>Python API package</title>
|
||||
|
||||
<para>The Python interface can be found in the source tree,
|
||||
under the <filename>python/recoll</filename> directory.</para>
|
||||
|
||||
<para>As of &RCL; 1.19, the module can be compiled for
|
||||
Python3.</para>
|
||||
|
||||
<para>The normal &RCL; build procedure (see above) installs the API
|
||||
package for the default system version (python) along with the main
|
||||
code. The package for other Python versions (e.g. python3 if the
|
||||
system default is python2) must be explicitly built and
|
||||
installed.</para>
|
||||
|
||||
<para>The <filename>python/recoll/</filename> directory contains
|
||||
the usual <filename>setup.py</filename>. After configuring and
|
||||
building the main &RCL; code, you can use the script to build and
|
||||
install the Python module:
|
||||
<screen>
|
||||
<userinput>cd recoll-xxx/python/recoll</userinput>
|
||||
<userinput>pythonX setup.py build</userinput>
|
||||
<userinput>sudo pythonX setup.py install</userinput>
|
||||
</screen>
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="RCL.INSTALL.BUILDING.SOLARIS">
|
||||
<title>Building on Solaris</title>
|
||||
|
||||
<para>We did not test building the GUI on Solaris for recent
|
||||
versions. You will need at least Qt 4.4. There are some hints
|
||||
on <ulink url="http://www.recoll.org/download-1.14.html">an old
|
||||
web site page</ulink>, they may still be valid.</para>
|
||||
|
||||
<para>Someone did test the 1.19 indexer and Python module build,
|
||||
they do work, with a few minor glitches. Be sure to use
|
||||
GNU <command>make</command> and <command>install</command>.</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="RCL.INSTALL.CONFIG">
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# 7-Zip file filter for Recoll
|
||||
|
||||
|
||||
@ -1,91 +1,28 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclabw,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Extract text from an abiword file
|
||||
#================================================================
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
######################################
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclabw"
|
||||
filetype=abiword
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import rclexecm
|
||||
import rclgenxslt
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller
|
||||
senderror()
|
||||
{
|
||||
echo RECFILTERROR $*
|
||||
# Also alert on stderr just in case
|
||||
echo ":2:$progname::: $*" 1>&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
iscmd()
|
||||
{
|
||||
cmd=$1
|
||||
case $cmd in
|
||||
*/*)
|
||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
||||
*)
|
||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
||||
return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
checkcmds()
|
||||
{
|
||||
for cmd in $*;do
|
||||
if iscmd $cmd
|
||||
then
|
||||
a=1
|
||||
else
|
||||
senderror HELPERNOTFOUND $cmd
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds xsltproc
|
||||
|
||||
xsltproc --nonet --novalid - "$infile" <<EOF
|
||||
<?xml version="1.0"?>
|
||||
stylesheet_all = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:ab="http://www.abisource.com/awml.dtd"
|
||||
@ -173,7 +110,9 @@ xsltproc --nonet --novalid - "$infile" <<EOF
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
EOF
|
||||
'''
|
||||
|
||||
# exit normally
|
||||
exit 0
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all)
|
||||
rclexecm.main(proto, extract)
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Audio tag filter for Recoll, using mutagen
|
||||
|
||||
@ -164,6 +164,13 @@ tagdict = {
|
||||
'\xa9wrt' : 'COMPOSER',
|
||||
}
|
||||
|
||||
def tobytes(s):
    """Return `s` as a bytes object, encoding text as UTF-8.

    - bytes (or any bytes subclass) is returned unchanged;
    - text (unicode) is encoded to UTF-8, unencodable characters being
      replaced rather than raising;
    - any other object is first converted with str(), then encoded the
      same way.
    """
    # isinstance() rather than an exact type comparison: a bytes subclass
    # must be passed through as-is, not stringified to its "b'...'" repr.
    if isinstance(s, bytes):
        return s
    # type(u'') is the text type on both Python 2 and Python 3.
    if not isinstance(s, type(u'')):
        s = str(s)
    return s.encode('utf-8', errors='replace')
|
||||
|
||||
# mp3: album, title, artist, genre, date, tracknumber
|
||||
# flac: album, title, artist, genre, xxx, tracknumber
|
||||
# oggvorbis:album, title, artist, genre, date, tracknumber
|
||||
@ -236,6 +243,7 @@ class AudioTagExtractor:
|
||||
filename = params["filename:"]
|
||||
mimetype = params["mimetype:"]
|
||||
self.filename = filename
|
||||
#self.em.rclog("%s" % filename)
|
||||
try:
|
||||
mutf = File(filename)
|
||||
except Exception as err:
|
||||
@ -247,6 +255,7 @@ class AudioTagExtractor:
|
||||
###################
|
||||
# Extract audio parameters. Not all file types supply all or
|
||||
# even use the same property names...
|
||||
# minf has natural str keys, and encoded values
|
||||
minf = {}
|
||||
for prop,dflt in [('sample_rate', 44100), ('channels', 2),
|
||||
('length', 0), ('bitrate', 0)]:
|
||||
@ -258,7 +267,7 @@ class AudioTagExtractor:
|
||||
|
||||
if minf['bitrate'] == 0 and minf['length'] > 0:
|
||||
br = int(os.path.getsize(filename)* 8 / minf['length'])
|
||||
minf['bitrate'] = str(br)
|
||||
minf['bitrate'] = br
|
||||
|
||||
minf['duration'] = minf['length']
|
||||
del minf['length']
|
||||
@ -274,41 +283,37 @@ class AudioTagExtractor:
|
||||
minf['bits_per_sample'] = 16
|
||||
|
||||
for tag,val in minf.items():
|
||||
minf[tag] = str(val)
|
||||
|
||||
#self.em.rclog("minf after audio %s\n" % minf)
|
||||
|
||||
minf[tag] = tobytes(val)
|
||||
|
||||
####################
|
||||
# Metadata tags. The names vary depending on the file type. We
|
||||
# just have a big translation dictionary for all
|
||||
for tag,val in mutf.items():
|
||||
#self.em.rclog("Original tag: <%s>, val <%s>" % (tag, val))
|
||||
if tag.upper() in tagdict:
|
||||
tag = tag.upper()
|
||||
if tag in tagdict:
|
||||
#self.em.rclog("Original tag: <%s>, type0 %s val <%s>" %
|
||||
# (tag, type(val), val))
|
||||
# Some file types return lists of value (e.g. FLAC)
|
||||
try:
|
||||
val = " ".join(val)
|
||||
#self.em.rclog("Joined tag: <%s>, type0 %s val <%s>" %
|
||||
# (tag, type(val), val))
|
||||
except:
|
||||
pass
|
||||
ntag = tagdict[tag].lower()
|
||||
#self.em.rclog("New tag: %s" % ntag)
|
||||
try:
|
||||
if isinstance(val, bool):
|
||||
val0 = str(val)
|
||||
else:
|
||||
try:
|
||||
val0 = val[0]
|
||||
except:
|
||||
val0 = val
|
||||
if val0:
|
||||
if type(val0) == type(u""):
|
||||
val0 = val0.encode('utf-8', errors='replace')
|
||||
else:
|
||||
val0 = str(val0)
|
||||
minf[ntag] = val0
|
||||
#self.em.rclog("Tag %s -> %s" % (ntag, val0))
|
||||
minf[ntag] = tobytes(val)
|
||||
#self.em.rclog("Tag %s -> %s" % (ntag, val))
|
||||
except Exception as err:
|
||||
self.em.rclog("Error while extracting tag: %s"%err)
|
||||
else:
|
||||
#self.em.rclog("Unprocessed tag: %s, value %s"%(tag,val))
|
||||
pass
|
||||
|
||||
#self.em.rclog("minf after extract %s\n" % minf)
|
||||
|
||||
# TPA,TPOS,disc DISCNUMBER/TOTALDISCS
|
||||
# TRCK,TRK,trkn TRACKNUMBER/TOTALTRACKS
|
||||
for what in ('disc', 'track'):
|
||||
@ -322,16 +327,17 @@ class AudioTagExtractor:
|
||||
else:
|
||||
l = l.split(b'/')
|
||||
else:
|
||||
self.em.rclog("l is tuple: %s" %l)
|
||||
self.em.rclog("l is tuple: %s tp1 %s tp2 %S" %
|
||||
(l, type(l[0]), type(l[1])))
|
||||
if len(l) == 2:
|
||||
minf[k] = str(l[0])
|
||||
minf[k] = l[0]
|
||||
#self.em.rclog("minf[%s] = %s" % (k, minf[k]))
|
||||
if l[1] != 0:
|
||||
minf['total' + what + 's'] = str(l[1])
|
||||
minf['total' + what + 's'] = l[1]
|
||||
|
||||
if 'orchestra' in minf:
|
||||
val = minf['orchestra']
|
||||
if val.startswith('orchestra='):
|
||||
if val.startswith(b'orchestra='):
|
||||
minf['orchestra'] = val[10:]
|
||||
|
||||
#self.em.rclog("minf after tags %s\n" % minf)
|
||||
@ -340,7 +346,7 @@ class AudioTagExtractor:
|
||||
embdimg = self._embeddedImageFormat(mutf)
|
||||
if embdimg:
|
||||
#self.em.rclog("Embedded image format: %s" % embdimg)
|
||||
minf["embdimg"] = embdimg
|
||||
embdimg = tobytes(embdimg)
|
||||
|
||||
self.em.setmimetype("text/plain")
|
||||
self.em.setfield("charset", 'utf-8')
|
||||
@ -353,7 +359,7 @@ class AudioTagExtractor:
|
||||
self.em.setfield('author', val)
|
||||
|
||||
try:
|
||||
docdata = mutf.pprint().encode('utf-8', errors='replace')
|
||||
docdata = tobytes(mutf.pprint())
|
||||
except Exception as err:
|
||||
self.em.rclog("Doc pprint error: %s" % err)
|
||||
|
||||
|
||||
64
src/filters/rclbasehandler.py
Normal file
64
src/filters/rclbasehandler.py
Normal file
@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2016 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
# Base for extractor classes. With some common generic implementations
|
||||
# for the boilerplate functions.
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
import rclexecm
|
||||
|
||||
class RclBaseHandler(object):
    '''Base for extractor classes handling one document per file.

    Supplies generic implementations of the handler entry points used by
    rclexecm (openfile, getipath, getnext). This class does not define
    html_text(); extractone() calls self.html_text(filename), so a
    subclass has to provide it.

    All the data-returning methods return a 4-tuple:
    (ok, data, "", eof) where ok is a success boolean, data is whatever
    html_text() returned (or "" on failure), and eof is one of the
    rclexecm.RclExecM eof constants (eofnext on success, eofnow on
    failure / end of data).
    '''

    def __init__(self, em):
        # em: the object used for logging (rclog) and for setting the
        # output MIME type (setmimetype).
        self.em = em
        # Also set in openfile(). Initialised here so that getnext() does
        # not fail on a missing attribute if it happens to be called first.
        self.currentindex = 0

    def extractone(self, params):
        '''Convert the file named by params["filename:"].

        Logs and returns a failure tuple if the file name is missing or
        if html_text() raises. On success, sets the MIME type to
        text/html and returns the converted data.
        '''
        #self.em.rclog("extractone %s %s" % (params["filename:"], \
        #params["mimetype:"]))
        if "filename:" not in params:
            self.em.rclog("extractone: no file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            html = self.html_text(fn)
        except Exception as err:
            # Conversion errors are reported to the caller through the
            # return value, they must not propagate.
            self.em.rclog("RclBaseHandler: %s : %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        self.em.setmimetype('text/html')
        return (True, html, "", rclexecm.RclExecM.eofnext)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        '''Reset the document counter. Always returns True.'''
        self.currentindex = 0
        return True

    def getipath(self, params):
        '''Return the document data: same as extractone().'''
        return self.extractone(params)

    def getnext(self, params):
        '''Return the document on the first call after openfile(), then
        a failure/eofnow tuple on the following ones.'''
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret
|
||||
@ -1,12 +1,9 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
"""Extract Html files from a Microsoft Compiled Html Help file (.chm)
|
||||
Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# Note: this is not converted to Py3, libchm does not have a
|
||||
# Py3 wrapper at this point (2018-03)
|
||||
|
||||
rclchm_html_mtype = "text/html"
|
||||
|
||||
import sys
|
||||
@ -18,21 +15,32 @@ if PY3:
|
||||
from urllib.parse import unquote as urllib_unquote
|
||||
from urllib.parse import urlparse as urlparse_urlparse
|
||||
from html.parser import HTMLParser
|
||||
chmpackname = 'pychm3.egg'
|
||||
else:
|
||||
from urlparse import urlparse as urlparse_urlparse
|
||||
from urllib import unquote as urllib_unquote
|
||||
from HTMLParser import HTMLParser
|
||||
|
||||
chmpackname = 'pychm2.egg'
|
||||
|
||||
import subprocess
|
||||
|
||||
import rclconfig
|
||||
import rclexecm
|
||||
|
||||
# pychm has no official port to Python3, hence no package in the
|
||||
# standard place. Recoll bundles a python3 port which we install out
|
||||
# of the standard python places. Look for it:
|
||||
# sys.path[0] is for MSW, where we install the egg in the filters
|
||||
# directory? TBD for now
|
||||
try:
|
||||
# First try the system version if any
|
||||
from chm import chm,chmlib
|
||||
except:
|
||||
print("RECFILTERROR HELPERNOTFOUND python:chm")
|
||||
sys.exit(1);
|
||||
try:
|
||||
from recollchm import chm,chmlib
|
||||
except:
|
||||
print("RECFILTERROR HELPERNOTFOUND python:chm")
|
||||
sys.exit(1);
|
||||
|
||||
# Small helper routines
|
||||
def getfile(chmfile, path):
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import print_function
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2016 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import print_function
|
||||
|
||||
import rclexecm
|
||||
|
||||
@ -17,11 +17,11 @@
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
#
|
||||
# Extract text from a dvi file by either executing dvitops and rclps
|
||||
# or using catdvi. dvitops has given better results during tests, and is
|
||||
# chosen first if available, but the dvitops/rclps combination is much
|
||||
# slower than catdvi
|
||||
# set variables
|
||||
# Extract text from a dvi file by either executing dvitops and
|
||||
# pstotext or using catdvi. dvitops has given better results during
|
||||
# tests, and is chosen first if available, but the dvitops/pstotext
|
||||
# combination is much slower than catdvi. In any case,
|
||||
# the program is not too good with special characters (e.g. ligatures)
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rcldvi"
|
||||
@ -94,26 +94,25 @@ umask 77
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
# Find rclps. Note: this only works because we are always executed with a
|
||||
# full path
|
||||
rclps=`dirname $0`/rclps
|
||||
|
||||
decoderdvips()
|
||||
{
|
||||
dvips -f $1 2> /dev/null | pstotext | iconv -f cp1252 -t utf-8 -c -s
|
||||
}
|
||||
decodercatdvi()
|
||||
{
|
||||
catdvi $1
|
||||
}
|
||||
decoder=""
|
||||
if iscmd dvips -a iscmd pstotext ; then
|
||||
decoder=dvips
|
||||
decoder=decoderdvips
|
||||
elif iscmd catdvi ; then
|
||||
decoder=catdvi
|
||||
decoder=decodercatdvi
|
||||
fi
|
||||
|
||||
if test X$decoder = X ; then
|
||||
senderror HELPERNOTFOUND dvips or catdvi
|
||||
fi
|
||||
|
||||
if test X$decoder = Xdvips ; then
|
||||
$decoder -f < "$infile" 2> /dev/null | $rclps -
|
||||
exit $?
|
||||
fi
|
||||
|
||||
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
|
||||
# is an awk program
|
||||
$decoder "$infile" |
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
"""Extract Html content from an EPUB file (.epub)"""
|
||||
from __future__ import print_function
|
||||
|
||||
@ -12,6 +12,7 @@ import subprocess
|
||||
import rclexecm
|
||||
import rclconfig
|
||||
|
||||
sys.path.append(sys.path[0]+"/recollepub.zip")
|
||||
try:
|
||||
import epub
|
||||
except:
|
||||
@ -112,7 +113,7 @@ class rclEPUB:
|
||||
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params["ipath:"])
|
||||
return self.extractone(params["ipath:"].decode('UTF-8'))
|
||||
|
||||
def getnext(self, params):
|
||||
if self.catenate:
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
"""Extract Html content from an EPUB file (.chm), concatenating all sections"""
|
||||
from __future__ import print_function
|
||||
|
||||
@ -8,6 +8,7 @@ import re
|
||||
|
||||
import rclexecm
|
||||
|
||||
sys.path.append(sys.path[0]+"/recollepub.zip")
|
||||
try:
|
||||
import epub
|
||||
except:
|
||||
|
||||
@ -30,18 +30,18 @@ from __future__ import print_function
|
||||
|
||||
import subprocess
|
||||
import rclexecm
|
||||
from rclbasehandler import RclBaseHandler
|
||||
|
||||
# This class has the code to execute the subprocess and call a
|
||||
# data-specific post-processor. Command and processor are supplied by
|
||||
# the object which we receive as a parameter, which in turn is defined
|
||||
# in the actual executable filter (e.g. rcldoc.py)
|
||||
class Executor:
|
||||
class Executor(RclBaseHandler):
|
||||
opt_ignxval = 1
|
||||
|
||||
def __init__(self, em, flt):
|
||||
self.em = em
|
||||
super(Executor, self).__init__(em)
|
||||
self.flt = flt
|
||||
self.currentindex = 0
|
||||
|
||||
def runCmd(self, cmd, filename, postproc, opt):
|
||||
''' Substitute parameters and execute command, process output
|
||||
@ -109,19 +109,4 @@ class Executor:
|
||||
return (ok, data, "", rclexecm.RclExecM.eofnext)
|
||||
else:
|
||||
return (ok, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
|
||||
|
||||
@ -1,139 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclopxml,v 1.3 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
#================================================================
|
||||
# Extract text from an fb2 ebook (xml)
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname=rclfb2
|
||||
filetype=fb2
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller
|
||||
senderror()
|
||||
{
|
||||
echo RECFILTERROR $*
|
||||
# Also alert on stderr just in case
|
||||
echo ":2:$progname::: $*" 1>&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
iscmd()
|
||||
{
|
||||
cmd=$1
|
||||
case $cmd in
|
||||
*/*)
|
||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
||||
*)
|
||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
||||
return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
checkcmds()
|
||||
{
|
||||
for cmd in $*;do
|
||||
if iscmd $cmd
|
||||
then
|
||||
a=1
|
||||
else
|
||||
senderror HELPERNOTFOUND $cmd
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds xsltproc
|
||||
|
||||
xsltproc --nonet --novalid - "$infile" <<EOF
|
||||
<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0"
|
||||
exclude-result-prefixes="fb"
|
||||
>
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:template match="/fb:FictionBook">
|
||||
<html>
|
||||
<xsl:apply-templates select="fb:description"/>
|
||||
<xsl:apply-templates select="fb:body"/>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:description">
|
||||
<head>
|
||||
<xsl:apply-templates select="fb:title-info"/>
|
||||
</head><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:description/fb:title-info">
|
||||
<xsl:apply-templates select="fb:book-title"/>
|
||||
<xsl:apply-templates select="fb:author"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:description/fb:title-info/fb:book-title">
|
||||
<title> <xsl:value-of select="."/> </title>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:description/fb:title-info/fb:author">
|
||||
<meta>
|
||||
<xsl:attribute name="name">author</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="fb:first-name"/><xsl:text> </xsl:text>
|
||||
<xsl:value-of select="fb:middle-name"/><xsl:text> </xsl:text>
|
||||
<xsl:value-of select="fb:last-name"/>
|
||||
</xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:body">
|
||||
<body>
|
||||
<xsl:apply-templates select="fb:section"/>
|
||||
</body>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:body/fb:section">
|
||||
<xsl:for-each select="fb:p">
|
||||
<p><xsl:value-of select="."/></p>
|
||||
</xsl:for-each>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
EOF
|
||||
87
src/filters/rclfb2.py
Executable file
87
src/filters/rclfb2.py
Executable file
@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
######################################
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import rclexecm
|
||||
import rclxslt
|
||||
import rclgenxslt
|
||||
|
||||
stylesheet_all = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0"
|
||||
exclude-result-prefixes="fb"
|
||||
>
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:template match="/fb:FictionBook">
|
||||
<html>
|
||||
<xsl:apply-templates select="fb:description"/>
|
||||
<xsl:apply-templates select="fb:body"/>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:description">
|
||||
<head>
|
||||
<xsl:apply-templates select="fb:title-info"/>
|
||||
</head><xsl:text>
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:description/fb:title-info">
|
||||
<xsl:apply-templates select="fb:book-title"/>
|
||||
<xsl:apply-templates select="fb:author"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:description/fb:title-info/fb:book-title">
|
||||
<title> <xsl:value-of select="."/> </title>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:description/fb:title-info/fb:author">
|
||||
<meta>
|
||||
<xsl:attribute name="name">author</xsl:attribute>
|
||||
<xsl:attribute name="content">
|
||||
<xsl:value-of select="fb:first-name"/><xsl:text> </xsl:text>
|
||||
<xsl:value-of select="fb:middle-name"/><xsl:text> </xsl:text>
|
||||
<xsl:value-of select="fb:last-name"/>
|
||||
</xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:body">
|
||||
<body>
|
||||
<xsl:apply-templates select="fb:section"/>
|
||||
</body>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="fb:body/fb:section">
|
||||
<xsl:for-each select="fb:p">
|
||||
<p><xsl:value-of select="."/></p>
|
||||
</xsl:for-each>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all)
|
||||
rclexecm.main(proto, extract)
|
||||
39
src/filters/rclgenxslt.py
Executable file
39
src/filters/rclgenxslt.py
Executable file
@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2018 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
######################################
|
||||
|
||||
# Base class for simple (one stylesheet) xslt-based handlers
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import rclxslt
|
||||
import gzip
|
||||
from rclbasehandler import RclBaseHandler
|
||||
|
||||
class XSLTExtractor(RclBaseHandler):
|
||||
def __init__(self, em, stylesheet, gzip=False):
|
||||
super(XSLTExtractor, self).__init__(em)
|
||||
self.stylesheet = stylesheet
|
||||
self.dogz = gzip
|
||||
|
||||
def html_text(self, fn):
|
||||
if self.dogz:
|
||||
data = gzip.open(fn, 'rb').read()
|
||||
else:
|
||||
data = open(fn, 'rb').read()
|
||||
return rclxslt.apply_sheet_data(self.stylesheet, data)
|
||||
@ -1,191 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Extract text from a gnumeric spreadsheet
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclgnumeric"
|
||||
filetype=gnumeric
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller
|
||||
senderror()
|
||||
{
|
||||
echo RECFILTERROR $*
|
||||
# Also alert on stderr just in case
|
||||
echo ":2:$progname::: $*" 1>&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
iscmd()
|
||||
{
|
||||
cmd=$1
|
||||
case $cmd in
|
||||
*/*)
|
||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
||||
*)
|
||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
||||
return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
checkcmds()
|
||||
{
|
||||
for cmd in $*;do
|
||||
if iscmd $cmd
|
||||
then
|
||||
a=1
|
||||
else
|
||||
senderror HELPERNOTFOUND $cmd
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds xsltproc gunzip
|
||||
|
||||
# We need a temporary file
|
||||
if test z"$RECOLL_TMPDIR" != z; then
|
||||
ttdir=$RECOLL_TMPDIR
|
||||
elif test z"$TMPDIR" != z ; then
|
||||
ttdir=$TMPDIR
|
||||
else
|
||||
ttdir=/tmp
|
||||
fi
|
||||
tmpfile=$ttdir/rclgnm.XXXXXX
|
||||
|
||||
tmpfile=`mktemp "$tmpfile"`
|
||||
if [ $? -ne 0 ]; then
|
||||
senderror "$0: Can't create temp file, exiting..."
|
||||
fi
|
||||
|
||||
cleanup()
|
||||
{
|
||||
rm -f $tmpfile
|
||||
}
|
||||
|
||||
trap cleanup EXIT HUP QUIT INT TERM
|
||||
|
||||
gunzip < $1 > $tmpfile || senderror "Cant uncompress input"
|
||||
xsltproc --novalid --nonet - $tmpfile <<EOF
|
||||
<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
||||
xmlns:ooo="http://openoffice.org/2004/office"
|
||||
xmlns:gnm="http://www.gnumeric.org/v10.dtd"
|
||||
|
||||
exclude-result-prefixes="office xlink meta ooo dc"
|
||||
>
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:template match="/">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
||||
<xsl:apply-templates select="//office:document-meta/office:meta"/>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<xsl:apply-templates select="//gnm:Cells"/>
|
||||
<xsl:apply-templates select="//gnm:Objects"/>
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//dc:date">
|
||||
<meta>
|
||||
<xsl:attribute name="name">date</xsl:attribute>
|
||||
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//dc:description">
|
||||
<meta>
|
||||
<xsl:attribute name="name">abstract</xsl:attribute>
|
||||
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//meta:keyword">
|
||||
<meta>
|
||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
||||
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//dc:subject">
|
||||
<meta>
|
||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
||||
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//dc:title">
|
||||
<title> <xsl:value-of select="."/> </title>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//meta:initial-creator">
|
||||
<meta>
|
||||
<xsl:attribute name="name">author</xsl:attribute>
|
||||
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="office:meta/*"/>
|
||||
|
||||
<xsl:template match="gnm:Cell">
|
||||
<p><xsl:value-of select="."/></p>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="gnm:CellComment">
|
||||
<blockquote><xsl:value-of select="@Text"/></blockquote>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
EOF
|
||||
|
||||
112
src/filters/rclgnm.py
Executable file
112
src/filters/rclgnm.py
Executable file
@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
######################################
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import rclexecm
|
||||
import rclgenxslt
|
||||
|
||||
|
||||
stylesheet_all = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
|
||||
xmlns:ooo="http://openoffice.org/2004/office"
|
||||
xmlns:gnm="http://www.gnumeric.org/v10.dtd"
|
||||
|
||||
exclude-result-prefixes="office xlink meta ooo dc"
|
||||
>
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:template match="/">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
||||
<xsl:apply-templates select="//office:document-meta/office:meta"/>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<xsl:apply-templates select="//gnm:Cells"/>
|
||||
<xsl:apply-templates select="//gnm:Objects"/>
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//dc:date">
|
||||
<meta>
|
||||
<xsl:attribute name="name">date</xsl:attribute>
|
||||
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//dc:description">
|
||||
<meta>
|
||||
<xsl:attribute name="name">abstract</xsl:attribute>
|
||||
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//meta:keyword">
|
||||
<meta>
|
||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
||||
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//dc:subject">
|
||||
<meta>
|
||||
<xsl:attribute name="name">keywords</xsl:attribute>
|
||||
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//dc:title">
|
||||
<title> <xsl:value-of select="."/> </title>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="//meta:initial-creator">
|
||||
<meta>
|
||||
<xsl:attribute name="name">author</xsl:attribute>
|
||||
<xsl:attribute name="content"><xsl:value-of select="."/></xsl:attribute>
|
||||
</meta>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="office:meta/*"/>
|
||||
|
||||
<xsl:template match="gnm:Cell">
|
||||
<p><xsl:value-of select="."/></p>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="gnm:CellComment">
|
||||
<blockquote><xsl:value-of select="@Text"/></blockquote>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all, gzip=True)
|
||||
rclexecm.main(proto, extract)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import print_function
|
||||
|
||||
# Read an ICS file, break it into "documents" which are events, todos,
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env perl
|
||||
#!/usr/bin/perl
|
||||
# @(#$Id: rclimg,v 1.5 2008-10-09 06:41:21 dockes Exp $ (C) 2007 Cedric Scott
|
||||
#######################################################
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Python-based Image Tag extractor for Recoll. This is less thorough
|
||||
# than the Perl-based rclimg script, but useful if you don't want to
|
||||
@ -12,6 +12,7 @@ import sys
|
||||
import os
|
||||
import rclexecm
|
||||
import re
|
||||
from rclbasehandler import RclBaseHandler
|
||||
|
||||
try:
|
||||
import pyexiv2
|
||||
@ -41,31 +42,21 @@ meta_pyexiv2_keys = {
|
||||
exiv2_dates = ['Exif.Photo.DateTimeOriginal',
|
||||
'Exif.Image.DateTime', 'Exif.Photo.DateTimeDigitized']
|
||||
|
||||
class ImgTagExtractor:
|
||||
class ImgTagExtractor(RclBaseHandler):
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
self.currentindex = 0
|
||||
super(ImgTagExtractor, self).__init__(em)
|
||||
|
||||
def extractone(self, params):
|
||||
#self.em.rclog("extractone %s" % params["filename:"])
|
||||
def html_text(self, filename):
|
||||
ok = False
|
||||
if "filename:" not in params:
|
||||
self.em.rclog("extractone: no file name")
|
||||
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
|
||||
filename = params["filename:"]
|
||||
|
||||
try:
|
||||
metadata = pyexiv2.ImageMetadata(filename)
|
||||
metadata.read()
|
||||
keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
|
||||
mdic = {}
|
||||
for k in keys:
|
||||
# we skip numeric keys and undecoded makernote data
|
||||
if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
|
||||
mdic[k] = str(metadata[k].raw_value)
|
||||
except Exception as err:
|
||||
self.em.rclog("extractone: extract failed: [%s]" % err)
|
||||
return (ok, "", "", rclexecm.RclExecM.eofnow)
|
||||
metadata = pyexiv2.ImageMetadata(filename)
|
||||
metadata.read()
|
||||
keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
|
||||
mdic = {}
|
||||
for k in keys:
|
||||
# we skip numeric keys and undecoded makernote data
|
||||
if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
|
||||
mdic[k] = str(metadata[k].raw_value)
|
||||
|
||||
docdata = b'<html><head>\n'
|
||||
|
||||
@ -101,25 +92,8 @@ class ImgTagExtractor:
|
||||
self.em.htmlescape(mdic[k]) + "<br />\n")
|
||||
docdata += b'</body></html>'
|
||||
|
||||
self.em.setmimetype("text/html")
|
||||
return docdata
|
||||
|
||||
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Read a file in GNU info format and output its nodes as subdocs,
|
||||
# interfacing with recoll execm
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Read a .kar midi karaoke file and translate to recoll indexable format
|
||||
# This does not work with Python3 yet because python:midi doesn't
|
||||
@ -10,6 +10,7 @@ import os.path
|
||||
import string
|
||||
import re
|
||||
import codecs
|
||||
from rclbasehandler import RclBaseHandler
|
||||
|
||||
try:
|
||||
import rcllatinclass
|
||||
@ -51,7 +52,7 @@ if PY3:
|
||||
else:
|
||||
nullchar = chr(0)
|
||||
|
||||
class KarTextExtractor:
|
||||
class KarTextExtractor(RclBaseHandler):
|
||||
# Afaik, the only charset encodings with null bytes are variations on
|
||||
# utf-16 and utf-32 and iso relatives. A hopefully comprehensive
|
||||
# list follows, compiled from iconv and python values. This is used for
|
||||
@ -66,8 +67,7 @@ class KarTextExtractor:
|
||||
'utf_16_le', 'utf_32', 'utf_32_be', 'utf_32_le'))
|
||||
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
self.currentindex = 0
|
||||
super(KarTextExtractor, self).__init__(em)
|
||||
self.encoding = ""
|
||||
self.defaultencoding = ""
|
||||
self.hadnulls = False
|
||||
@ -182,16 +182,7 @@ class KarTextExtractor:
|
||||
return (encoding, confidence)
|
||||
|
||||
|
||||
def extractone(self, params):
|
||||
'''Process one file'''
|
||||
docdata = ""
|
||||
ok = False
|
||||
|
||||
if "filename:" not in params:
|
||||
self.em.rclog("extractone: no mime or file name")
|
||||
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
|
||||
filename = params["filename:"]
|
||||
|
||||
def html_text(self, filename):
|
||||
# Character encoding from file name ?
|
||||
self.encoding = self.encodingfromfilename(filename)
|
||||
if self.encoding:
|
||||
@ -200,18 +191,8 @@ class KarTextExtractor:
|
||||
except:
|
||||
self.encoding = ""
|
||||
|
||||
# Mimetype not used for now
|
||||
if "mimetype:" not in params:
|
||||
mimetype = 'audio/x-midi'
|
||||
else:
|
||||
mimetype = params["mimetype:"]
|
||||
|
||||
# Read in and midi-decode the file
|
||||
try:
|
||||
stream = midi.read_midifile(filename)
|
||||
except Exception as err:
|
||||
self.em.rclog("extractone: read_midifile failed: [%s]" % err)
|
||||
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
|
||||
stream = midi.read_midifile(filename)
|
||||
|
||||
title = None
|
||||
author = None
|
||||
@ -262,7 +243,6 @@ class KarTextExtractor:
|
||||
lyrics += self.nulltrunc(edata)
|
||||
lyricsN += edata
|
||||
|
||||
|
||||
# Try to guess the encoding. First do it with the data
|
||||
# possibly containing nulls. If we get one of the accepted
|
||||
# nullbyte encodings, go with this, else repeat with the
|
||||
@ -305,28 +285,8 @@ class KarTextExtractor:
|
||||
lyrics = self.reencode(lyrics)
|
||||
language = self.reencode(language)
|
||||
|
||||
self.em.setmimetype("text/html")
|
||||
docdata = htmltemplate % (title, author, language, lyrics)
|
||||
return htmltemplate % (title, author, language, lyrics)
|
||||
|
||||
ok = True
|
||||
return (ok, docdata, "", rclexecm.RclExecM.eofnext)
|
||||
|
||||
###### File type handler api, used by rclexecm. Some stuff makes little
|
||||
# sense because we only have one doc per file.
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = KarTextExtractor(proto)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
"""Try to guess a text's language and character set by checking how it matches lists of
|
||||
common words. This is not a primary method of detection because it's slow and unreliable, but it
|
||||
may be a help in discrimating, for exemple, before european languages using relatively close
|
||||
|
||||
@ -1,130 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclsoff,v 1.12 2008-10-08 08:27:34 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Extract text from a gnumeric spreadsheet
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclgnumeric"
|
||||
filetype=gnumeric
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller
|
||||
senderror()
|
||||
{
|
||||
echo RECFILTERROR $*
|
||||
# Also alert on stderr just in case
|
||||
echo ":2:$progname::: $*" 1>&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
iscmd()
|
||||
{
|
||||
cmd=$1
|
||||
case $cmd in
|
||||
*/*)
|
||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
||||
*)
|
||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
||||
return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
checkcmds()
|
||||
{
|
||||
for cmd in $*;do
|
||||
if iscmd $cmd
|
||||
then
|
||||
a=1
|
||||
else
|
||||
senderror HELPERNOTFOUND $cmd
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds xsltproc
|
||||
|
||||
xsltproc --novalid --nonet - "$infile" <<EOF
|
||||
<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
<xsl:strip-space elements="*" />
|
||||
|
||||
|
||||
<xsl:template match="/">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<title>
|
||||
Okular notes about: <xsl:value-of select="/documentInfo/@url" />
|
||||
</title>
|
||||
</head>
|
||||
<body>
|
||||
<xsl:apply-templates />
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="node()">
|
||||
<xsl:apply-templates select="@* | node() "/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text()">
|
||||
<p><xsl:value-of select="."/></p>
|
||||
<xsl:text >
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="@contents|@author">
|
||||
<p><xsl:value-of select="." /></p>
|
||||
<xsl:text >
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="@*"/>
|
||||
|
||||
</xsl:stylesheet>
|
||||
EOF
|
||||
|
||||
70
src/filters/rclokulnote.py
Executable file
70
src/filters/rclokulnote.py
Executable file
@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
######################################
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import rclexecm
|
||||
import rclgenxslt
|
||||
|
||||
stylesheet_all = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
||||
|
||||
<xsl:output method="html" encoding="UTF-8"/>
|
||||
<xsl:strip-space elements="*" />
|
||||
|
||||
<xsl:template match="/">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<title>
|
||||
Okular notes about: <xsl:value-of select="/documentInfo/@url" />
|
||||
</title>
|
||||
</head>
|
||||
<body>
|
||||
<xsl:apply-templates />
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="node()">
|
||||
<xsl:apply-templates select="@* | node() "/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text()">
|
||||
<p><xsl:value-of select="."/></p>
|
||||
<xsl:text >
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="@contents|@author">
|
||||
<p><xsl:value-of select="." /></p>
|
||||
<xsl:text >
|
||||
</xsl:text>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="@*"/>
|
||||
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all)
|
||||
rclexecm.main(proto, extract)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2015 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
@ -91,13 +91,12 @@ class PDFExtractor:
|
||||
# error at once
|
||||
return
|
||||
|
||||
cf = rclconfig.RclConfig()
|
||||
self.confdir = cf.getConfDir()
|
||||
|
||||
self.config = rclconfig.RclConfig()
|
||||
self.confdir = self.config.getConfDir()
|
||||
# The user can set a list of meta tags to be extracted from
|
||||
# the XMP metadata packet. These are specified as
|
||||
# (xmltag,rcltag) pairs
|
||||
self.extrameta = cf.getConfParam("pdfextrameta")
|
||||
self.extrameta = self.config.getConfParam("pdfextrameta")
|
||||
if self.extrameta:
|
||||
self._initextrameta()
|
||||
|
||||
@ -119,7 +118,7 @@ class PDFExtractor:
|
||||
# either the presence of a file in the config dir (historical)
|
||||
# or a set config variable.
|
||||
self.ocrpossible = False
|
||||
cf_doocr = cf.getConfParam("pdfocr")
|
||||
cf_doocr = self.config.getConfParam("pdfocr")
|
||||
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
||||
self.tesseract = rclexecm.which("tesseract")
|
||||
if self.tesseract:
|
||||
@ -134,7 +133,7 @@ class PDFExtractor:
|
||||
# so it can be disabled in the configuration.
|
||||
self.attextractdone = False
|
||||
self.attachlist = []
|
||||
cf_attach = cf.getConfParam("pdfattach")
|
||||
cf_attach = self.config.getConfParam("pdfattach")
|
||||
if cf_attach:
|
||||
self.pdftk = rclexecm.which("pdftk")
|
||||
if self.pdftk:
|
||||
@ -224,18 +223,28 @@ class PDFExtractor:
|
||||
# environment and hope for the best.
|
||||
def guesstesseractlang(self):
|
||||
tesseractlang = ""
|
||||
pdflangfile = os.path.join(os.path.dirname(self.filename), ".ocrpdflang")
|
||||
|
||||
# First look for a language def file in the file's directory
|
||||
pdflangfile = os.path.join(os.path.dirname(self.filename),
|
||||
b".ocrpdflang")
|
||||
if os.path.isfile(pdflangfile):
|
||||
tesseractlang = open(pdflangfile, "r").read().strip()
|
||||
if tesseractlang:
|
||||
return tesseractlang
|
||||
|
||||
# Then look for a global option. The normal way now that we
|
||||
# have config reading capability in the handlers is to use the
|
||||
# config. Then, for backwards compat, environment variable and
|
||||
# file inside the configuration directory
|
||||
tesseractlang = self.config.getConfParam("pdfocrlang")
|
||||
if tesseractlang:
|
||||
return tesseractlang
|
||||
tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
|
||||
if tesseractlang:
|
||||
return tesseractlang
|
||||
|
||||
tesseractlang = \
|
||||
open(os.path.join(self.confdir, "ocrpdf"), "r").read().strip()
|
||||
pdflangfile = os.path.join(self.confdir, b"ocrpdf")
|
||||
if os.path.isfile(pdflangfile):
|
||||
tesseractlang = open(pdflangfile, "r").read().strip()
|
||||
if tesseractlang:
|
||||
return tesseractlang
|
||||
|
||||
@ -285,7 +294,7 @@ class PDFExtractor:
|
||||
except Exception as e:
|
||||
self.em.rclog("tesseract failed: %s" % e)
|
||||
|
||||
errlines = out.split('\n')
|
||||
errlines = out.split(b'\n')
|
||||
if len(errlines) > 2:
|
||||
self.em.rclog("Tesseract error: %s" % out)
|
||||
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Recoll PPT text extractor
|
||||
# Mso-dumper is not compatible with Python3. We use sys.executable to
|
||||
# start the actual extractor, so we need to use python2 too.
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
|
||||
@ -1,135 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclps,v 1.10 2007-06-08 13:51:09 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Extract text from a postscript file by executing pstotext or ps2ascii.
|
||||
#
|
||||
# The default is to use pstotext which can deal with accents, but in a
|
||||
# partially broken way (it always outputs iso8859-1, when it should use UTF-8).
|
||||
#
|
||||
# OTOH, ps2ascii is much faster, comes with ghostscript, and sometimes works
|
||||
# better (ie: on some openoffice output files).
|
||||
#
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclps"
|
||||
decoder=pstotext
|
||||
#decoder=ps2ascii
|
||||
filetype=postscript
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller
|
||||
senderror()
|
||||
{
|
||||
echo RECFILTERROR $*
|
||||
# Also alert on stderr just in case
|
||||
echo ":2:$progname::: $*" 1>&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
iscmd()
|
||||
{
|
||||
cmd=$1
|
||||
case $cmd in
|
||||
*/*)
|
||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
||||
*)
|
||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
||||
return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
checkcmds()
|
||||
{
|
||||
for cmd in $*;do
|
||||
if iscmd $cmd
|
||||
then
|
||||
a=1
|
||||
else
|
||||
senderror HELPERNOTFOUND $cmd
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds $decoder iconv awk
|
||||
|
||||
# output the result
|
||||
# The strange 'BEGIN' setup is to prevent 'file' from thinking this file
|
||||
# is an awk program
|
||||
$decoder "$infile" |
|
||||
awk 'BEGIN'\
|
||||
' {
|
||||
printf("<html><head><title></title>\n")
|
||||
printf("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\">\n")
|
||||
printf("</head>\n<body><p>");
|
||||
doescape = 1
|
||||
cont = ""
|
||||
}
|
||||
{
|
||||
$0 = cont $0
|
||||
cont = ""
|
||||
|
||||
if ($0 == "\f") {
|
||||
print "</p>\n<hr>\n\f<p>"
|
||||
next
|
||||
} else if ($0 ~ /$/) {
|
||||
# Note : soft-hyphen is iso8859 0xad
|
||||
# Break at last whitespace
|
||||
match($0, "[ \t][^ \t]+$")
|
||||
line = substr($0, 0, RSTART)
|
||||
cont = substr($0, RSTART, RLENGTH)
|
||||
$0 = line
|
||||
gsub("", "", cont)
|
||||
}
|
||||
|
||||
if(doescape > 0) {
|
||||
gsub(/&/, "\\&", $0)
|
||||
gsub(/</, "\\<", $0)
|
||||
gsub(/>/, "\\>", $0)
|
||||
}
|
||||
print $0 "<br>"
|
||||
}
|
||||
END {
|
||||
print "</p></body></html>"
|
||||
}' | iconv -f iso-8859-1 -t UTF-8 -c -s
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: iso-8859-1 -*-
|
||||
"""
|
||||
MoinMoin - Python source parser and colorizer
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Rar file filter for Recoll
|
||||
# Adapted from the Zip archive filter by mroark.
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import print_function
|
||||
|
||||
import rclexecm
|
||||
|
||||
@ -1,92 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclsiduxman,v 1.1 2008-06-09 09:12:05 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Parts taken from Estraier:
|
||||
#================================================================
|
||||
# Estraier: a personal full-text search system
|
||||
# Copyright (C) 2003-2004 Mikio Hirabayashi
|
||||
#================================================================
|
||||
#================================================================
|
||||
# Strip the menu part from sidux manual pages to improve search precision
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclsiduxman"
|
||||
filetype="sidux manual htm"
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller
|
||||
senderror()
|
||||
{
|
||||
echo RECFILTERROR $*
|
||||
# Also alert on stderr just in case
|
||||
echo ":2:$progname::: $*" 1>&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
iscmd()
|
||||
{
|
||||
cmd=$1
|
||||
case $cmd in
|
||||
*/*)
|
||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
||||
*)
|
||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
||||
return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
checkcmds()
|
||||
{
|
||||
for cmd in $*;do
|
||||
if iscmd $cmd
|
||||
then
|
||||
a=1
|
||||
else
|
||||
senderror HELPERNOTFOUND $cmd
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds sed
|
||||
# Delete everything from <div id="menu"> to <div id="main-page">
|
||||
# This prints an additional blank line at top which does not matter
|
||||
sed -n -e '1,/<div id="menu">/{x;p' -e '}' \
|
||||
-e '/<div id="main-page">/,$p' < "$infile"
|
||||
|
||||
# exit normally
|
||||
exit 0
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
@ -22,6 +22,7 @@ import sys
|
||||
import rclexecm
|
||||
import rclxslt
|
||||
from zipfile import ZipFile
|
||||
from rclbasehandler import RclBaseHandler
|
||||
|
||||
stylesheet_meta = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
@ -139,24 +140,14 @@ stylesheet_content = '''<?xml version="1.0"?>
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
class OOExtractor:
|
||||
class OOExtractor(RclBaseHandler):
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
self.currentindex = 0
|
||||
super(OOExtractor, self).__init__(em)
|
||||
|
||||
def extractone(self, params):
|
||||
if "filename:" not in params:
|
||||
self.em.rclog("extractone: no mime or file name")
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
fn = params["filename:"]
|
||||
|
||||
try:
|
||||
f = open(fn, 'rb')
|
||||
data = f.read()
|
||||
f.close()
|
||||
except Exception as err:
|
||||
self.em.rclog("open failed: %s" % err)
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
def html_text(self, fn):
|
||||
f = open(fn, 'rb')
|
||||
data = f.read()
|
||||
f.close()
|
||||
|
||||
docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type"' \
|
||||
b'content="text/html; charset=UTF-8">\n'
|
||||
@ -172,31 +163,12 @@ class OOExtractor:
|
||||
|
||||
docdata += b'</head><body>'
|
||||
|
||||
try:
|
||||
res = rclxslt.apply_sheet_data(stylesheet_content, data)
|
||||
docdata += res
|
||||
docdata += b'</body></html>'
|
||||
except Exception as err:
|
||||
self.em.rclog("bad data in %s: %s" % (fn, err))
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
res = rclxslt.apply_sheet_data(stylesheet_content, data)
|
||||
docdata += res
|
||||
docdata += b'</body></html>'
|
||||
|
||||
return (True, docdata, "", rclexecm.RclExecM.eofnext)
|
||||
return docdata
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
@ -15,12 +15,11 @@
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
######################################
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import rclexecm
|
||||
import rclxslt
|
||||
import rclgenxslt
|
||||
|
||||
stylesheet_all = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
@ -100,43 +99,7 @@ stylesheet_all = '''<?xml version="1.0"?>
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
class SVGExtractor:
    """Recoll execm handler which converts one SVG file through an XSLT sheet.

    The handler exposes the openfile/getipath/getnext entry points used by
    rclexecm and produces at most one document per input file.
    """

    def __init__(self, em):
        # em: the rclexecm protocol object; only its rclog() method is used
        # here, for error reporting.
        self.em = em
        # Number of documents already returned for the current file (0 or 1).
        self.currentindex = 0

    def extractone(self, params):
        """Transform the file named by params["filename:"] with stylesheet_all.

        Returns a 4-tuple (ok, data, "", eofmarker). On failure ok is False,
        data is empty and the marker is RclExecM.eofnow; on success data is
        the transformation output and the marker is RclExecM.eofnext.
        NOTE(review): the third field is presumably the ipath -- confirm
        against rclexecm.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            # Close the input deterministically instead of relying on
            # garbage collection of the file object.
            with open(fn, 'rb') as f:
                data = f.read()
            docdata = rclxslt.apply_sheet_data(stylesheet_all, data)
        except Exception as err:
            # The format needs one placeholder per argument: the previous
            # "%s: bad data: " raised TypeError inside this handler, masking
            # the real error and escaping the except clause.
            self.em.rclog("%s: bad data: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        return (True, docdata, "", rclexecm.RclExecM.eofnext)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Reset the per-file document counter; always reports success."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Return the single document of the file (params are passed through)."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the document on the first call, then an end-of-file tuple."""
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
            ret = self.extractone(params)
            self.currentindex += 1
            return ret
|
||||
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = SVGExtractor(proto)
|
||||
extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all)
|
||||
rclexecm.main(proto, extract)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Tar-file filter for Recoll
|
||||
# Thanks to Recoll user Martin Ziegler
|
||||
|
||||
@ -1,4 +1,19 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2016 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
# Wrapping a text file. Recoll does it internally in most cases, but
|
||||
# this is for use by another filter.
|
||||
@ -7,46 +22,19 @@ from __future__ import print_function
|
||||
|
||||
import rclexecm
|
||||
import sys
|
||||
from rclbasehandler import RclBaseHandler
|
||||
|
||||
class TxtDump:
|
||||
class TxtDump(RclBaseHandler):
|
||||
def __init__(self, em):
|
||||
self.em = em
|
||||
super(TxtDump, self).__init__(em)
|
||||
|
||||
def extractone(self, params):
|
||||
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
||||
#params["mimetype:"]))
|
||||
if not "filename:" in params:
|
||||
self.em.rclog("extractone: no file name")
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
fn = params["filename:"]
|
||||
def html_text(self, fn):
|
||||
# No charset, so recoll will have to use its config to guess it
|
||||
txt = b'<html><head><title></title></head><body><pre>'
|
||||
try:
|
||||
f = open(fn, "rb")
|
||||
txt += self.em.htmlescape(f.read())
|
||||
except Exception as err:
|
||||
self.em.rclog("TxtDump: %s : %s" % (fn, err))
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
|
||||
txt += b'</pre></body></html>'
|
||||
return (True, txt, "", rclexecm.RclExecM.eofnext)
|
||||
|
||||
###### File type handler api, used by rclexecm ---------->
|
||||
def openfile(self, params):
|
||||
self.currentindex = 0
|
||||
return True
|
||||
|
||||
def getipath(self, params):
|
||||
return self.extractone(params)
|
||||
|
||||
def getnext(self, params):
|
||||
if self.currentindex >= 1:
|
||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||
else:
|
||||
ret= self.extractone(params)
|
||||
self.currentindex += 1
|
||||
return ret
|
||||
html = b'<html><head><title></title></head><body><pre>'
|
||||
f = open(fn, "rb")
|
||||
html += self.em.htmlescape(f.read())
|
||||
html += b'</pre></body></html>'
|
||||
return html
|
||||
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
"""Index text lines as document (execm handler sample). This exists
|
||||
to demonstrate the execm interface and is not meant to be useful or
|
||||
efficient"""
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python
|
||||
from __future__ import print_function
|
||||
|
||||
import rclexecm
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# WAR web archive filter for recoll. WAR files are gzipped tar files
|
||||
|
||||
|
||||
@ -1,87 +0,0 @@
|
||||
#!/bin/sh
|
||||
# @(#$Id: rclwpd,v 1.1 2007-08-26 13:34:59 dockes Exp $ (C) 2004 J.F.Dockes
|
||||
# Some inspiration from estraier
|
||||
#================================================================
|
||||
# convert wordperfect documents to html, by executing the wpd2html program:
|
||||
# http://libwpd.sourceforge.net/download.html
|
||||
#================================================================
|
||||
|
||||
# set variables
|
||||
LANG=C ; export LANG
|
||||
LC_ALL=C ; export LC_ALL
|
||||
progname="rclwpd"
|
||||
filetype=wpd
|
||||
|
||||
|
||||
|
||||
#RECFILTCOMMONCODE
|
||||
##############################################################################
|
||||
# !! Leave the previous line unmodified!! Code imported from the
|
||||
# recfiltcommon file
|
||||
|
||||
# Utility code common to all shell filters. This could be sourced at run
|
||||
# time, but it's slightly more efficient to include the code in the
|
||||
# filters at build time (with a sed script).
|
||||
|
||||
# Describe error in a way that can be interpreted by our caller
|
||||
senderror()
|
||||
{
|
||||
echo RECFILTERROR $*
|
||||
# Also alert on stderr just in case
|
||||
echo ":2:$progname::: $*" 1>&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
iscmd()
|
||||
{
|
||||
cmd=$1
|
||||
case $cmd in
|
||||
*/*)
|
||||
if test -x $cmd -a ! -d $cmd ; then return 0; else return 1; fi ;;
|
||||
*)
|
||||
oldifs=$IFS; IFS=":"; set -- $PATH; IFS=$oldifs
|
||||
for d in $*;do test -x $d/$cmd -a ! -d $d/$cmd && return 0;done
|
||||
return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
checkcmds()
|
||||
{
|
||||
for cmd in $*;do
|
||||
if iscmd $cmd
|
||||
then
|
||||
a=1
|
||||
else
|
||||
senderror HELPERNOTFOUND $cmd
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# show help message
|
||||
if test $# -ne 1 -o "$1" = "--help"
|
||||
then
|
||||
echo "Convert a $filetype file to HTML text for Recoll indexing."
|
||||
echo "Usage: $progname [infile]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
infile="$1"
|
||||
|
||||
# check the input file existence (may be '-' for stdin)
|
||||
if test "X$infile" != X- -a ! -f "$infile"
|
||||
then
|
||||
senderror INPUTNOSUCHFILE "$infile"
|
||||
fi
|
||||
|
||||
# protect access to our temp files and directories
|
||||
umask 77
|
||||
|
||||
##############################################################################
|
||||
# !! Leave the following line unmodified !
|
||||
#ENDRECFILTCOMMONCODE
|
||||
|
||||
checkcmds wpd2html
|
||||
|
||||
# output the result. wpd2html output doesn't seem to need any adjustment?
|
||||
|
||||
wpd2html "$infile" 2> /dev/null
|
||||
@ -1,8 +1,6 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Extractor for Excel files.
|
||||
# Mso-dumper is not compatible with Python3. We use sys.executable to
|
||||
# start the actual extractor, so we need to use python2 too.
|
||||
|
||||
import rclexecm
|
||||
import rclexec1
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
@ -18,7 +18,7 @@
|
||||
|
||||
import sys
|
||||
import rclexecm
|
||||
import rclxslt
|
||||
import rclgenxslt
|
||||
|
||||
stylesheet_all = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
@ -56,43 +56,7 @@ stylesheet_all = '''<?xml version="1.0"?>
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
class XMLExtractor:
    """Recoll execm handler which converts one XML file through an XSLT sheet.

    The handler exposes the openfile/getipath/getnext entry points used by
    rclexecm and produces at most one document per input file.
    """

    def __init__(self, em):
        # em: the rclexecm protocol object; only its rclog() method is used
        # here, for error reporting.
        self.em = em
        # Number of documents already returned for the current file (0 or 1).
        self.currentindex = 0

    def extractone(self, params):
        """Transform the file named by params["filename:"] with stylesheet_all.

        Returns a 4-tuple (ok, data, "", eofmarker). On failure ok is False,
        data is empty and the marker is RclExecM.eofnow; on success data is
        the transformation output and the marker is RclExecM.eofnext.
        NOTE(review): the third field is presumably the ipath -- confirm
        against rclexecm.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            # Close the input deterministically instead of relying on
            # garbage collection of the file object.
            with open(fn, 'rb') as f:
                data = f.read()
            docdata = rclxslt.apply_sheet_data(stylesheet_all, data)
        except Exception as err:
            # The format needs one placeholder per argument: the previous
            # "%s: bad data: " raised TypeError inside this handler, masking
            # the real error and escaping the except clause.
            self.em.rclog("%s: bad data: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        return (True, docdata, "", rclexecm.RclExecM.eofnext)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Reset the per-file document counter; always reports success."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Return the single document of the file (params are passed through)."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the document on the first call, then an end-of-file tuple."""
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
            ret = self.extractone(params)
            self.currentindex += 1
            return ret
|
||||
|
||||
if __name__ == '__main__':
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = XMLExtractor(proto)
|
||||
extract = rclgenxslt.XSLTExtractor(proto, stylesheet_all)
|
||||
rclexecm.main(proto, extract)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2016 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
||||
@ -31,40 +31,42 @@ if PY2:
|
||||
except:
|
||||
print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1")
|
||||
sys.exit(1);
|
||||
def _apply_sheet_doc(sheet, doc):
|
||||
styledoc = libxml2.readMemory(sheet, len(sheet), '', '',
|
||||
options=libxml2.XML_PARSE_NONET)
|
||||
style = libxslt.parseStylesheetDoc(styledoc)
|
||||
result = style.applyStylesheet(doc, None)
|
||||
res = ""
|
||||
try:
|
||||
res = style.saveResultToString(result)
|
||||
except Exception as err:
|
||||
# print("saveResultToString got exception: %s"%err)
|
||||
pass
|
||||
style.freeStylesheet()
|
||||
doc.freeDoc()
|
||||
result.freeDoc()
|
||||
return res
|
||||
def apply_sheet_data(sheet, data):
|
||||
styledoc = libxml2.parseMemory(sheet, len(sheet))
|
||||
style = libxslt.parseStylesheetDoc(styledoc)
|
||||
doc = libxml2.parseMemory(data, len(data))
|
||||
result = style.applyStylesheet(doc, None)
|
||||
res = style.saveResultToString(result)
|
||||
style.freeStylesheet()
|
||||
doc.freeDoc()
|
||||
result.freeDoc()
|
||||
return res
|
||||
doc = libxml2.readMemory(data, len(data), '', '',
|
||||
options=libxml2.XML_PARSE_NONET)
|
||||
return _apply_sheet_doc(sheet, doc)
|
||||
def apply_sheet_file(sheet, fn):
|
||||
styledoc = libxml2.parseMemory(sheet, len(sheet))
|
||||
style = libxslt.parseStylesheetDoc(styledoc)
|
||||
doc = libxml2.parseFile(fn)
|
||||
result = style.applyStylesheet(doc, None)
|
||||
res = style.saveResultToString(result)
|
||||
style.freeStylesheet()
|
||||
doc.freeDoc()
|
||||
result.freeDoc()
|
||||
return res
|
||||
doc = libxml2.readFile(fn, '', options=libxml2.XML_PARSE_NONET)
|
||||
return _apply_sheet_doc(sheet, doc)
|
||||
else:
|
||||
try:
|
||||
from lxml import etree
|
||||
except:
|
||||
print("RECFILTERROR HELPERNOTFOUND python3:lxml")
|
||||
sys.exit(1);
|
||||
def _apply_sheet_doc(sheet, doc):
|
||||
styledoc = etree.fromstring(sheet)
|
||||
transform = etree.XSLT(styledoc)
|
||||
return bytes(transform(doc))
|
||||
def apply_sheet_data(sheet, data):
|
||||
styledoc = etree.fromstring(sheet)
|
||||
transform = etree.XSLT(styledoc)
|
||||
doc = etree.fromstring(data)
|
||||
return bytes(transform(doc))
|
||||
return _apply_sheet_doc(sheet, doc)
|
||||
def apply_sheet_file(sheet, fn):
|
||||
styledoc = etree.fromstring(sheet)
|
||||
transform = etree.XSLT(styledoc)
|
||||
doc = etree.parse(fn)
|
||||
return bytes(transform(doc))
|
||||
return _apply_sheet_doc(sheet, doc)
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
@ -28,6 +28,7 @@ from zipfile import ZipFile
|
||||
|
||||
try:
|
||||
from recoll import rclconfig
|
||||
from recoll import conftree
|
||||
hasrclconfig = True
|
||||
except:
|
||||
hasrclconfig = False
|
||||
@ -118,10 +119,14 @@ class ZipExtractor:
|
||||
if hasrclconfig:
|
||||
config = rclconfig.RclConfig()
|
||||
config.setKeyDir(os.path.dirname(filename))
|
||||
usebaseskipped = config.getConfParam("zipUseSkippedNames")
|
||||
if usebaseskipped:
|
||||
skipped = config.getConfParam("skippedNames")
|
||||
self.em.rclog("skippedNames: %s"%self.skiplist)
|
||||
self.skiplist += conftree.stringToStrings(skipped)
|
||||
skipped = config.getConfParam("zipSkippedNames")
|
||||
if skipped is not None:
|
||||
self.skiplist = skipped.split(" ")
|
||||
|
||||
self.skiplist += conftree.stringToStrings(skipped)
|
||||
try:
|
||||
if rclexecm.PY3:
|
||||
# Note: py3 ZipFile wants an str file name, which
|
||||
|
||||
BIN
src/filters/recollepub.zip
Normal file
BIN
src/filters/recollepub.zip
Normal file
Binary file not shown.
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2015 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
||||
@ -16,13 +16,11 @@
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
|
||||
#include "log.h"
|
||||
#include "rclconfig.h"
|
||||
|
||||
#include "fetcher.h"
|
||||
#include "fsfetcher.h"
|
||||
#include "bglfetcher.h"
|
||||
#include "webqueuefetcher.h"
|
||||
#include "exefetcher.h"
|
||||
|
||||
DocFetcher *docFetcherMake(RclConfig *config, const Rcl::Doc& idoc)
|
||||
@ -37,7 +35,7 @@ DocFetcher *docFetcherMake(RclConfig *config, const Rcl::Doc& idoc)
|
||||
return new FSDocFetcher;
|
||||
#ifndef DISABLE_WEB_INDEXER
|
||||
} else if (!backend.compare("BGL")) {
|
||||
return new BGLDocFetcher;
|
||||
return new WQDocFetcher;
|
||||
#endif
|
||||
} else {
|
||||
DocFetcher *f = exeDocFetcherMake(config, backend);
|
||||
|
||||
@ -27,7 +27,7 @@
|
||||
#include "indexer.h"
|
||||
#include "fsindexer.h"
|
||||
#ifndef DISABLE_WEB_INDEXER
|
||||
#include "beaglequeue.h"
|
||||
#include "webqueue.h"
|
||||
#endif
|
||||
#include "mimehandler.h"
|
||||
#include "pathut.h"
|
||||
@ -132,7 +132,7 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun, int flags)
|
||||
if (m_doweb && (typestorun & IxTWebQueue)) {
|
||||
runWebFilesMoverScript(m_config);
|
||||
deleteZ(m_webindexer);
|
||||
m_webindexer = new BeagleQueueIndexer(m_config, &m_db, m_updater);
|
||||
m_webindexer = new WebQueueIndexer(m_config, &m_db, m_updater);
|
||||
if (!m_webindexer || !m_webindexer->index()) {
|
||||
m_db.close();
|
||||
addIdxReason("indexer", "Web index creation failed. See log");
|
||||
@ -208,7 +208,7 @@ bool ConfIndexer::indexFiles(list<string>& ifiles, int flag)
|
||||
|
||||
if (m_doweb && !myfiles.empty() && !(flag & IxFNoWeb)) {
|
||||
if (!m_webindexer)
|
||||
m_webindexer = new BeagleQueueIndexer(m_config, &m_db, m_updater);
|
||||
m_webindexer = new WebQueueIndexer(m_config, &m_db, m_updater);
|
||||
if (m_webindexer) {
|
||||
ret = ret && m_webindexer->indexFiles(myfiles);
|
||||
} else {
|
||||
@ -267,7 +267,7 @@ bool ConfIndexer::purgeFiles(list<string> &files, int flag)
|
||||
#ifndef DISABLE_WEB_INDEXER
|
||||
if (m_doweb && !myfiles.empty() && !(flag & IxFNoWeb)) {
|
||||
if (!m_webindexer)
|
||||
m_webindexer = new BeagleQueueIndexer(m_config, &m_db, m_updater);
|
||||
m_webindexer = new WebQueueIndexer(m_config, &m_db, m_updater);
|
||||
if (m_webindexer) {
|
||||
ret = ret && m_webindexer->purgeFiles(myfiles);
|
||||
} else {
|
||||
|
||||
@ -29,7 +29,7 @@
|
||||
#include "idxstatus.h"
|
||||
|
||||
class FsIndexer;
|
||||
class BeagleQueueIndexer;
|
||||
class WebQueueIndexer;
|
||||
|
||||
/** Callback to say what we're doing. If the update func returns false, we
|
||||
* stop as soon as possible without corrupting state */
|
||||
@ -118,7 +118,7 @@ class ConfIndexer {
|
||||
Rcl::Db m_db;
|
||||
FsIndexer *m_fsindexer;
|
||||
bool m_doweb;
|
||||
BeagleQueueIndexer *m_webindexer;
|
||||
WebQueueIndexer *m_webindexer;
|
||||
DbIxStatusUpdater *m_updater;
|
||||
string m_reason;
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ using namespace std;
|
||||
#include "cancelcheck.h"
|
||||
#include "rcldb.h"
|
||||
#ifndef DISABLE_WEB_INDEXER
|
||||
#include "beaglequeue.h"
|
||||
#include "webqueue.h"
|
||||
#endif
|
||||
#include "recollindex.h"
|
||||
#include "fsindexer.h"
|
||||
|
||||
@ -16,6 +16,8 @@
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include "webqueue.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include "safesysstat.h"
|
||||
@ -26,8 +28,7 @@
|
||||
#include "rclutil.h"
|
||||
#include "log.h"
|
||||
#include "fstreewalk.h"
|
||||
#include "beaglequeue.h"
|
||||
#include "beaglequeuecache.h"
|
||||
#include "webstore.h"
|
||||
#include "circache.h"
|
||||
#include "smallut.h"
|
||||
#include "fileudi.h"
|
||||
@ -44,12 +45,13 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Beagle creates a file named .xxx (where xxx is the name for the main file
|
||||
// in the queue), to hold external metadata (http or created by Beagle).
|
||||
// This class reads the .xxx, dotfile, and turns it into an Rcl::Doc holder
|
||||
class BeagleDotFile {
|
||||
// The browser plugin creates a file named .xxx (where xxx is the name
|
||||
// for the main file in the queue), to hold external metadata (http or
|
||||
// created by the plugin). This class reads the .xxx, dotfile, and turns
|
||||
// it into an Rcl::Doc holder
|
||||
class WebQueueDotFile {
|
||||
public:
|
||||
BeagleDotFile(RclConfig *conf, const string& fn)
|
||||
WebQueueDotFile(RclConfig *conf, const string& fn)
|
||||
: m_conf(conf), m_fn(fn)
|
||||
{}
|
||||
|
||||
@ -62,7 +64,7 @@ public:
|
||||
m_input.getline(cline, LL-1);
|
||||
if (!m_input.good()) {
|
||||
if (m_input.bad()) {
|
||||
LOGERR("beagleDotFileRead: input.bad()\n" );
|
||||
LOGERR("WebQueueDotFileRead: input.bad()\n" );
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -72,18 +74,18 @@ public:
|
||||
ll--;
|
||||
}
|
||||
line.assign(cline, ll);
|
||||
LOGDEB2("BeagleDotFile:readLine: [" << (line) << "]\n" );
|
||||
LOGDEB2("WebQueueDotFile:readLine: [" << (line) << "]\n" );
|
||||
return true;
|
||||
}
|
||||
|
||||
// Process a beagle dot file and set interesting stuff in the doc
|
||||
// Process a Web queue dot file and set interesting stuff in the doc
|
||||
bool toDoc(Rcl::Doc& doc)
|
||||
{
|
||||
string line;
|
||||
|
||||
m_input.open(m_fn.c_str(), ios::in);
|
||||
if (!m_input.good()) {
|
||||
LOGERR("BeagleDotFile: open failed for [" << (m_fn) << "]\n" );
|
||||
LOGERR("WebQueueDotFile: open failed for [" << (m_fn) << "]\n" );
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -173,24 +175,24 @@ public:
|
||||
|
||||
// Initialize. Compute paths and create a temporary directory that will be
|
||||
// used by internfile()
|
||||
BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
|
||||
WebQueueIndexer::WebQueueIndexer(RclConfig *cnf, Rcl::Db *db,
|
||||
DbIxStatusUpdater *updfunc)
|
||||
: m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc),
|
||||
m_nocacheindex(false)
|
||||
{
|
||||
m_queuedir = m_config->getWebQueueDir();
|
||||
path_catslash(m_queuedir);
|
||||
m_cache = new BeagleQueueCache(cnf);
|
||||
m_cache = new WebStore(cnf);
|
||||
}
|
||||
|
||||
BeagleQueueIndexer::~BeagleQueueIndexer()
|
||||
WebQueueIndexer::~WebQueueIndexer()
|
||||
{
|
||||
LOGDEB("BeagleQueueIndexer::~\n" );
|
||||
LOGDEB("WebQueueIndexer::~\n" );
|
||||
deleteZ(m_cache);
|
||||
}
|
||||
|
||||
// Index document stored in the cache.
|
||||
bool BeagleQueueIndexer::indexFromCache(const string& udi)
|
||||
bool WebQueueIndexer::indexFromCache(const string& udi)
|
||||
{
|
||||
if (!m_db)
|
||||
return false;
|
||||
@ -202,12 +204,12 @@ bool BeagleQueueIndexer::indexFromCache(const string& udi)
|
||||
string hittype;
|
||||
|
||||
if (!m_cache || !m_cache->getFromCache(udi, dotdoc, data, &hittype)) {
|
||||
LOGERR("BeagleQueueIndexer::indexFromCache: cache failed\n" );
|
||||
LOGERR("WebQueueIndexer::indexFromCache: cache failed\n" );
|
||||
return false;
|
||||
}
|
||||
|
||||
if (hittype.empty()) {
|
||||
LOGERR("BeagleIndexer::index: cc entry has no hit type\n" );
|
||||
LOGERR("WebQueueIndexer::index: cc entry has no hit type\n" );
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -224,11 +226,11 @@ bool BeagleQueueIndexer::indexFromCache(const string& udi)
|
||||
try {
|
||||
fis = interner.internfile(doc);
|
||||
} catch (CancelExcept) {
|
||||
LOGERR("BeagleQueueIndexer: interrupted\n" );
|
||||
LOGERR("WebQueueIndexer: interrupted\n" );
|
||||
return false;
|
||||
}
|
||||
if (fis != FileInterner::FIDone) {
|
||||
LOGERR("BeagleQueueIndexer: bad status from internfile\n" );
|
||||
LOGERR("WebQueueIndexer: bad status from internfile\n" );
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -242,7 +244,7 @@ bool BeagleQueueIndexer::indexFromCache(const string& udi)
|
||||
}
|
||||
}
|
||||
|
||||
void BeagleQueueIndexer::updstatus(const string& udi)
|
||||
void WebQueueIndexer::updstatus(const string& udi)
|
||||
{
|
||||
if (m_updater) {
|
||||
++(m_updater->status.docsdone);
|
||||
@ -253,18 +255,18 @@ void BeagleQueueIndexer::updstatus(const string& udi)
|
||||
}
|
||||
}
|
||||
|
||||
bool BeagleQueueIndexer::index()
|
||||
bool WebQueueIndexer::index()
|
||||
{
|
||||
if (!m_db)
|
||||
return false;
|
||||
LOGDEB("BeagleQueueIndexer::processqueue: [" << (m_queuedir) << "]\n" );
|
||||
LOGDEB("WebQueueIndexer::processqueue: [" << (m_queuedir) << "]\n" );
|
||||
m_config->setKeyDir(m_queuedir);
|
||||
if (!path_makepath(m_queuedir, 0700)) {
|
||||
LOGERR("BeagleQueueIndexer:: can't create queuedir [" << (m_queuedir) << "] errno " << (errno) << "\n" );
|
||||
LOGERR("WebQueueIndexer:: can't create queuedir [" << (m_queuedir) << "] errno " << (errno) << "\n" );
|
||||
return false;
|
||||
}
|
||||
if (!m_cache || !m_cache->cc()) {
|
||||
LOGERR("BeagleQueueIndexer: cache initialization failed\n" );
|
||||
LOGERR("WebQueueIndexer: cache initialization failed\n" );
|
||||
return false;
|
||||
}
|
||||
CirCache *cc = m_cache->cc();
|
||||
@ -282,7 +284,7 @@ bool BeagleQueueIndexer::index()
|
||||
do {
|
||||
string udi;
|
||||
if (!cc->getCurrentUdi(udi)) {
|
||||
LOGERR("BeagleQueueIndexer:: cache file damaged\n" );
|
||||
LOGERR("WebQueueIndexer:: cache file damaged\n" );
|
||||
break;
|
||||
}
|
||||
if (udi.empty())
|
||||
@ -295,7 +297,7 @@ bool BeagleQueueIndexer::index()
|
||||
indexFromCache(udi);
|
||||
updstatus(udi);
|
||||
} catch (CancelExcept) {
|
||||
LOGERR("BeagleQueueIndexer: interrupted\n" );
|
||||
LOGERR("WebQueueIndexer: interrupted\n" );
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -307,17 +309,17 @@ bool BeagleQueueIndexer::index()
|
||||
FsTreeWalker walker(FsTreeWalker::FtwNoRecurse);
|
||||
walker.addSkippedName(".*");
|
||||
FsTreeWalker::Status status = walker.walk(m_queuedir, *this);
|
||||
LOGDEB("BeagleQueueIndexer::processqueue: done: status " << (status) << "\n" );
|
||||
LOGDEB("WebQueueIndexer::processqueue: done: status " << (status) << "\n" );
|
||||
return true;
|
||||
}
|
||||
|
||||
// Index a list of files (sent by the real time monitor)
|
||||
bool BeagleQueueIndexer::indexFiles(list<string>& files)
|
||||
bool WebQueueIndexer::indexFiles(list<string>& files)
|
||||
{
|
||||
LOGDEB("BeagleQueueIndexer::indexFiles\n" );
|
||||
LOGDEB("WebQueueIndexer::indexFiles\n" );
|
||||
|
||||
if (!m_db) {
|
||||
LOGERR("BeagleQueueIndexer::indexfiles no db??\n" );
|
||||
LOGERR("WebQueueIndexer::indexfiles no db??\n" );
|
||||
return false;
|
||||
}
|
||||
for (list<string>::iterator it = files.begin(); it != files.end();) {
|
||||
@ -326,7 +328,7 @@ bool BeagleQueueIndexer::indexFiles(list<string>& files)
|
||||
}
|
||||
string father = path_getfather(*it);
|
||||
if (father.compare(m_queuedir)) {
|
||||
LOGDEB("BeagleQueueIndexer::indexfiles: skipping [" << *it << "] (nq)\n" );
|
||||
LOGDEB("WebQueueIndexer::indexfiles: skipping [" << *it << "] (nq)\n" );
|
||||
it++; continue;
|
||||
}
|
||||
// Problem: we are often called with the dot file, before the
|
||||
@ -342,11 +344,11 @@ bool BeagleQueueIndexer::indexFiles(list<string>& files)
|
||||
}
|
||||
struct stat st;
|
||||
if (path_fileprops(*it, &st) != 0) {
|
||||
LOGERR("BeagleQueueIndexer::indexfiles: cant stat [" << *it << "]\n" );
|
||||
LOGERR("WebQueueIndexer::indexfiles: cant stat [" << *it << "]\n" );
|
||||
it++; continue;
|
||||
}
|
||||
if (!S_ISREG(st.st_mode)) {
|
||||
LOGDEB("BeagleQueueIndexer::indexfiles: skipping [" << *it << "] (nr)\n" );
|
||||
LOGDEB("WebQueueIndexer::indexfiles: skipping [" << *it << "] (nr)\n" );
|
||||
it++; continue;
|
||||
}
|
||||
|
||||
@ -360,7 +362,7 @@ bool BeagleQueueIndexer::indexFiles(list<string>& files)
|
||||
}
|
||||
|
||||
FsTreeWalker::Status
|
||||
BeagleQueueIndexer::processone(const string &path,
|
||||
WebQueueIndexer::processone(const string &path,
|
||||
const struct stat *stp,
|
||||
FsTreeWalker::CbFlag flg)
|
||||
{
|
||||
@ -374,9 +376,9 @@ BeagleQueueIndexer::processone(const string &path,
|
||||
|
||||
string dotpath = path_cat(path_getfather(path),
|
||||
string(".") + path_getsimple(path));
|
||||
LOGDEB("BeagleQueueIndexer: prc1: [" << (path) << "]\n" );
|
||||
LOGDEB("WebQueueIndexer: prc1: [" << (path) << "]\n" );
|
||||
|
||||
BeagleDotFile dotfile(m_config, dotpath);
|
||||
WebQueueDotFile dotfile(m_config, dotpath);
|
||||
Rcl::Doc dotdoc;
|
||||
string udi, udipath;
|
||||
if (!dotfile.toDoc(dotdoc))
|
||||
@ -388,7 +390,7 @@ BeagleQueueIndexer::processone(const string &path,
|
||||
udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url));
|
||||
make_udi(udipath, cstr_null, udi);
|
||||
|
||||
LOGDEB("BeagleQueueIndexer: prc1: udi [" << (udi) << "]\n" );
|
||||
LOGDEB("WebQueueIndexer: prc1: udi [" << (udi) << "]\n" );
|
||||
char ascdate[30];
|
||||
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
||||
|
||||
@ -410,7 +412,7 @@ BeagleQueueIndexer::processone(const string &path,
|
||||
} else {
|
||||
Rcl::Doc doc;
|
||||
// Store the dotdoc fields in the future doc. In case someone wants
|
||||
// to use beagle-generated fields like beagle:inurl
|
||||
// to use fields generated by the browser plugin like inurl
|
||||
doc.meta = dotdoc.meta;
|
||||
|
||||
FileInterner interner(path, stp, m_config,
|
||||
@ -420,11 +422,11 @@ BeagleQueueIndexer::processone(const string &path,
|
||||
try {
|
||||
fis = interner.internfile(doc);
|
||||
} catch (CancelExcept) {
|
||||
LOGERR("BeagleQueueIndexer: interrupted\n" );
|
||||
LOGERR("WebQueueIndexer: interrupted\n" );
|
||||
goto out;
|
||||
}
|
||||
if (fis != FileInterner::FIDone && fis != FileInterner::FIAgain) {
|
||||
LOGERR("BeagleQueueIndexer: bad status from internfile\n" );
|
||||
LOGERR("WebQueueIndexer: bad status from internfile\n" );
|
||||
// TOBEDONE: internfile can return FIAgain here if it is
|
||||
// paging a big text file, we should loop. Means we're
|
||||
// only indexing the first page for text/plain files
|
||||
@ -457,11 +459,11 @@ BeagleQueueIndexer::processone(const string &path,
|
||||
string fdata;
|
||||
file_to_string(path, fdata);
|
||||
if (!m_cache || !m_cache->cc()) {
|
||||
LOGERR("BeagleQueueIndexer: cache initialization failed\n" );
|
||||
LOGERR("WebQueueIndexer: cache initialization failed\n" );
|
||||
goto out;
|
||||
}
|
||||
if (!m_cache->cc()->put(udi, &dotfile.m_fields, fdata, 0)) {
|
||||
LOGERR("BeagleQueueIndexer::prc1: cache_put failed; " << (m_cache->cc()->getReason()) << "\n" );
|
||||
LOGERR("WebQueueIndexer::prc1: cache_put failed; " << (m_cache->cc()->getReason()) << "\n" );
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
@ -14,18 +14,17 @@
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _beaglequeue_h_included_
|
||||
#define _beaglequeue_h_included_
|
||||
#ifndef _webqueue_h_included_
|
||||
#define _webqueue_h_included_
|
||||
|
||||
#include <list>
|
||||
|
||||
/**
|
||||
* Process the Beagle indexing queue.
|
||||
* Process the WEB indexing queue.
|
||||
*
|
||||
* Beagle MUST NOT be running, else mayhem will ensue.
|
||||
*
|
||||
* This is mainly written to reuse the Beagle Firefox plug-in (which
|
||||
* copies visited pages and bookmarks to the queue).
|
||||
* This was originally written to reuse the Beagle Firefox plug-in (which
|
||||
* copied visited pages and bookmarks to the queue), long dead and replaced by a
|
||||
* recoll-specific plugin.
|
||||
*/
|
||||
|
||||
#include "fstreewalk.h"
|
||||
@ -34,16 +33,16 @@
|
||||
class DbIxStatusUpdater;
|
||||
class CirCache;
|
||||
class RclConfig;
|
||||
class BeagleQueueCache;
|
||||
class WebStore;
|
||||
namespace Rcl {
|
||||
class Db;
|
||||
}
|
||||
|
||||
class BeagleQueueIndexer : public FsTreeWalkerCB {
|
||||
class WebQueueIndexer : public FsTreeWalkerCB {
|
||||
public:
|
||||
BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
|
||||
WebQueueIndexer(RclConfig *cnf, Rcl::Db *db,
|
||||
DbIxStatusUpdater *updfunc = 0);
|
||||
~BeagleQueueIndexer();
|
||||
~WebQueueIndexer();
|
||||
|
||||
/** This is called by the top indexer in recollindex.
|
||||
* Does the walking and the talking */
|
||||
@ -68,7 +67,7 @@ public:
|
||||
private:
|
||||
RclConfig *m_config;
|
||||
Rcl::Db *m_db;
|
||||
BeagleQueueCache *m_cache;
|
||||
WebStore *m_cache;
|
||||
string m_queuedir;
|
||||
DbIxStatusUpdater *m_updater;
|
||||
bool m_nocacheindex;
|
||||
@ -77,4 +76,4 @@ private:
|
||||
void updstatus(const string& udi);
|
||||
};
|
||||
|
||||
#endif /* _beaglequeue_h_included_ */
|
||||
#endif /* _webqueue_h_included_ */
|
||||
@ -16,23 +16,26 @@
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include "webqueuefetcher.h"
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include "rcldoc.h"
|
||||
#include "fetcher.h"
|
||||
#include "bglfetcher.h"
|
||||
#include "log.h"
|
||||
#include "beaglequeuecache.h"
|
||||
#include "webstore.h"
|
||||
|
||||
// We use a single beagle cache object to access beagle data. We protect it
|
||||
using std::string;
|
||||
|
||||
// We use a single WebStore object to access the data. We protect it
|
||||
// against multiple thread access.
|
||||
static std::mutex o_beagler_mutex;
|
||||
|
||||
bool BGLDocFetcher::fetch(RclConfig* cnf, const Rcl::Doc& idoc, RawDoc& out)
|
||||
bool WQDocFetcher::fetch(RclConfig* cnf, const Rcl::Doc& idoc, RawDoc& out)
|
||||
{
|
||||
string udi;
|
||||
if (!idoc.getmeta(Rcl::Doc::keyudi, &udi) || udi.empty()) {
|
||||
LOGERR("BGLDocFetcher:: no udi in idoc\n" );
|
||||
LOGERR("WQDocFetcher:: no udi in idoc\n" );
|
||||
return false;
|
||||
}
|
||||
Rcl::Doc dotdoc;
|
||||
@ -41,24 +44,23 @@ bool BGLDocFetcher::fetch(RclConfig* cnf, const Rcl::Doc& idoc, RawDoc& out)
|
||||
// Retrieve from our webcache (beagle data). The beagler
|
||||
// object is created at the first call of this routine and
|
||||
// deleted when the program exits.
|
||||
static BeagleQueueCache o_beagler(cnf);
|
||||
static WebStore o_beagler(cnf);
|
||||
if (!o_beagler.getFromCache(udi, dotdoc, out.data)) {
|
||||
LOGINFO("BGLDocFetcher::fetch: failed for [" << (udi) << "]\n" );
|
||||
LOGINFO("WQDocFetcher::fetch: failed for [" << udi << "]\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (dotdoc.mimetype.compare(idoc.mimetype)) {
|
||||
LOGINFO("BGLDocFetcher:: udi [" << (udi) << "], mimetp mismatch: in: [" << (idoc.mimetype) << "], bgl [" << (dotdoc.mimetype) << "]\n" );
|
||||
LOGINFO("WQDocFetcher:: udi [" << udi << "], mimetp mismatch: in: [" <<
|
||||
idoc.mimetype << "], bgl [" << dotdoc.mimetype << "]\n");
|
||||
}
|
||||
out.kind = RawDoc::RDK_DATA;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool BGLDocFetcher::makesig(RclConfig* cnf, const Rcl::Doc& idoc, string& sig)
|
||||
bool WQDocFetcher::makesig(RclConfig* cnf, const Rcl::Doc& idoc, string& sig)
|
||||
{
|
||||
// Bgl sigs are empty
|
||||
// Web queue sigs are empty
|
||||
sig.clear();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -14,18 +14,19 @@
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _BGLFETCHER_H_INCLUDED_
|
||||
#define _BGLFETCHER_H_INCLUDED_
|
||||
#ifndef _WEBQUEUEFETCHER_H_INCLUDED_
|
||||
#define _WEBQUEUEFETCHER_H_INCLUDED_
|
||||
|
||||
#include "fetcher.h"
|
||||
|
||||
/**
|
||||
* The Beagle cache fetcher:
|
||||
* The WEB queue cache fetcher:
|
||||
*/
|
||||
class BGLDocFetcher : public DocFetcher{
|
||||
class WQDocFetcher : public DocFetcher{
|
||||
virtual bool fetch(RclConfig* cnf, const Rcl::Doc& idoc, RawDoc& out);
|
||||
virtual bool makesig(RclConfig* cnf, const Rcl::Doc& idoc,
|
||||
std::string& sig);
|
||||
virtual ~BGLDocFetcher() {}
|
||||
virtual ~WQDocFetcher() {}
|
||||
};
|
||||
|
||||
#endif /* _BGLFETCHER_H_INCLUDED_ */
|
||||
#endif /* _WEBQUEUEFETCHER_H_INCLUDED_ */
|
||||
@ -47,6 +47,7 @@ using namespace std;
|
||||
#include "copyfile.h"
|
||||
#include "fetcher.h"
|
||||
#include "extrameta.h"
|
||||
#include "uncomp.h"
|
||||
|
||||
// The internal path element separator. This can't be the same as the rcldb
|
||||
// file to ipath separator : "|"
|
||||
@ -188,7 +189,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
|
||||
int maxkbs = -1;
|
||||
if (!m_cfg->getConfParam("compressedfilemaxkbs", &maxkbs) ||
|
||||
maxkbs < 0 || !stp || int(stp->st_size / 1024) < maxkbs) {
|
||||
if (!m_uncomp.uncompressfile(m_fn, ucmd, m_tfile)) {
|
||||
if (!m_uncomp->uncompressfile(m_fn, ucmd, m_tfile)) {
|
||||
return;
|
||||
}
|
||||
LOGDEB1("FileInterner:: after ucomp: tfile " << m_tfile <<"\n");
|
||||
@ -293,8 +294,8 @@ void FileInterner::init(const string &data, RclConfig *cnf,
|
||||
result = df->set_document_data(m_mimetype, data.c_str(), data.length());
|
||||
} else if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME)) {
|
||||
TempFile temp = dataToTempFile(data, m_mimetype);
|
||||
if (temp &&
|
||||
(result = df->set_document_file(m_mimetype, temp->filename()))) {
|
||||
if (temp.ok() &&
|
||||
(result = df->set_document_file(m_mimetype, temp.filename()))) {
|
||||
m_tmpflgs[m_handlers.size()] = true;
|
||||
m_tempfiles.push_back(temp);
|
||||
}
|
||||
@ -312,7 +313,8 @@ void FileInterner::init(const string &data, RclConfig *cnf,
|
||||
void FileInterner::initcommon(RclConfig *cnf, int flags)
|
||||
{
|
||||
m_cfg = cnf;
|
||||
m_uncomp = m_forPreview = ((flags & FIF_forPreview) != 0);
|
||||
m_forPreview = ((flags & FIF_forPreview) != 0);
|
||||
m_uncomp = new Uncomp(m_forPreview);
|
||||
// Initialize handler stack.
|
||||
m_handlers.reserve(MAXHANDLERS);
|
||||
for (unsigned int i = 0; i < MAXHANDLERS; i++)
|
||||
@ -373,10 +375,10 @@ bool FileInterner::makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig)
|
||||
|
||||
FileInterner::~FileInterner()
|
||||
{
|
||||
for (vector<RecollFilter*>::iterator it = m_handlers.begin();
|
||||
it != m_handlers.end(); it++) {
|
||||
returnMimeHandler(*it);
|
||||
for (auto& entry: m_handlers) {
|
||||
returnMimeHandler(entry);
|
||||
}
|
||||
delete m_uncomp;
|
||||
// m_tempfiles will take care of itself
|
||||
}
|
||||
|
||||
@ -386,14 +388,14 @@ FileInterner::~FileInterner()
|
||||
TempFile FileInterner::dataToTempFile(const string& dt, const string& mt)
|
||||
{
|
||||
// Create temp file with appropriate suffix for mime type
|
||||
TempFile temp(new TempFileInternal(m_cfg->getSuffixFromMimeType(mt)));
|
||||
if (!temp->ok()) {
|
||||
TempFile temp(m_cfg->getSuffixFromMimeType(mt));
|
||||
if (!temp.ok()) {
|
||||
LOGERR("FileInterner::dataToTempFile: cant create tempfile: " <<
|
||||
temp->getreason() << "\n");
|
||||
temp.getreason() << "\n");
|
||||
return TempFile();
|
||||
}
|
||||
string reason;
|
||||
if (!stringtofile(dt, temp->filename(), reason)) {
|
||||
if (!stringtofile(dt, temp.filename(), reason)) {
|
||||
LOGERR("FileInterner::dataToTempFile: stringtofile: " <<reason << "\n");
|
||||
return TempFile();
|
||||
}
|
||||
@ -723,8 +725,8 @@ int FileInterner::addHandler()
|
||||
setres = newflt->set_document_data(mimetype,txt->c_str(),txt->length());
|
||||
} else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME)) {
|
||||
TempFile temp = dataToTempFile(*txt, mimetype);
|
||||
if (temp &&
|
||||
(setres = newflt->set_document_file(mimetype, temp->filename()))) {
|
||||
if (temp.ok() &&
|
||||
(setres = newflt->set_document_file(mimetype, temp.filename()))) {
|
||||
m_tmpflgs[m_handlers.size()] = true;
|
||||
m_tempfiles.push_back(temp);
|
||||
// Hack here, but really helps perfs: if we happen to
|
||||
@ -765,7 +767,7 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc,const string& ipath)
|
||||
LOGDEB("FileInterner::internfile. ipath [" << ipath << "]\n");
|
||||
|
||||
// Get rid of possible image tempfile from older call
|
||||
m_imgtmp.reset();
|
||||
m_imgtmp = TempFile();
|
||||
|
||||
if (m_handlers.size() < 1) {
|
||||
// Just means the constructor failed
|
||||
@ -916,9 +918,8 @@ FileInterner::Status FileInterner::internfile(Rcl::Doc& doc,const string& ipath)
|
||||
bool FileInterner::tempFileForMT(TempFile& otemp, RclConfig* cnf,
|
||||
const string& mimetype)
|
||||
{
|
||||
TempFile temp(new TempFileInternal(
|
||||
cnf->getSuffixFromMimeType(mimetype)));
|
||||
if (!temp->ok()) {
|
||||
TempFile temp(cnf->getSuffixFromMimeType(mimetype));
|
||||
if (!temp.ok()) {
|
||||
LOGERR("FileInterner::tempFileForMT: can't create temp file\n");
|
||||
return false;
|
||||
}
|
||||
@ -970,7 +971,7 @@ bool FileInterner::topdocToFile(
|
||||
if (!tempFileForMT(temp, cnf, idoc.mimetype)) {
|
||||
return false;
|
||||
}
|
||||
filename = temp->filename();
|
||||
filename = temp.filename();
|
||||
} else {
|
||||
filename = tofile.c_str();
|
||||
}
|
||||
@ -985,7 +986,7 @@ bool FileInterner::topdocToFile(
|
||||
return false;
|
||||
}
|
||||
}
|
||||
fn = temp ? temp->filename() : rawdoc.data;
|
||||
fn = temp.ok() ? temp.filename() : rawdoc.data;
|
||||
if (!copyfile(fn.c_str(), filename, reason)) {
|
||||
LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n");
|
||||
return false;
|
||||
@ -1040,7 +1041,7 @@ bool FileInterner::interntofile(TempFile& otemp, const string& tofile,
|
||||
if (!tempFileForMT(temp, m_cfg, mimetype)) {
|
||||
return false;
|
||||
}
|
||||
filename = temp->filename();
|
||||
filename = temp.filename();
|
||||
} else {
|
||||
filename = tofile.c_str();
|
||||
}
|
||||
@ -1106,9 +1107,8 @@ bool FileInterner::maybeUncompressToTemp(TempFile& temp, const string& fn,
|
||||
" kbs\n");
|
||||
return false;
|
||||
}
|
||||
temp =
|
||||
TempFile(new TempFileInternal(cnf->getSuffixFromMimeType(doc.mimetype)));
|
||||
if (!temp->ok()) {
|
||||
temp = TempFile(cnf->getSuffixFromMimeType(doc.mimetype));
|
||||
if (!temp.ok()) {
|
||||
LOGERR("FileInterner: cant create temporary file\n");
|
||||
return false;
|
||||
}
|
||||
@ -1123,9 +1123,9 @@ bool FileInterner::maybeUncompressToTemp(TempFile& temp, const string& fn,
|
||||
// reason for this, but it's not nice here. Have to move the
|
||||
// uncompressed file, hopefully staying on the same dev.
|
||||
string reason;
|
||||
if (!renameormove(uncomped.c_str(), temp->filename(), reason)) {
|
||||
if (!renameormove(uncomped.c_str(), temp.filename(), reason)) {
|
||||
LOGERR("FileInterner::maybeUncompress: move [" << uncomped <<
|
||||
"] -> [" << temp->filename() << "] failed: " << reason << "\n");
|
||||
"] -> [" << temp.filename() << "] failed: " << reason << "\n");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
||||
@ -28,14 +28,15 @@ using std::map;
|
||||
using std::set;
|
||||
|
||||
#include "mimehandler.h"
|
||||
#include "uncomp.h"
|
||||
#include "pathut.h"
|
||||
#include "rclutil.h"
|
||||
|
||||
class RclConfig;
|
||||
namespace Rcl {
|
||||
class Doc;
|
||||
}
|
||||
|
||||
class Uncomp;
|
||||
struct stat;
|
||||
|
||||
/** Storage for missing helper program info. We want to keep this out of the
|
||||
@ -277,7 +278,7 @@ class FileInterner {
|
||||
string m_reason;
|
||||
FIMissingStore *m_missingdatap{nullptr};
|
||||
|
||||
Uncomp m_uncomp;
|
||||
Uncomp *m_uncomp{nullptr};
|
||||
|
||||
bool m_noxattrs; // disable xattrs usage
|
||||
bool m_direct; // External app did the extraction
|
||||
|
||||
@ -260,8 +260,8 @@ bool MimeHandlerMail::processAttach()
|
||||
att->m_charset << "] fn [" << att->m_filename << "]\n");
|
||||
|
||||
// Erase current content and replace
|
||||
m_metaData[cstr_dj_keycontent] = string();
|
||||
string& body = m_metaData[cstr_dj_keycontent];
|
||||
body.clear();
|
||||
att->m_part->getBody(body, 0, att->m_part->bodylength);
|
||||
{
|
||||
string decoded;
|
||||
@ -285,10 +285,15 @@ bool MimeHandlerMail::processAttach()
|
||||
|
||||
// Special case for text/plain content. Internfile should deal
|
||||
// with this but it expects text/plain to be utf-8 already, so we
|
||||
// handle the transcoding if needed
|
||||
// handle the transcoding if needed. Same kind of issue for the MD5
|
||||
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
|
||||
if (!txtdcode("MimeHandlerMail::processAttach"))
|
||||
if (!txtdcode("MimeHandlerMail::processAttach")) {
|
||||
body.clear();
|
||||
} else if (!m_forPreview) {
|
||||
string md5, xmd5;
|
||||
MD5String(body, md5);
|
||||
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
|
||||
}
|
||||
}
|
||||
|
||||
// Ipath
|
||||
|
||||
@ -23,16 +23,6 @@
|
||||
#include "safesysstat.h"
|
||||
#include <time.h>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define USING_STD_REGEX
|
||||
#endif
|
||||
|
||||
#ifdef USING_STD_REGEX
|
||||
#include <regex>
|
||||
#else
|
||||
#include <regex.h>
|
||||
#endif
|
||||
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
@ -363,7 +353,7 @@ static inline void stripendnl(line_type& line, int& ll)
|
||||
// This was added as an alternative format. By the way it also fools "mail" and
|
||||
// emacs-vm, Recoll is not alone
|
||||
// Update: 2009-11-27: word after From may be quoted string: From "john bull"
|
||||
static const char *frompat =
|
||||
static const string frompat{
|
||||
"^From[ ]+([^ ]+|\"[^\"]+\")[ ]+" // 'From (toto@tutu|"john bull") '
|
||||
"[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Fri Oct 26
|
||||
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+" // Time, seconds optional
|
||||
@ -374,45 +364,15 @@ static const char *frompat =
|
||||
"[[:alpha:]]{3},[ ]+[0-3]?[0-9][ ]+[[:alpha:]]{3}[ ]+" // Mon, 8 May
|
||||
"[12][0-9][0-9][0-9][ ]+" // Year
|
||||
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?" // Time, secs optional
|
||||
;
|
||||
};
|
||||
|
||||
// Extreme Thunderbird brokenness. Will sometimes use From lines
|
||||
// exactly like: From ^M (From followed by space and eol). We only
|
||||
// test for this if QUIRKS_TBIRD is set
|
||||
static const char *miniTbirdFrom = "^From $";
|
||||
#ifndef USING_STD_REGEX
|
||||
static regex_t fromregex;
|
||||
static regex_t minifromregex;
|
||||
#define M_regexec(A,B,C,D,E) regexec(&(A),B,C,D,E)
|
||||
#else
|
||||
basic_regex<char> fromregex;
|
||||
basic_regex<char> minifromregex;
|
||||
#define REG_NOSUB std::regex_constants::nosubs
|
||||
#define REG_EXTENDED std::regex_constants::extended
|
||||
#define M_regexec(A, B, C, D, E) (!regex_match(B,A))
|
||||
static const string miniTbirdFrom{"^From $"};
|
||||
|
||||
#endif
|
||||
|
||||
static bool regcompiled;
|
||||
static std::mutex o_regex_mutex;
|
||||
|
||||
static void compileregexes()
|
||||
{
|
||||
std::unique_lock<std::mutex> locker(o_regex_mutex);
|
||||
// As the initial test of regcompiled is unprotected the value may
|
||||
// have changed while we were waiting for the lock. Test again now
|
||||
// that we are alone.
|
||||
if (regcompiled)
|
||||
return;
|
||||
#ifndef USING_STD_REGEX
|
||||
regcomp(&fromregex, frompat, REG_NOSUB|REG_EXTENDED);
|
||||
regcomp(&minifromregex, miniTbirdFrom, REG_NOSUB|REG_EXTENDED);
|
||||
#else
|
||||
fromregex = basic_regex<char>(frompat, REG_NOSUB | REG_EXTENDED);
|
||||
minifromregex = basic_regex<char>(miniTbirdFrom, REG_NOSUB | REG_EXTENDED);
|
||||
#endif
|
||||
regcompiled = true;
|
||||
}
|
||||
static SimpleRegexp fromregex(frompat, SimpleRegexp::SRE_NOSUB);
|
||||
static SimpleRegexp minifromregex(miniTbirdFrom, SimpleRegexp::SRE_NOSUB);
|
||||
|
||||
bool MimeHandlerMbox::next_document()
|
||||
{
|
||||
@ -432,13 +392,11 @@ bool MimeHandlerMbox::next_document()
|
||||
LOGDEB("MimeHandlerMbox::next_document: can't preview folders!\n");
|
||||
return false;
|
||||
}
|
||||
LOGDEB0("MimeHandlerMbox::next_document: fn " << (m_fn) << ", msgnum " << (m_msgnum) << " mtarg " << (mtarg) << " \n");
|
||||
LOGDEB0("MimeHandlerMbox::next_document: fn " << m_fn << ", msgnum " <<
|
||||
m_msgnum << " mtarg " << mtarg << " \n");
|
||||
if (mtarg == 0)
|
||||
mtarg = -1;
|
||||
|
||||
if (!regcompiled) {
|
||||
compileregexes();
|
||||
}
|
||||
|
||||
// If we are called to retrieve a specific message, seek to bof
|
||||
// (then scan up to the message). This is for the case where the
|
||||
@ -452,14 +410,14 @@ bool MimeHandlerMbox::next_document()
|
||||
if (mtarg > 0) {
|
||||
mbhoff_type off;
|
||||
line_type line;
|
||||
LOGDEB0("MimeHandlerMbox::next_doc: mtarg " << (mtarg) << " m_udi[" << (m_udi) << "]\n");
|
||||
LOGDEB0("MimeHandlerMbox::next_doc: mtarg " << mtarg << " m_udi[" <<
|
||||
m_udi << "]\n");
|
||||
if (!m_udi.empty() &&
|
||||
(off = o_mcache.get_offset(m_config, m_udi, mtarg)) >= 0 &&
|
||||
fseeko(fp, (off_t)off, SEEK_SET) >= 0 &&
|
||||
fgets(line, LL, fp) &&
|
||||
(!M_regexec(fromregex, line, 0, 0, 0) ||
|
||||
((m_quirks & MBOXQUIRK_TBIRD) &&
|
||||
!M_regexec(minifromregex, line, 0, 0, 0))) ) {
|
||||
(fromregex(line) || ((m_quirks & MBOXQUIRK_TBIRD) &&
|
||||
minifromregex(line))) ) {
|
||||
LOGDEB0("MimeHandlerMbox: Cache: From_ Ok\n");
|
||||
fseeko(fp, (off_t)off, SEEK_SET);
|
||||
m_msgnum = mtarg -1;
|
||||
@ -487,7 +445,8 @@ bool MimeHandlerMbox::next_document()
|
||||
m_lineno++;
|
||||
int ll;
|
||||
stripendnl(line, ll);
|
||||
LOGDEB2("mhmbox:next: hadempty " << (hademptyline) << " lineno " << (m_lineno) << " ll " << (ll) << " Line: [" << (line) << "]\n");
|
||||
LOGDEB2("mhmbox:next: hadempty " << hademptyline << " lineno " <<
|
||||
m_lineno << " ll " << ll << " Line: [" << line << "]\n");
|
||||
if (hademptyline) {
|
||||
if (ll > 0) {
|
||||
// Non-empty line with empty line flag set, reset flag
|
||||
@ -501,11 +460,12 @@ bool MimeHandlerMbox::next_document()
|
||||
/* The 'F' compare is redundant but it improves performance
|
||||
A LOT */
|
||||
if (line[0] == 'F' && (
|
||||
!M_regexec(fromregex, line, 0, 0, 0) ||
|
||||
((m_quirks & MBOXQUIRK_TBIRD) &&
|
||||
!M_regexec(minifromregex, line, 0, 0, 0)))
|
||||
fromregex(line) ||
|
||||
((m_quirks & MBOXQUIRK_TBIRD) && minifromregex(line)))
|
||||
) {
|
||||
LOGDEB0("MimeHandlerMbox: msgnum " << (m_msgnum) << ", From_ at line " << (m_lineno) << ": [" << (line) << "]\n");
|
||||
LOGDEB0("MimeHandlerMbox: msgnum " << m_msgnum <<
|
||||
", From_ at line " << m_lineno << ": [" << line
|
||||
<< "]\n");
|
||||
if (storeoffsets)
|
||||
m_offsets.push_back(message_end);
|
||||
m_msgnum++;
|
||||
@ -528,13 +488,15 @@ bool MimeHandlerMbox::next_document()
|
||||
line[ll+1] = 0;
|
||||
msgtxt += line;
|
||||
if (msgtxt.size() > max_mbox_member_size) {
|
||||
LOGERR("mh_mbox: huge message (more than " << (max_mbox_member_size/(1024*1024)) << " MB) inside " << (m_fn) << ", giving up\n");
|
||||
LOGERR("mh_mbox: huge message (more than " <<
|
||||
max_mbox_member_size/(1024*1024) << " MB) inside " <<
|
||||
m_fn << ", giving up\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
LOGDEB2("Message text length " << (msgtxt.size()) << "\n");
|
||||
LOGDEB2("Message text: [" << (msgtxt) << "]\n");
|
||||
LOGDEB2("Message text length " << msgtxt.size() << "\n");
|
||||
LOGDEB2("Message text: [" << msgtxt << "]\n");
|
||||
char buf[20];
|
||||
// m_msgnum was incremented when hitting the next From_ or eof, so the data
|
||||
// is for m_msgnum - 1
|
||||
|
||||
@ -35,6 +35,12 @@ using std::vector;
|
||||
|
||||
Uncomp::UncompCache Uncomp::o_cache;
|
||||
|
||||
// Constructor: record the docache flag in m_docache and trace its value
// through LOGDEB0. The flag is tested again in the destructor.
Uncomp::Uncomp(bool docache)
|
||||
: m_docache(docache)
|
||||
{
|
||||
LOGDEB0("Uncomp::Uncomp: m_docache: " << m_docache << "\n");
|
||||
}
|
||||
|
||||
bool Uncomp::uncompressfile(const string& ifn,
|
||||
const vector<string>& cmdv, string& tfile)
|
||||
{
|
||||
@ -57,7 +63,8 @@ bool Uncomp::uncompressfile(const string& ifn,
|
||||
}
|
||||
// Make sure tmp dir is empty. we guarantee this to filters
|
||||
if (!m_dir || !m_dir->ok() || !m_dir->wipe()) {
|
||||
LOGERR("uncompressfile: can't clear temp dir " << (m_dir->dirname()) << "\n" );
|
||||
LOGERR("uncompressfile: can't clear temp dir " << m_dir->dirname() <<
|
||||
"\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -66,12 +73,14 @@ bool Uncomp::uncompressfile(const string& ifn,
|
||||
int pc;
|
||||
long long availmbs;
|
||||
if (!fsocc(m_dir->dirname(), &pc, &availmbs)) {
|
||||
LOGERR("uncompressfile: can't retrieve avail space for " << (m_dir->dirname()) << "\n" );
|
||||
LOGERR("uncompressfile: can't retrieve avail space for " <<
|
||||
m_dir->dirname() << "\n");
|
||||
// Hope for the best
|
||||
} else {
|
||||
long long fsize = path_filesize(ifn);
|
||||
if (fsize < 0) {
|
||||
LOGERR("uncompressfile: stat input file " << (ifn) << " errno " << (errno) << "\n" );
|
||||
LOGERR("uncompressfile: stat input file " << ifn << " errno " <<
|
||||
errno << "\n");
|
||||
return false;
|
||||
}
|
||||
// We need at least twice the file size for the uncompressed
|
||||
@ -83,7 +92,9 @@ bool Uncomp::uncompressfile(const string& ifn,
|
||||
long long filembs = fsize / (1024 * 1024);
|
||||
|
||||
if (availmbs < 2 * filembs + 1) {
|
||||
LOGERR("uncompressfile. " << (lltodecstr(availmbs)) << " MBs available in " << (m_dir->dirname()) << " not enough to uncompress " << (ifn) << " of size " << (lltodecstr(filembs)) << " mbs\n" );
|
||||
LOGERR("uncompressfile. " << availmbs << " MBs available in " <<
|
||||
m_dir->dirname() << " not enough to uncompress " <<
|
||||
ifn << " of size " << filembs << " MBs\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -107,9 +118,10 @@ bool Uncomp::uncompressfile(const string& ifn,
|
||||
ExecCmd ex;
|
||||
int status = ex.doexec(cmd, args, 0, &tfile);
|
||||
if (status || tfile.empty()) {
|
||||
LOGERR("uncompressfile: doexec: failed for [" << (ifn) << "] status 0x" << (status) << "\n" );
|
||||
LOGERR("uncompressfile: doexec: failed for [" << ifn << "] status 0x" <<
|
||||
status << "\n");
|
||||
if (!m_dir->wipe()) {
|
||||
LOGERR("uncompressfile: wipedir failed\n" );
|
||||
LOGERR("uncompressfile: wipedir failed\n");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -122,6 +134,8 @@ bool Uncomp::uncompressfile(const string& ifn,
|
||||
|
||||
Uncomp::~Uncomp()
|
||||
{
|
||||
LOGDEB0("Uncomp::~Uncomp: m_docache: " << m_docache << " m_dir " <<
|
||||
(m_dir?m_dir->dirname():"(null)") << "\n");
|
||||
if (m_docache) {
|
||||
std::unique_lock<std::mutex> lock(o_cache.m_lock);
|
||||
delete o_cache.m_dir;
|
||||
@ -133,4 +147,12 @@ Uncomp::~Uncomp()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Reset the shared o_cache entry: delete its TempDir object, null the
// pointer, and empty the recorded temporary file and source path strings.
void Uncomp::clearcache()
|
||||
{
|
||||
LOGDEB0("Uncomp::clearcache\n");
|
||||
// Hold the cache lock for the rest of the function.
std::unique_lock<std::mutex> lock(o_cache.m_lock);
|
||||
delete o_cache.m_dir;
|
||||
// Null the pointer after the delete so it no longer refers to the freed object.
o_cache.m_dir = 0;
|
||||
o_cache.m_tfile.clear();
|
||||
o_cache.m_srcpath.clear();
|
||||
}
|
||||
|
||||
@ -27,10 +27,7 @@
|
||||
/// Uncompression script interface.
|
||||
class Uncomp {
|
||||
public:
|
||||
Uncomp(bool docache = false)
|
||||
: m_dir(0), m_docache(docache)
|
||||
{
|
||||
}
|
||||
explicit Uncomp(bool docache = false);
|
||||
~Uncomp();
|
||||
|
||||
/** Uncompress the input file into a temporary one, by executing the
|
||||
@ -41,25 +38,22 @@ public:
|
||||
bool uncompressfile(const std::string& ifn,
|
||||
const std::vector<std::string>& cmdv,
|
||||
std::string& tfile);
|
||||
|
||||
static void clearcache();
|
||||
|
||||
private:
|
||||
TempDir *m_dir;
|
||||
TempDir *m_dir{0};
|
||||
std::string m_tfile;
|
||||
std::string m_srcpath;
|
||||
bool m_docache;
|
||||
|
||||
class UncompCache {
|
||||
public:
|
||||
UncompCache()
|
||||
: m_dir(0)
|
||||
{
|
||||
}
|
||||
~UncompCache()
|
||||
{
|
||||
UncompCache() {}
|
||||
~UncompCache() {
|
||||
delete m_dir;
|
||||
}
|
||||
std::mutex m_lock;
|
||||
TempDir *m_dir;
|
||||
TempDir *m_dir{0};
|
||||
std::string m_tfile;
|
||||
std::string m_srcpath;
|
||||
};
|
||||
|
||||
28
src/python/pychm/AUTHORS
Normal file
28
src/python/pychm/AUTHORS
Normal file
@ -0,0 +1,28 @@
|
||||
Author
|
||||
------
|
||||
|
||||
Rubens Ramos <rubensr@users.sourceforge.net>
|
||||
|
||||
Maintainer
|
||||
----------
|
||||
|
||||
Mikhail Gusarov <dottedmag@dottedmag.net>
|
||||
|
||||
Python3 port minor changes
|
||||
--------------------------
|
||||
|
||||
Jean-Francois Dockes <jf@dockes.org>
|
||||
|
||||
Acknowledgements
|
||||
----------------
|
||||
|
||||
This work would not have been possible without the existence of chmlib,
|
||||
developed by Jed Wing, and a lot of the python code used to parse the contents
|
||||
tree and to decode the index files was heavily based on the code implemented by
|
||||
Razvan Cojocaru <razvanco@gmx.net> for the xCHM viewer.
|
||||
|
||||
Bug reports
|
||||
-----------
|
||||
|
||||
can3p, Chang (changshu), Hristo Iliev, Carlos Liu, Torsten Marek, Dmitri
|
||||
(nebraskin), Fredrik de Vibe, Glenn Washburn
|
||||
281
src/python/pychm/COPYING
Normal file
281
src/python/pychm/COPYING
Normal file
@ -0,0 +1,281 @@
|
||||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 2, June 1991
|
||||
|
||||
Copyright (C) 1989, 1991 Free Software Foundation, Inc.
|
||||
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The licenses for most software are designed to take away your
|
||||
freedom to share and change it. By contrast, the GNU General Public
|
||||
License is intended to guarantee your freedom to share and change free
|
||||
software--to make sure the software is free for all its users. This
|
||||
General Public License applies to most of the Free Software
|
||||
Foundation's software and to any other program whose authors commit to
|
||||
using it. (Some other Free Software Foundation software is covered by
|
||||
the GNU Library General Public License instead.) You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
this service if you wish), that you receive source code or can get it
|
||||
if you want it, that you can change the software or use pieces of it
|
||||
in new free programs; and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to make restrictions that forbid
|
||||
anyone to deny you these rights or to ask you to surrender the rights.
|
||||
These restrictions translate to certain responsibilities for you if you
|
||||
distribute copies of the software, or if you modify it.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must give the recipients all the rights that
|
||||
you have. You must make sure that they, too, receive or can get the
|
||||
source code. And you must show them these terms so they know their
|
||||
rights.
|
||||
|
||||
We protect your rights with two steps: (1) copyright the software, and
|
||||
(2) offer you this license which gives you legal permission to copy,
|
||||
distribute and/or modify the software.
|
||||
|
||||
Also, for each author's protection and ours, we want to make certain
|
||||
that everyone understands that there is no warranty for this free
|
||||
software. If the software is modified by someone else and passed on, we
|
||||
want its recipients to know that what they have is not the original, so
|
||||
that any problems introduced by others will not reflect on the original
|
||||
authors' reputations.
|
||||
|
||||
Finally, any free program is threatened constantly by software
|
||||
patents. We wish to avoid the danger that redistributors of a free
|
||||
program will individually obtain patent licenses, in effect making the
|
||||
program proprietary. To prevent this, we have made it clear that any
|
||||
patent must be licensed for everyone's free use or not licensed at all.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
GNU GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. This License applies to any program or other work which contains
|
||||
a notice placed by the copyright holder saying it may be distributed
|
||||
under the terms of this General Public License. The "Program", below,
|
||||
refers to any such program or work, and a "work based on the Program"
|
||||
means either the Program or any derivative work under copyright law:
|
||||
that is to say, a work containing the Program or a portion of it,
|
||||
either verbatim or with modifications and/or translated into another
|
||||
language. (Hereinafter, translation is included without limitation in
|
||||
the term "modification".) Each licensee is addressed as "you".
|
||||
|
||||
Activities other than copying, distribution and modification are not
|
||||
covered by this License; they are outside its scope. The act of
|
||||
running the Program is not restricted, and the output from the Program
|
||||
is covered only if its contents constitute a work based on the
|
||||
Program (independent of having been made by running the Program).
|
||||
Whether that is true depends on what the Program does.
|
||||
|
||||
1. You may copy and distribute verbatim copies of the Program's
|
||||
source code as you receive it, in any medium, provided that you
|
||||
conspicuously and appropriately publish on each copy an appropriate
|
||||
copyright notice and disclaimer of warranty; keep intact all the
|
||||
notices that refer to this License and to the absence of any warranty;
|
||||
and give any other recipients of the Program a copy of this License
|
||||
along with the Program.
|
||||
|
||||
You may charge a fee for the physical act of transferring a copy, and
|
||||
you may at your option offer warranty protection in exchange for a fee.
|
||||
|
||||
2. You may modify your copy or copies of the Program or any portion
|
||||
of it, thus forming a work based on the Program, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
above, provided that you also meet all of these conditions:
|
||||
|
||||
a) You must cause the modified files to carry prominent notices
|
||||
stating that you changed the files and the date of any change.
|
||||
|
||||
b) You must cause any work that you distribute or publish, that in
|
||||
whole or in part contains or is derived from the Program or any
|
||||
part thereof, to be licensed as a whole at no charge to all third
|
||||
parties under the terms of this License.
|
||||
|
||||
c) If the modified program normally reads commands interactively
|
||||
when run, you must cause it, when started running for such
|
||||
interactive use in the most ordinary way, to print or display an
|
||||
announcement including an appropriate copyright notice and a
|
||||
notice that there is no warranty (or else, saying that you provide
|
||||
a warranty) and that users may redistribute the program under
|
||||
these conditions, and telling the user how to view a copy of this
|
||||
License. (Exception: if the Program itself is interactive but
|
||||
does not normally print such an announcement, your work based on
|
||||
the Program is not required to print an announcement.)
|
||||
|
||||
These requirements apply to the modified work as a whole. If
|
||||
identifiable sections of that work are not derived from the Program,
|
||||
and can be reasonably considered independent and separate works in
|
||||
themselves, then this License, and its terms, do not apply to those
|
||||
sections when you distribute them as separate works. But when you
|
||||
distribute the same sections as part of a whole which is a work based
|
||||
on the Program, the distribution of the whole must be on the terms of
|
||||
this License, whose permissions for other licensees extend to the
|
||||
entire whole, and thus to each and every part regardless of who wrote it.
|
||||
|
||||
Thus, it is not the intent of this section to claim rights or contest
|
||||
your rights to work written entirely by you; rather, the intent is to
|
||||
exercise the right to control the distribution of derivative or
|
||||
collective works based on the Program.
|
||||
|
||||
In addition, mere aggregation of another work not based on the Program
|
||||
with the Program (or with a work based on the Program) on a volume of
|
||||
a storage or distribution medium does not bring the other work under
|
||||
the scope of this License.
|
||||
|
||||
3. You may copy and distribute the Program (or a work based on it,
|
||||
under Section 2) in object code or executable form under the terms of
|
||||
Sections 1 and 2 above provided that you also do one of the following:
|
||||
|
||||
a) Accompany it with the complete corresponding machine-readable
|
||||
source code, which must be distributed under the terms of Sections
|
||||
1 and 2 above on a medium customarily used for software interchange; or,
|
||||
|
||||
b) Accompany it with a written offer, valid for at least three
|
||||
years, to give any third party, for a charge no more than your
|
||||
cost of physically performing source distribution, a complete
|
||||
machine-readable copy of the corresponding source code, to be
|
||||
distributed under the terms of Sections 1 and 2 above on a medium
|
||||
customarily used for software interchange; or,
|
||||
|
||||
c) Accompany it with the information you received as to the offer
|
||||
to distribute corresponding source code. (This alternative is
|
||||
allowed only for noncommercial distribution and only if you
|
||||
received the program in object code or executable form with such
|
||||
an offer, in accord with Subsection b above.)
|
||||
|
||||
The source code for a work means the preferred form of the work for
|
||||
making modifications to it. For an executable work, complete source
|
||||
code means all the source code for all modules it contains, plus any
|
||||
associated interface definition files, plus the scripts used to
|
||||
control compilation and installation of the executable. However, as a
|
||||
special exception, the source code distributed need not include
|
||||
anything that is normally distributed (in either source or binary
|
||||
form) with the major components (compiler, kernel, and so on) of the
|
||||
operating system on which the executable runs, unless that component
|
||||
itself accompanies the executable.
|
||||
|
||||
If distribution of executable or object code is made by offering
|
||||
access to copy from a designated place, then offering equivalent
|
||||
access to copy the source code from the same place counts as
|
||||
distribution of the source code, even though third parties are not
|
||||
compelled to copy the source along with the object code.
|
||||
|
||||
4. You may not copy, modify, sublicense, or distribute the Program
|
||||
except as expressly provided under this License. Any attempt
|
||||
otherwise to copy, modify, sublicense or distribute the Program is
|
||||
void, and will automatically terminate your rights under this License.
|
||||
However, parties who have received copies, or rights, from you under
|
||||
this License will not have their licenses terminated so long as such
|
||||
parties remain in full compliance.
|
||||
|
||||
5. You are not required to accept this License, since you have not
|
||||
signed it. However, nothing else grants you permission to modify or
|
||||
distribute the Program or its derivative works. These actions are
|
||||
prohibited by law if you do not accept this License. Therefore, by
|
||||
modifying or distributing the Program (or any work based on the
|
||||
Program), you indicate your acceptance of this License to do so, and
|
||||
all its terms and conditions for copying, distributing or modifying
|
||||
the Program or works based on it.
|
||||
|
||||
6. Each time you redistribute the Program (or any work based on the
|
||||
Program), the recipient automatically receives a license from the
|
||||
original licensor to copy, distribute or modify the Program subject to
|
||||
these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties to
|
||||
this License.
|
||||
|
||||
7. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot
|
||||
distribute so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you
|
||||
may not distribute the Program at all. For example, if a patent
|
||||
license would not permit royalty-free redistribution of the Program by
|
||||
all those who receive copies directly or indirectly through you, then
|
||||
the only way you could satisfy both it and this License would be to
|
||||
refrain entirely from distribution of the Program.
|
||||
|
||||
If any portion of this section is held invalid or unenforceable under
|
||||
any particular circumstance, the balance of the section is intended to
|
||||
apply and the section as a whole is intended to apply in other
|
||||
circumstances.
|
||||
|
||||
It is not the purpose of this section to induce you to infringe any
|
||||
patents or other property right claims or to contest validity of any
|
||||
such claims; this section has the sole purpose of protecting the
|
||||
integrity of the free software distribution system, which is
|
||||
implemented by public license practices. Many people have made
|
||||
generous contributions to the wide range of software distributed
|
||||
through that system in reliance on consistent application of that
|
||||
system; it is up to the author/donor to decide if he or she is willing
|
||||
to distribute software through any other system and a licensee cannot
|
||||
impose that choice.
|
||||
|
||||
This section is intended to make thoroughly clear what is believed to
|
||||
be a consequence of the rest of this License.
|
||||
|
||||
8. If the distribution and/or use of the Program is restricted in
|
||||
certain countries either by patents or by copyrighted interfaces, the
|
||||
original copyright holder who places the Program under this License
|
||||
may add an explicit geographical distribution limitation excluding
|
||||
those countries, so that distribution is permitted only in or among
|
||||
countries not thus excluded. In such case, this License incorporates
|
||||
the limitation as if written in the body of this License.
|
||||
|
||||
9. The Free Software Foundation may publish revised and/or new versions
|
||||
of the General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the Program
|
||||
specifies a version number of this License which applies to it and "any
|
||||
later version", you have the option of following the terms and conditions
|
||||
either of that version or of any later version published by the Free
|
||||
Software Foundation. If the Program does not specify a version number of
|
||||
this License, you may choose any version ever published by the Free Software
|
||||
Foundation.
|
||||
|
||||
10. If you wish to incorporate parts of the Program into other free
|
||||
programs whose distribution conditions are different, write to the author
|
||||
to ask for permission. For software which is copyrighted by the Free
|
||||
Software Foundation, write to the Free Software Foundation; we sometimes
|
||||
make exceptions for this. Our decision will be guided by the two goals
|
||||
of preserving the free status of all derivatives of our free software and
|
||||
of promoting the sharing and reuse of software generally.
|
||||
|
||||
NO WARRANTY
|
||||
|
||||
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||||
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||||
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||||
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
|
||||
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
|
||||
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
|
||||
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
|
||||
REPAIR OR CORRECTION.
|
||||
|
||||
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||||
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
|
||||
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
|
||||
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
|
||||
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
|
||||
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
||||
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
2
src/python/pychm/MANIFEST.in
Normal file
2
src/python/pychm/MANIFEST.in
Normal file
@ -0,0 +1,2 @@
|
||||
include COPYING
|
||||
include chm/swig_chm.i
|
||||
11
src/python/pychm/README-RECOLL.txt
Normal file
11
src/python/pychm/README-RECOLL.txt
Normal file
@ -0,0 +1,11 @@
|
||||
May 2018:
|
||||
|
||||
pychm has no python3 version. The pull request I submitted for the port is
|
||||
sitting there, and so is the Debian bug.
|
||||
|
||||
https://github.com/dottedmag/pychm/pull/5
|
||||
|
||||
Which is why Recoll bundles pychm, enhanced for Python3, for now. The
|
||||
source repo is here:
|
||||
|
||||
https://github.com/medoc92/pychm
|
||||
15
src/python/pychm/pychm.egg-info/PKG-INFO
Normal file
15
src/python/pychm/pychm.egg-info/PKG-INFO
Normal file
@ -0,0 +1,15 @@
|
||||
Metadata-Version: 1.0
|
||||
Name: pychm
|
||||
Version: 0.8.4.1+git
|
||||
Summary: Python package to handle CHM files
|
||||
Home-page: https://github.com/dottedmag/pychm
|
||||
Author: Mikhail Gusarov
|
||||
Author-email: dottedmag@dottedmag.net
|
||||
License: GPL
|
||||
Description:
|
||||
The chm package provides three modules, chm, chmlib and extra, which provide
|
||||
access to the API implemented by the C library chmlib and some additional
|
||||
classes and functions. They are used to access MS-ITSS encoded files -
|
||||
Compressed Html Help files (.chm).
|
||||
|
||||
Platform: UNKNOWN
|
||||
10
src/python/pychm/pychm.egg-info/SOURCES.txt
Normal file
10
src/python/pychm/pychm.egg-info/SOURCES.txt
Normal file
@ -0,0 +1,10 @@
|
||||
setup.py
|
||||
/home/dockes/projets/fulltext/recoll/src/python/pychm/../../python/pychm/chm/__init__.py
|
||||
/home/dockes/projets/fulltext/recoll/src/python/pychm/../../python/pychm/chm/chm.py
|
||||
/home/dockes/projets/fulltext/recoll/src/python/pychm/../../python/pychm/chm/chmlib.py
|
||||
/home/dockes/projets/fulltext/recoll/src/python/pychm/../../python/pychm/pychm.egg-info/PKG-INFO
|
||||
/home/dockes/projets/fulltext/recoll/src/python/pychm/../../python/pychm/pychm.egg-info/SOURCES.txt
|
||||
/home/dockes/projets/fulltext/recoll/src/python/pychm/../../python/pychm/pychm.egg-info/dependency_links.txt
|
||||
/home/dockes/projets/fulltext/recoll/src/python/pychm/../../python/pychm/pychm.egg-info/top_level.txt
|
||||
/home/dockes/projets/fulltext/recoll/src/python/pychm/chm/extra.c
|
||||
/home/dockes/projets/fulltext/recoll/src/python/pychm/chm/swig_chm.c
|
||||
1
src/python/pychm/pychm.egg-info/dependency_links.txt
Normal file
1
src/python/pychm/pychm.egg-info/dependency_links.txt
Normal file
@ -0,0 +1 @@
|
||||
|
||||
1
src/python/pychm/pychm.egg-info/top_level.txt
Normal file
1
src/python/pychm/pychm.egg-info/top_level.txt
Normal file
@ -0,0 +1 @@
|
||||
chm
|
||||
32
src/python/pychm/recollchm/__init__.py
Normal file
32
src/python/pychm/recollchm/__init__.py
Normal file
@ -0,0 +1,32 @@
|
||||
# Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
|
||||
#
|
||||
# pychm is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License as
|
||||
# published by the Free Software Foundation; either version 2 of the
|
||||
# License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; see the file COPYING. If not,
|
||||
# write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
# Boston, MA 02111-1307, USA
|
||||
#
|
||||
|
||||
'''
|
||||
chm - A package to manipulate CHM files
|
||||
|
||||
The chm package provides four modules: chm, chmlib, extra and
|
||||
_chmlib. _chmlib and chmlib are very low level libraries generated
|
||||
from SWIG interface files, and are simple wrappers around the API
|
||||
defined by the C library chmlib.
|
||||
The extra module adds full-text search support.
|
||||
The chm module provides some higher-level classes to simplify
|
||||
access to the CHM files information.
|
||||
'''
|
||||
__all__ = ["chm", "chmlib", "_chmlib", "extra"]
|
||||
__version__ = "0.8.4.1+git"
|
||||
__revision__ = "$Id$"
|
||||
502
src/python/pychm/recollchm/chm.py
Normal file
502
src/python/pychm/recollchm/chm.py
Normal file
@ -0,0 +1,502 @@
|
||||
# Copyright (C) 2003-2006 Rubens Ramos <rubensr@users.sourceforge.net>
|
||||
#
|
||||
# Based on code by:
|
||||
# Copyright (C) 2003 Razvan Cojocaru <razvanco@gmx.net>
|
||||
#
|
||||
# pychm is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License as
|
||||
# published by the Free Software Foundation; either version 2 of the
|
||||
# License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; see the file COPYING. If not,
|
||||
# write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
# Boston, MA 02111-1307, USA
|
||||
|
||||
'''
|
||||
chm - A high-level front end for the chmlib python module.
|
||||
|
||||
The chm module provides high level access to the functionality
|
||||
included in chmlib. It encapsulates functions in the CHMFile class, and
|
||||
provides some additional features, such as the ability to obtain
|
||||
the contents tree of a CHM archive.
|
||||
|
||||
'''
|
||||
|
||||
from . import chmlib
|
||||
from . import extra
|
||||
import array
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
# Maps the numeric character-set code that CHMFile.GetEncoding() extracts
# from the archive's encoding string to the name of a Python codec.
# The trailing comments give the Windows *_CHARSET constant names.
# A value of None means no codec is assigned here, so GetEncoding()
# returns None for that character set.
charset_table = {
    0: 'iso8859_1', # ANSI_CHARSET
    238: 'iso8859_2', # EASTEUROPE_CHARSET
    178: 'iso8859_6', # ARABIC_CHARSET
    161: 'iso8859_7', # GREEK_CHARSET
    177: 'iso8859_8', # HEBREW_CHARSET
    162: 'iso8859_9', # TURKISH_CHARSET
    222: 'iso8859_11', # THAI_CHARSET - hmm not in python 2.2...
    186: 'iso8859_13', # BALTIC_CHARSET
    204: 'cp1251', # RUSSIAN_CHARSET
    255: 'cp437', # OEM_CHARSET
    128: 'cp932', # SHIFTJIS_CHARSET
    134: 'cp936', # GB2312_CHARSET
    129: 'cp949', # HANGUL_CHARSET
    136: 'cp950', # CHINESEBIG5_CHARSET
    1: None, # DEFAULT_CHARSET
    2: None, # SYMBOL_CHARSET
    130: None, # JOHAB_CHARSET
    163: None, # VIETNAMESE_CHARSET
    77: None, # MAC_CHARSET
}
|
||||
|
||||
# Maps a locale identifier (the value stored in CHMFile.lcid and looked up
# by CHMFile.GetLCID()) to a 3-tuple:
#   (Python codec name or None, locale name, script/region family).
# A codec of None means no codec is assigned for that locale in this table.
locale_table = {
    0x0436: ('iso8859_1', "Afrikaans", "Western Europe & US"),
    0x041c: ('iso8859_2', "Albanian", "Central Europe"),
    0x0401: ('iso8859_6', "Arabic_Saudi_Arabia", "Arabic"),
    0x0801: ('iso8859_6', "Arabic_Iraq", "Arabic"),
    0x0c01: ('iso8859_6', "Arabic_Egypt", "Arabic"),
    0x1001: ('iso8859_6', "Arabic_Libya", "Arabic"),
    0x1401: ('iso8859_6', "Arabic_Algeria", "Arabic"),
    0x1801: ('iso8859_6', "Arabic_Morocco", "Arabic"),
    0x1c01: ('iso8859_6', "Arabic_Tunisia", "Arabic"),
    0x2001: ('iso8859_6', "Arabic_Oman", "Arabic"),
    0x2401: ('iso8859_6', "Arabic_Yemen", "Arabic"),
    0x2801: ('iso8859_6', "Arabic_Syria", "Arabic"),
    0x2c01: ('iso8859_6', "Arabic_Jordan", "Arabic"),
    0x3001: ('iso8859_6', "Arabic_Lebanon", "Arabic"),
    0x3401: ('iso8859_6', "Arabic_Kuwait", "Arabic"),
    0x3801: ('iso8859_6', "Arabic_UAE", "Arabic"),
    0x3c01: ('iso8859_6', "Arabic_Bahrain", "Arabic"),
    0x4001: ('iso8859_6', "Arabic_Qatar", "Arabic"),
    0x042b: (None, "Armenian", "Armenian"),
    0x042c: ('iso8859_9', "Azeri_Latin", "Turkish"),
    0x082c: ('cp1251', "Azeri_Cyrillic", "Cyrillic"),
    0x042d: ('iso8859_1', "Basque", "Western Europe & US"),
    0x0423: ('cp1251', "Belarusian", "Cyrillic"),
    0x0402: ('cp1251', "Bulgarian", "Cyrillic"),
    0x0403: ('iso8859_1', "Catalan", "Western Europe & US"),
    0x0404: ('cp950', "Chinese_Taiwan", "Traditional Chinese"),
    0x0804: ('cp936', "Chinese_PRC", "Simplified Chinese"),
    0x0c04: ('cp950', "Chinese_Hong_Kong", "Traditional Chinese"),
    0x1004: ('cp936', "Chinese_Singapore", "Simplified Chinese"),
    0x1404: ('cp950', "Chinese_Macau", "Traditional Chinese"),
    0x041a: ('iso8859_2', "Croatian", "Central Europe"),
    0x0405: ('iso8859_2', "Czech", "Central Europe"),
    0x0406: ('iso8859_1', "Danish", "Western Europe & US"),
    0x0413: ('iso8859_1', "Dutch_Standard", "Western Europe & US"),
    0x0813: ('iso8859_1', "Dutch_Belgian", "Western Europe & US"),
    0x0409: ('iso8859_1', "English_United_States", "Western Europe & US"),
    0x0809: ('iso8859_1', "English_United_Kingdom", "Western Europe & US"),
    0x0c09: ('iso8859_1', "English_Australian", "Western Europe & US"),
    0x1009: ('iso8859_1', "English_Canadian", "Western Europe & US"),
    0x1409: ('iso8859_1', "English_New_Zealand", "Western Europe & US"),
    0x1809: ('iso8859_1', "English_Irish", "Western Europe & US"),
    0x1c09: ('iso8859_1', "English_South_Africa", "Western Europe & US"),
    0x2009: ('iso8859_1', "English_Jamaica", "Western Europe & US"),
    0x2409: ('iso8859_1', "English_Caribbean", "Western Europe & US"),
    0x2809: ('iso8859_1', "English_Belize", "Western Europe & US"),
    0x2c09: ('iso8859_1', "English_Trinidad", "Western Europe & US"),
    0x3009: ('iso8859_1', "English_Zimbabwe", "Western Europe & US"),
    0x3409: ('iso8859_1', "English_Philippines", "Western Europe & US"),
    0x0425: ('iso8859_13', "Estonian", "Baltic",),
    0x0438: ('iso8859_1', "Faeroese", "Western Europe & US"),
    0x0429: ('iso8859_6', "Farsi", "Arabic"),
    0x040b: ('iso8859_1', "Finnish", "Western Europe & US"),
    0x040c: ('iso8859_1', "French_Standard", "Western Europe & US"),
    0x080c: ('iso8859_1', "French_Belgian", "Western Europe & US"),
    0x0c0c: ('iso8859_1', "French_Canadian", "Western Europe & US"),
    0x100c: ('iso8859_1', "French_Swiss", "Western Europe & US"),
    0x140c: ('iso8859_1', "French_Luxembourg", "Western Europe & US"),
    0x180c: ('iso8859_1', "French_Monaco", "Western Europe & US"),
    0x0437: (None, "Georgian", "Georgian"),
    0x0407: ('iso8859_1', "German_Standard", "Western Europe & US"),
    0x0807: ('iso8859_1', "German_Swiss", "Western Europe & US"),
    0x0c07: ('iso8859_1', "German_Austrian", "Western Europe & US"),
    0x1007: ('iso8859_1', "German_Luxembourg", "Western Europe & US"),
    0x1407: ('iso8859_1', "German_Liechtenstein", "Western Europe & US"),
    0x0408: ('iso8859_7', "Greek", "Greek"),
    0x040d: ('iso8859_8', "Hebrew", "Hebrew"),
    0x0439: (None, "Hindi", "Indic"),
    0x040e: ('iso8859_2', "Hungarian", "Central Europe"),
    0x040f: ('iso8859_1', "Icelandic", "Western Europe & US"),
    0x0421: ('iso8859_1', "Indonesian", "Western Europe & US"),
    0x0410: ('iso8859_1', "Italian_Standard", "Western Europe & US"),
    0x0810: ('iso8859_1', "Italian_Swiss", "Western Europe & US"),
    0x0411: ('cp932', "Japanese", "Japanese"),
    0x043f: ('cp1251', "Kazakh", "Cyrillic"),
    0x0457: (None, "Konkani", "Indic"),
    0x0412: ('cp949', "Korean", "Korean"),
    0x0426: ('iso8859_13', "Latvian", "Baltic",),
    0x0427: ('iso8859_13', "Lithuanian", "Baltic",),
    0x042f: ('cp1251', "Macedonian", "Cyrillic"),
    0x043e: ('iso8859_1', "Malay_Malaysia", "Western Europe & US"),
    0x083e: ('iso8859_1', "Malay_Brunei_Darussalam", "Western Europe & US"),
    0x044e: (None, "Marathi", "Indic"),
    0x0414: ('iso8859_1', "Norwegian_Bokmal", "Western Europe & US"),
    0x0814: ('iso8859_1', "Norwegian_Nynorsk", "Western Europe & US"),
    0x0415: ('iso8859_2', "Polish", "Central Europe"),
    0x0416: ('iso8859_1', "Portuguese_Brazilian", "Western Europe & US"),
    0x0816: ('iso8859_1', "Portuguese_Standard", "Western Europe & US"),
    0x0418: ('iso8859_2', "Romanian", "Central Europe"),
    0x0419: ('cp1251', "Russian", "Cyrillic"),
    0x044f: (None, "Sanskrit", "Indic"),
    0x081a: ('iso8859_2', "Serbian_Latin", "Central Europe"),
    0x0c1a: ('cp1251', "Serbian_Cyrillic", "Cyrillic"),
    0x041b: ('iso8859_2', "Slovak", "Central Europe"),
    0x0424: ('iso8859_2', "Slovenian", "Central Europe"),
    0x040a: ('iso8859_1', "Spanish_Trad_Sort", "Western Europe & US"),
    0x080a: ('iso8859_1', "Spanish_Mexican", "Western Europe & US"),
    0x0c0a: ('iso8859_1', "Spanish_Modern_Sort", "Western Europe & US"),
    0x100a: ('iso8859_1', "Spanish_Guatemala", "Western Europe & US"),
    0x140a: ('iso8859_1', "Spanish_Costa_Rica", "Western Europe & US"),
    0x180a: ('iso8859_1', "Spanish_Panama", "Western Europe & US"),
    0x1c0a: ('iso8859_1', "Spanish_Dominican_Repub", "Western Europe & US"),
    0x200a: ('iso8859_1', "Spanish_Venezuela", "Western Europe & US"),
    0x240a: ('iso8859_1', "Spanish_Colombia", "Western Europe & US"),
    0x280a: ('iso8859_1', "Spanish_Peru", "Western Europe & US"),
    0x2c0a: ('iso8859_1', "Spanish_Argentina", "Western Europe & US"),
    0x300a: ('iso8859_1', "Spanish_Ecuador", "Western Europe & US"),
    0x340a: ('iso8859_1', "Spanish_Chile", "Western Europe & US"),
    0x380a: ('iso8859_1', "Spanish_Uruguay", "Western Europe & US"),
    0x3c0a: ('iso8859_1', "Spanish_Paraguay", "Western Europe & US"),
    0x400a: ('iso8859_1', "Spanish_Bolivia", "Western Europe & US"),
    0x440a: ('iso8859_1', "Spanish_El_Salvador", "Western Europe & US"),
    0x480a: ('iso8859_1', "Spanish_Honduras", "Western Europe & US"),
    0x4c0a: ('iso8859_1', "Spanish_Nicaragua", "Western Europe & US"),
    0x500a: ('iso8859_1', "Spanish_Puerto_Rico", "Western Europe & US"),
    0x0441: ('iso8859_1', "Swahili", "Western Europe & US"),
    0x041d: ('iso8859_1', "Swedish", "Western Europe & US"),
    0x081d: ('iso8859_1', "Swedish_Finland", "Western Europe & US"),
    0x0449: (None, "Tamil", "Indic"),
    0x0444: ('cp1251', "Tatar", "Cyrillic"),
    0x041e: ('iso8859_11', "Thai", "Thai"),
    0x041f: ('iso8859_9', "Turkish", "Turkish"),
    0x0422: ('cp1251', "Ukrainian", "Cyrillic"),
    0x0420: ('iso8859_6', "Urdu", "Arabic"),
    0x0443: ('iso8859_9', "Uzbek_Latin", "Turkish"),
    0x0843: ('cp1251', "Uzbek_Cyrillic", "Cyrillic"),
    0x042a: (None, "Vietnamese", "Vietnamese")
}
|
||||
|
||||
|
||||
class CHMFile:
    """Manage access to a CHM archive through the chmlib bindings.

    Typical use: create an instance, call LoadCHM(), read the attributes
    filled in by GetArchiveInfo() (title, home, topics, index, encoding,
    lcid) and fetch documents with ResolveObject()/RetrieveObject().
    Paths and texts read from the archive are kept as byte strings.
    """
    filename = ""
    file = None
    title = ""
    # NOTE(review): the default is a text string while values read from the
    # archive are bytes (b'/...'); kept as is for backward compatibility.
    home = "/"
    index = None
    topics = None
    encoding = None
    lcid = None
    binaryindex = None

    def __init__(self):
        self.searchable = 0

    def LoadCHM(self, archiveName):
        '''Loads a CHM archive.
        This function will also call GetArchiveInfo to obtain information
        such as the index file name and the topics file. It returns 1 on
        success, and 0 if it fails.
        '''
        # Always start from a clean state; CloseCHM() copes with the case
        # where no archive is currently open.
        self.CloseCHM()

        self.file = chmlib.chm_open(archiveName)
        if self.file is None:
            return 0

        self.filename = archiveName
        self.GetArchiveInfo()

        return 1

    def CloseCHM(self):
        '''Closes the CHM archive.
        This function will close the CHM file, if it is open. All variables
        are also reset.
        '''
        # Only hand a real handle to chmlib; the former test on
        # self.filename was always true and passed None to chm_close().
        if self.file is not None:
            chmlib.chm_close(self.file)
        self.file = None
        self.filename = ''
        self.title = ""
        self.home = "/"
        self.index = None
        self.topics = None
        self.encoding = None
        # Do not leave data from the previous archive behind.
        self.lcid = None
        self.searchable = 0

    @staticmethod
    def _get_word(buff, idx):
        '''Internal method.
        Reads a little-endian 16-bit value from a sequence of byte values.
        '''
        return buff[idx] + (buff[idx + 1] * 256)

    def _guess_toc_and_index(self, base):
        '''Internal method.
        If no topics file is known yet, probes the archive for
        /<base>.hhc and /<base>.hhk and records the ones that exist as
        topics and (when not already set) index.
        '''
        if self.topics:
            return
        toc_path = b'/' + base + b'.hhc'
        idx_path = b'/' + base + b'.hhk'
        res_toc, _ = chmlib.chm_resolve_object(self.file, toc_path)
        res_idx, _ = chmlib.chm_resolve_object(self.file, idx_path)
        if res_toc == chmlib.CHM_RESOLVE_SUCCESS:
            self.topics = toc_path
        if not self.index and res_idx == chmlib.CHM_RESOLVE_SUCCESS:
            self.index = idx_path

    def GetArchiveInfo(self):
        '''Obtains information on CHM archive.
        This function checks the /#SYSTEM file inside the CHM archive to
        obtain the index, home page, topics, encoding and title. It is called
        from LoadCHM.
        Returns 1 on success and 0 if /#SYSTEM is missing or empty.
        '''
        self.searchable = extra.is_searchable(self.file)
        self.lcid = None

        result, ui = chmlib.chm_resolve_object(self.file, b'/#SYSTEM')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            sys.stderr.write('GetArchiveInfo: #SYSTEM does not exist\n')
            return 0

        # The first 4 bytes of #SYSTEM are skipped; records follow.
        size, text = chmlib.chm_retrieve_object(self.file, ui, 4, ui.length)
        if size == 0:
            sys.stderr.write('GetArchiveInfo: file size = 0\n')
            return 0

        buff = array.array('B', text)
        # Never index past what was actually read.
        limit = min(size, len(buff))

        # Each record is: 16-bit code, 16-bit data length, data.
        index = 0
        while index + 4 <= limit:
            code = self._get_word(buff, index)
            length = self._get_word(buff, index + 2)
            index += 4
            # String payloads: the last byte of the record is dropped
            # (presumably the NUL terminator).
            data = text[index:index + length - 1]

            if code == 0:
                self.topics = b'/' + data
            elif code == 1:
                self.index = b'/' + data
            elif code == 2:
                self.home = b'/' + data
            elif code == 3:
                self.title = data
            elif code == 4:
                # Only the low 16 bits of the record are used as LCID.
                if index + 2 <= limit:
                    self.lcid = self._get_word(buff, index)
            elif code == 6:
                self._guess_toc_and_index(data)
            elif code == 16:
                self.encoding = data
            # Unknown codes are skipped.
            index += length

        self.GetWindowsInfo()

        if not self.lcid:
            self.lcid = extra.get_lcid(self.file)

        return 1

    def _read_member(self, path, caller):
        '''Internal method.
        Returns the full contents of the archive member named path, or
        None if path is empty, cannot be resolved, or is empty. caller is
        the name used in the diagnostic message.
        '''
        if not path:
            return None

        res, ui = chmlib.chm_resolve_object(self.file, path)
        if res != chmlib.CHM_RESOLVE_SUCCESS:
            return None

        size, text = chmlib.chm_retrieve_object(self.file, ui, 0, ui.length)
        if size == 0:
            sys.stderr.write('%s: file size = 0\n' % caller)
            return None
        return text

    def GetTopicsTree(self):
        '''Reads and returns the topics tree.
        This auxiliary function reads and returns the topics tree file
        contents for the CHM archive.
        '''
        return self._read_member(self.topics, 'GetTopicsTree')

    def GetIndex(self):
        '''Reads and returns the index tree.
        This auxiliary function reads and returns the index tree file
        contents for the CHM archive.
        '''
        return self._read_member(self.index, 'GetIndex')

    def ResolveObject(self, document):
        '''Tries to locate a document in the archive.
        This function tries to locate the document inside the archive. It
        returns a tuple where the first element is zero if the function
        was successful, and the second is the UnitInfo for that document.
        The UnitInfo is used to retrieve the document contents
        '''
        if self.file:
            # The path is used verbatim: names inside the archive are
            # independent of the host OS, so no os.path processing.
            return chmlib.chm_resolve_object(self.file, document)
        else:
            return (1, None)

    def RetrieveObject(self, ui, start=-1, length=-1):
        '''Retrieves the contents of a document.
        This function takes a UnitInfo and two optional arguments, the first
        being the start address and the second is the length. These define
        the amount of data to be read from the archive.
        Returns the (size, data) tuple from chmlib, or (0, '') when no
        archive is open or ui is empty.
        '''
        if self.file and ui:
            count = ui.length if length == -1 else length
            # int() instead of long(): long does not exist on Python 3 and
            # int is promoted automatically on Python 2.
            offset = 0 if start == -1 else int(start)
            return chmlib.chm_retrieve_object(self.file, ui, offset, count)
        else:
            return (0, '')

    def Search(self, text, wholewords=0, titleonly=0):
        '''Performs full-text search on the archive.
        The first parameter is the word to look for, the second
        indicates if the search should be for whole words only, and
        the third parameter indicates if the search should be
        restricted to page titles.
        This method will return a tuple, the first item
        indicating if the search results were partial, and the second
        item being a dictionary containing the results.
        None is returned if text is empty or no archive is open.'''
        if text and self.file:
            return extra.search(self.file, text, wholewords, titleonly)
        else:
            return None

    def IsSearchable(self):
        '''Indicates if the full-text search is available for this
        archive - this flag is updated when GetArchiveInfo is called'''
        return self.searchable

    def GetEncoding(self):
        '''Returns a string that can be used with the codecs python package
        to encode or decode the files in the chm archive. If an error is
        found, or if it is not possible to find the encoding, None is
        returned.'''
        if self.encoding:
            vals = self.encoding.split(b',')
            if len(vals) > 2:
                try:
                    code = int(vals[2])
                except ValueError:
                    # Non-numeric charset field: treat as "unknown".
                    return None
                return charset_table.get(code)
        return None

    def GetLCID(self):
        '''Returns the archive Locale ID'''
        if self.lcid in locale_table:
            return locale_table[self.lcid]
        else:
            return None

    def GetDWORD(self, buff, idx=0):
        '''Internal method.
        Reads a double word (4 bytes) from a buffer.
        The value 0xFFFFFFFF is mapped to 0.
        '''
        result = buff[idx] + (buff[idx+1] << 8) + (buff[idx+2] << 16) + \
            (buff[idx+3] << 24)

        if result == 0xFFFFFFFF:
            result = 0

        return result

    def GetString(self, text, idx):
        '''Internal method.
        Retrieves a string from the #STRINGS buffer.
        The string runs from idx up to the next NUL byte, or to the end
        of the buffer if there is none.
        '''
        end = text.find(b'\x00', idx)
        if end == -1:
            # No terminator: do not silently drop the last byte.
            return text[idx:]
        return text[idx:end]

    def GetWindowsInfo(self):
        '''Gets information from the #WINDOWS file.
        Checks the #WINDOWS file to see if it has any info that was
        not found in #SYSTEM (topics, index or default page).
        Returns a negative code when the information cannot be read,
        None otherwise.
        '''
        result, ui = chmlib.chm_resolve_object(self.file, b'/#WINDOWS')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return -1

        size, text = chmlib.chm_retrieve_object(self.file, ui, 0, 8)
        if size < 8:
            return -2

        buff = array.array('B', text)
        num_entries = self.GetDWORD(buff, 0)
        entry_size = self.GetDWORD(buff, 4)

        if num_entries < 1:
            return -3

        size, text = chmlib.chm_retrieve_object(self.file, ui, 8, entry_size)
        if size < entry_size:
            return -4

        buff = array.array('B', text)
        # The fields read below end at offset 0x6c; a shorter entry is
        # treated like a short read instead of raising IndexError.
        if len(buff) < 0x6c:
            return -4
        toc_index = self.GetDWORD(buff, 0x60)
        idx_index = self.GetDWORD(buff, 0x64)
        dft_index = self.GetDWORD(buff, 0x68)

        result, ui = chmlib.chm_resolve_object(self.file, b'/#STRINGS')
        if result != chmlib.CHM_RESOLVE_SUCCESS:
            return -5

        size, text = chmlib.chm_retrieve_object(self.file, ui, 0, ui.length)
        if size == 0:
            return -6

        if not self.topics:
            self.topics = self.GetString(text, toc_index)
            if not self.topics.startswith(b"/"):
                self.topics = b"/" + self.topics

        if not self.index:
            self.index = self.GetString(text, idx_index)
            if not self.index.startswith(b"/"):
                self.index = b"/" + self.index

        if dft_index != 0:
            self.home = self.GetString(text, dft_index)
            if not self.home.startswith(b"/"):
                self.home = b"/" + self.home
|
||||
180
src/python/pychm/recollchm/chmlib.py
Normal file
180
src/python/pychm/recollchm/chmlib.py
Normal file
@ -0,0 +1,180 @@
|
||||
# This file was automatically generated by SWIG (http://www.swig.org).
|
||||
# Version 3.0.10
|
||||
#
|
||||
# Do not make changes to this file unless you know what you are doing--modify
|
||||
# the SWIG interface file instead.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Bootstrap: locate and import the compiled extension module _chmlib.
# The strategy depends on the running Python version.
from sys import version_info as _swig_python_version_info
if _swig_python_version_info >= (2, 7, 0):
    def swig_import_helper():
        import importlib
        # Try the extension inside this module's package first
        # ("<package>._chmlib"), then fall back to a top-level _chmlib.
        pkg = __name__.rpartition('.')[0]
        mname = '.'.join((pkg, '_chmlib')).lstrip('.')
        try:
            return importlib.import_module(mname)
        except ImportError:
            return importlib.import_module('_chmlib')
    _chmlib = swig_import_helper()
    del swig_import_helper
elif _swig_python_version_info >= (2, 6, 0):
    def swig_import_helper():
        from os.path import dirname
        import imp
        fp = None
        # Look for _chmlib next to this file; if it is not there, fall
        # back to a regular import.
        try:
            fp, pathname, description = imp.find_module('_chmlib', [dirname(__file__)])
        except ImportError:
            import _chmlib
            return _chmlib
        if fp is not None:
            try:
                _mod = imp.load_module('_chmlib', fp, pathname, description)
            finally:
                # Close the file handle opened by imp.find_module.
                fp.close()
            return _mod
    _chmlib = swig_import_helper()
    del swig_import_helper
else:
    import _chmlib
del _swig_python_version_info
try:
    _swig_property = property
except NameError:
    pass  # Python < 2.2 doesn't have 'property'.

# Make the builtins module available under one name on Python 2 and 3.
try:
    import builtins as __builtin__
except ImportError:
    import __builtin__
|
||||
|
||||
# Attribute setter shared by the proxy classes.
#   "thisown"  -> forwarded to self.this.own(value)
#   "this"     -> stored directly when value is a SwigPyObject
#   other name -> dispatched to the setter registered in
#                 class_type.__swig_setmethods__, if any
# With static false, unknown names are stored on the instance; otherwise
# AttributeError is raised.
def _swig_setattr_nondynamic(self, class_type, name, value, static=1):
    if (name == "thisown"):
        return self.this.own(value)
    if (name == "this"):
        if type(value).__name__ == 'SwigPyObject':
            self.__dict__[name] = value
            return
    method = class_type.__swig_setmethods__.get(name, None)
    if method:
        return method(self, value)
    if (not static):
        if _newclass:
            object.__setattr__(self, name, value)
        else:
            self.__dict__[name] = value
    else:
        raise AttributeError("You cannot add attributes to %s" % self)
|
||||
|
||||
|
||||
# Dynamic variant of _swig_setattr_nondynamic: passes static=0, so names
# without a registered setter are stored on the instance.
def _swig_setattr(self, class_type, name, value):
    return _swig_setattr_nondynamic(self, class_type, name, value, 0)
|
||||
|
||||
|
||||
# Attribute getter shared by the proxy classes: "thisown" is answered by
# self.this.own(); any other name is looked up in
# class_type.__swig_getmethods__ and AttributeError is raised if absent.
def _swig_getattr(self, class_type, name):
    if (name == "thisown"):
        return self.this.own()
    method = class_type.__swig_getmethods__.get(name, None)
    if method:
        return method(self)
    raise AttributeError("'%s' object has no attribute '%s'" % (class_type.__name__, name))
|
||||
|
||||
|
||||
# repr() implementation for proxy classes: "<module.Class; proxy of ...>".
# If self.this cannot be represented, the "proxy of" part is left empty.
def _swig_repr(self):
    try:
        strthis = "proxy of " + self.this.__repr__()
    except __builtin__.Exception:
        strthis = ""
    return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,)
|
||||
|
||||
# Choose the base class for the proxy classes: `object` when available
# (_newclass = 1), otherwise an empty classic class (_newclass = 0).
try:
    _object = object
    _newclass = 1
except __builtin__.Exception:
    class _object:
        pass
    _newclass = 0

# Constants re-exported from the extension module.
CHM_UNCOMPRESSED = _chmlib.CHM_UNCOMPRESSED
CHM_COMPRESSED = _chmlib.CHM_COMPRESSED
CHM_MAX_PATHLEN = _chmlib.CHM_MAX_PATHLEN
|
||||
# Proxy class for the chmUnitInfo object created by
# _chmlib.new_chmUnitInfo(). The attributes start, length, space and path
# are routed to the matching _chmlib getter/setter functions, both through
# the __swig_*methods__ tables and (for new-style classes) as properties.
class chmUnitInfo(_object):
    __swig_setmethods__ = {}
    __setattr__ = lambda self, name, value: _swig_setattr(self, chmUnitInfo, name, value)
    __swig_getmethods__ = {}
    __getattr__ = lambda self, name: _swig_getattr(self, chmUnitInfo, name)
    __repr__ = _swig_repr
    __swig_setmethods__["start"] = _chmlib.chmUnitInfo_start_set
    __swig_getmethods__["start"] = _chmlib.chmUnitInfo_start_get
    if _newclass:
        start = _swig_property(_chmlib.chmUnitInfo_start_get, _chmlib.chmUnitInfo_start_set)
    __swig_setmethods__["length"] = _chmlib.chmUnitInfo_length_set
    __swig_getmethods__["length"] = _chmlib.chmUnitInfo_length_get
    if _newclass:
        length = _swig_property(_chmlib.chmUnitInfo_length_get, _chmlib.chmUnitInfo_length_set)
    __swig_setmethods__["space"] = _chmlib.chmUnitInfo_space_set
    __swig_getmethods__["space"] = _chmlib.chmUnitInfo_space_get
    if _newclass:
        space = _swig_property(_chmlib.chmUnitInfo_space_get, _chmlib.chmUnitInfo_space_set)
    __swig_setmethods__["path"] = _chmlib.chmUnitInfo_path_set
    __swig_getmethods__["path"] = _chmlib.chmUnitInfo_path_get
    if _newclass:
        path = _swig_property(_chmlib.chmUnitInfo_path_get, _chmlib.chmUnitInfo_path_set)

    def __init__(self):
        this = _chmlib.new_chmUnitInfo()
        # Attach the new native object: append to an existing self.this
        # when possible, otherwise store it as self.this.
        try:
            self.this.append(this)
        except __builtin__.Exception:
            self.this = this
    __swig_destroy__ = _chmlib.delete_chmUnitInfo
    __del__ = lambda self: None
# Register the proxy class with the extension module.
chmUnitInfo_swigregister = _chmlib.chmUnitInfo_swigregister
chmUnitInfo_swigregister(chmUnitInfo)
|
||||
|
||||
|
||||
# Thin wrapper around _chmlib.chm_open. The name is rebound to the
# extension function right after the def, so the def only documents the
# signature.
def chm_open(filename):
    return _chmlib.chm_open(filename)
chm_open = _chmlib.chm_open
|
||||
|
||||
# Thin wrapper around _chmlib.chm_close; the name is rebound to the
# extension function right after the def.
def chm_close(h):
    return _chmlib.chm_close(h)
chm_close = _chmlib.chm_close
|
||||
# Parameter identifier re-exported from the extension module.
CHM_PARAM_MAX_BLOCKS_CACHED = _chmlib.CHM_PARAM_MAX_BLOCKS_CACHED

# Thin wrapper around _chmlib.chm_set_param; the name is rebound to the
# extension function right after the def.
def chm_set_param(h, paramType, paramVal):
    return _chmlib.chm_set_param(h, paramType, paramVal)
chm_set_param = _chmlib.chm_set_param
|
||||
# Result codes of chm_resolve_object, re-exported from the extension module.
CHM_RESOLVE_SUCCESS = _chmlib.CHM_RESOLVE_SUCCESS
CHM_RESOLVE_FAILURE = _chmlib.CHM_RESOLVE_FAILURE

# Thin wrapper around _chmlib.chm_resolve_object; the name is rebound to
# the extension function right after the def.
def chm_resolve_object(h, objPath):
    return _chmlib.chm_resolve_object(h, objPath)
chm_resolve_object = _chmlib.chm_resolve_object
|
||||
|
||||
# Thin wrapper around _chmlib.chm_retrieve_object; the name is rebound to
# the extension function right after the def.
def chm_retrieve_object(h, ui, addr, len):
    return _chmlib.chm_retrieve_object(h, ui, addr, len)
chm_retrieve_object = _chmlib.chm_retrieve_object
|
||||
# Enumeration flags and enumerator result codes, re-exported from the
# extension module.
CHM_ENUMERATE_NORMAL = _chmlib.CHM_ENUMERATE_NORMAL
CHM_ENUMERATE_META = _chmlib.CHM_ENUMERATE_META
CHM_ENUMERATE_SPECIAL = _chmlib.CHM_ENUMERATE_SPECIAL
CHM_ENUMERATE_FILES = _chmlib.CHM_ENUMERATE_FILES
CHM_ENUMERATE_DIRS = _chmlib.CHM_ENUMERATE_DIRS
CHM_ENUMERATE_ALL = _chmlib.CHM_ENUMERATE_ALL
CHM_ENUMERATOR_FAILURE = _chmlib.CHM_ENUMERATOR_FAILURE
CHM_ENUMERATOR_CONTINUE = _chmlib.CHM_ENUMERATOR_CONTINUE
CHM_ENUMERATOR_SUCCESS = _chmlib.CHM_ENUMERATOR_SUCCESS

# Thin wrapper around _chmlib.chm_enumerate; the name is rebound to the
# extension function right after the def.
def chm_enumerate(h, what, e, context):
    return _chmlib.chm_enumerate(h, what, e, context)
chm_enumerate = _chmlib.chm_enumerate
|
||||
|
||||
# Thin wrapper around _chmlib.chm_enumerate_dir; the name is rebound to
# the extension function right after the def.
def chm_enumerate_dir(h, prefix, what, e, context):
    return _chmlib.chm_enumerate_dir(h, prefix, what, e, context)
chm_enumerate_dir = _chmlib.chm_enumerate_dir
|
||||
# This file is compatible with both classic and new-style classes.
|
||||
|
||||
|
||||
803
src/python/pychm/recollchm/extra.c
Normal file
803
src/python/pychm/recollchm/extra.c
Normal file
@ -0,0 +1,803 @@
|
||||
/*
|
||||
* extra.c - full-text search support for pychm
|
||||
*
|
||||
* Copyright (C) 2004 Rubens Ramos <rubensr@users.sourceforge.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place, Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Author: Rubens Ramos <rubensr@users.sourceforge.net>
|
||||
*
|
||||
* Heavily based on work done by:
|
||||
* Pabs <pabs@zip.to> - chmdeco
|
||||
* Razvan Cojocaru <razvanco@gmx.net> - xCHM
|
||||
*
|
||||
*/
|
||||
|
||||
#include "chm_lib.h"
|
||||
#ifdef __PYTHON__
|
||||
#include "Python.h"
|
||||
#else
|
||||
#include <stdio.h>
|
||||
#define PyObject void
|
||||
#endif
|
||||
|
||||
/*
 * Local declaration of the SwigPyObject layout.
 * NOTE(review): presumably this mirrors the structure of the same name in
 * the SWIG Python runtime so that this file can reach the wrapped C
 * pointer (ptr) of a proxy object; verify against the SWIG version in use.
 * The dict member exists only when SWIGPYTHON_BUILTIN is defined.
 */
typedef struct {
  PyObject_HEAD
  void *ptr;
  void *ty;
  int own;
  PyObject *next;
#ifdef SWIGPYTHON_BUILTIN
  PyObject *dict;
#endif
} SwigPyObject;
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(_WIN32) || defined(__WIN32__)
|
||||
# if defined(_MSC_VER)
|
||||
# if defined(STATIC_LINKED)
|
||||
# define MODEXPORT(a) a
|
||||
# define MODIMPORT(a) extern a
|
||||
# else
|
||||
# define MODEXPORT(a) __declspec(dllexport) a
|
||||
# define MODIMPORT(a) extern a
|
||||
# endif
|
||||
#define uint64_t unsigned long long
|
||||
#define uint32_t unsigned int
|
||||
#define uint16_t unsigned short
|
||||
#define uint8_t unsigned char
|
||||
#define size_t int
|
||||
#define strcasecmp _stricmp
|
||||
#define strncasecmp _strnicmp
|
||||
# else
|
||||
# if defined(__BORLANDC__)
|
||||
# define MODEXPORT(a) a _export
|
||||
# define MODIMPORT(a) a _export
|
||||
# else
|
||||
# define MODEXPORT(a) a
|
||||
# define MODIMPORT(a) a
|
||||
# endif
|
||||
# endif
|
||||
#else
|
||||
# define MODEXPORT(a) a
|
||||
# define MODIMPORT(a) a
|
||||
#include <inttypes.h>
|
||||
#include <strings.h>
|
||||
#endif
|
||||
|
||||
#define false 0
|
||||
#define true 1
|
||||
|
||||
#define FTS_HEADER_LEN 0x32
|
||||
#define TOPICS_ENTRY_LEN 16
|
||||
#define COMMON_BUF_LEN 1025
|
||||
|
||||
/*
 * Free x and reset it to NULL. Wrapped in do { } while (0) so that the
 * macro expands to a single statement and is safe in an unbraced
 * if/else. The argument is evaluated twice and must be an lvalue.
 */
#define FREE(x) do { free (x); (x) = NULL; } while (0)
|
||||
|
||||
/* Decode the little-endian 16-bit value stored in b[0..1]. */
static uint16_t
get_uint16 (uint8_t* b) {
  uint16_t low = b[0];
  uint16_t high = b[1];

  return (uint16_t)(low | (high << 8));
}
|
||||
|
||||
/*
 * Decode the little-endian 32-bit value stored in b[0..3].
 * Each byte is widened to uint32_t before shifting: shifting the
 * promoted int left by 24 is undefined when b[3] >= 0x80.
 */
static uint32_t
get_uint32 (uint8_t* b) {
  return (uint32_t)b[0] |
    (uint32_t)b[1]<<8 |
    (uint32_t)b[2]<<16 |
    (uint32_t)b[3]<<24;
}
|
||||
|
||||
/*
 * Decode a variable-length integer: each byte contributes its low 7
 * bits, least significant group first, and a set high bit (0x80) means
 * another byte follows.
 *
 * buffer: start of the encoded value.
 * length: receives the number of bytes consumed.
 * Returns the decoded value. Groups that would land beyond bit 63 are
 * consumed but ignored.
 */
static uint64_t
be_encint (unsigned char *buffer, size_t *length)
{
  uint64_t result = 0;
  int shift=0;
  *length = 0;

  do {
    /* Widen before shifting: the shift used to be done in int, which
       corrupts (or is undefined for) values of 32 bits and more. */
    if (shift < 64)
      result |= (uint64_t)((*buffer) & 0x7f) << shift;
    shift += 7;
    *length = *length + 1;

  } while (*(buffer++) & 0x80);

  return result;
}
|
||||
|
||||
/*
  Finds the first unset bit in memory, scanning from bit *bit of *byte
  towards bit 0 and then on to bit 7 of the following byte.

  Returns the number of set bits found before the unset one. On return
  *bit is the position just after the unset bit and *length is the number
  of whole bytes the scan moved forward.

  NOTE: the function receives no buffer size, so it cannot detect the end
  of the buffer and never returns -1; the caller must make sure an unset
  bit exists.
*/
static int
ffus (unsigned char* byte, int* bit, size_t *length) {
  int bits = 0;
  *length = 0;

  /* Count consecutive set bits. */
  while(*byte & (1 << *bit)){
    if(*bit)
      --(*bit);
    else {
      ++byte;
      ++(*length);
      *bit = 7;
    }
    ++bits;
  }

  /* Step over the unset bit that ended the run. */
  if(*bit)
    --(*bit);
  else {
    ++(*length);
    *bit = 7;
  }

  return bits;
}
|
||||
|
||||
|
||||
/*
 * Decode one scale/root encoded integer from a bit stream.
 *
 * byte/bit: current position; *bit (7 = most significant) is updated.
 * s, r:     scale and root parameters; only s == 2 is supported.
 * length:   receives the number of whole bytes consumed.
 *
 * Returns the decoded value, or ~0 when bit is NULL, *bit > 7 or s != 2.
 */
static uint64_t
sr_int(unsigned char* byte, int* bit,
       unsigned char s, unsigned char r, size_t *length)
{
  uint64_t value = 0;
  size_t prefix_bytes;
  int prefix_ones, total_bits, remaining;

  *length = 0;

  if (bit == NULL || *bit > 7 || s != 2)
    return ~(uint64_t)0;

  /* Unary prefix: a run of set bits ended by one unset bit. */
  prefix_ones = ffus(byte, bit, &prefix_bytes);
  *length += prefix_bytes;
  byte += *length;

  total_bits = r + (prefix_ones ? prefix_ones - 1 : 0);

  /* Read total_bits payload bits, most significant first, one byte
     fragment per iteration. */
  for (remaining = total_bits; remaining > 0; ) {
    int crosses = remaining > *bit;   /* payload continues in next byte */
    int top = crosses ? *bit : remaining - 1;        /* fragment width - 1 */
    int low = crosses ? 0 : *bit - (remaining - 1);  /* lowest bit used */
    unsigned char mask;

    /* (top + 1) low bits set; 0xff for a full byte or any other value. */
    if (top >= 0 && top < 7)
      mask = (unsigned char)((1 << (top + 1)) - 1);
    else
      mask = 0xff;

    mask <<= low;
    value = (value << (top + 1)) |
      (uint64_t)((*byte & mask) >> low);

    if (crosses) {
      ++byte;
      ++(*length);
      remaining -= *bit + 1;
      *bit = 7;
    } else {
      *bit -= remaining;
      remaining = 0;
    }
  }

  /* A non-empty prefix contributes an implicit leading 1 bit. */
  if (prefix_ones)
    value |= (uint64_t)1 << total_bits;

  return value;
}
|
||||
|
||||
|
||||
/*
 * Walk the full-text-search index tree from the node at initial_offset
 * down to the leaf level and return the offset of the leaf node that may
 * contain text.
 *
 * chmfile, ui:    archive handle and unit info of the index object.
 * text:           word looked for (compared case-insensitively).
 * initial_offset: offset of the root node.
 * buff_size:      size of one tree node.
 * tree_depth:     number of levels; tree_depth - 1 index levels are read.
 *
 * Each index entry is: word_len (1 byte), pos (1 byte), word_len - 1
 * word bytes, a 32-bit child offset and 2 further bytes. Words are
 * prefix-compressed: the first pos bytes come from the previous word.
 *
 * Returns the leaf offset, or 0 on allocation failure, read failure,
 * malformed node data, or when the walk makes no progress.
 */
static uint32_t
get_leaf_node_offset(struct chmFile *chmfile,
                     const char *text,
                     uint32_t initial_offset,
                     uint32_t buff_size,
                     uint16_t tree_depth,
                     struct chmUnitInfo *ui)
{
  unsigned char word_len;
  unsigned char pos;
  uint16_t free_space;
  uint32_t used;
  size_t suffix_len;
  char *grown;
  char *word = NULL;
  uint32_t result = 0;
  uint32_t test_offset = 0;
  /* NOTE(review): i is deliberately not reset between levels, as in the
     original code - confirm this is intended. */
  uint32_t i = sizeof(uint16_t);
  unsigned char *buffer;

  /* A node starts with a 16-bit free-space field. */
  if (buff_size < sizeof(uint16_t))
    return 0;

  buffer = malloc (buff_size);
  if (NULL == buffer)
    return 0;

  while (--tree_depth) {
    /* No progress since the previous level: give up. */
    if (initial_offset == test_offset)
      goto done;

    test_offset = initial_offset;
    if (chm_retrieve_object (chmfile, ui, buffer,
                             initial_offset, buff_size) == 0)
      goto done;

    free_space = get_uint16 (buffer);
    if (free_space > buff_size)
      goto done;                /* would wrap the unsigned subtraction */
    used = buff_size - free_space;

    while (i < used) {
      if (buff_size - i < 2)
        goto done;              /* entry header does not fit */

      word_len = *(buffer + i);
      pos = *(buffer + i + 1);

      /* word_len counts the word bytes plus one; the word and the child
         offset that follows must lie inside the node. */
      if (word_len == 0 ||
          buff_size - i < (uint32_t)word_len + 1 + sizeof(uint32_t))
        goto done;
      suffix_len = (size_t)word_len - 1;

      /* The shared prefix must exist in the previous word. */
      if (pos > (word ? strlen (word) : 0))
        goto done;

      /* Rebuild the word: keep the first pos bytes, append the suffix. */
      grown = realloc (word, pos + suffix_len + 1);
      if (NULL == grown)
        goto done;
      word = grown;
      memcpy (word + pos, buffer + i + 2, suffix_len);
      word[pos + suffix_len] = 0;

      if (strcasecmp (text, word) <= 0) {
        initial_offset = get_uint32 (buffer + i + word_len + 1);
        break;
      }

      i += word_len + sizeof (unsigned char) + sizeof(uint32_t) +
        sizeof(uint16_t);
    }
  }

  if(initial_offset == test_offset)
    initial_offset = 0;
  result = initial_offset;

 done:
  /* Single exit: word used to leak on the early-return paths. */
  free (word);
  free (buffer);

  return result;
}
|
||||
|
||||
static int
|
||||
pychm_process_wlc (struct chmFile *chmfile,
|
||||
uint64_t wlc_count, uint64_t wlc_size,
|
||||
uint32_t wlc_offset, unsigned char ds,
|
||||
unsigned char dr, unsigned char cs,
|
||||
unsigned char cr, unsigned char ls,
|
||||
unsigned char lr, struct chmUnitInfo *uimain,
|
||||
struct chmUnitInfo* uitbl,
|
||||
struct chmUnitInfo *uistrings,
|
||||
struct chmUnitInfo* topics,
|
||||
struct chmUnitInfo *urlstr,
|
||||
PyObject *dict)
|
||||
{
|
||||
uint32_t stroff, urloff;
|
||||
uint64_t i, j, count;
|
||||
size_t length;
|
||||
int wlc_bit = 7;
|
||||
size_t off = 0;
|
||||
uint64_t index = 0;
|
||||
unsigned char entry[TOPICS_ENTRY_LEN];
|
||||
unsigned char combuf[COMMON_BUF_LEN];
|
||||
unsigned char *buffer = malloc (wlc_size);
|
||||
char *url = NULL;
|
||||
char *topic = NULL;
|
||||
|
||||
if (chm_retrieve_object(chmfile, uimain, buffer,
|
||||
wlc_offset, wlc_size) == 0) {
|
||||
FREE(buffer);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (i = 0; i < wlc_count; ++i) {
|
||||
|
||||
if(wlc_bit != 7) {
|
||||
++off;
|
||||
wlc_bit = 7;
|
||||
}
|
||||
|
||||
index += sr_int(buffer + off, &wlc_bit, ds, dr, &length);
|
||||
off += length;
|
||||
|
||||
if(chm_retrieve_object(chmfile, topics, entry,
|
||||
index * 16, TOPICS_ENTRY_LEN) == 0) {
|
||||
FREE(topic);
|
||||
FREE(url);
|
||||
FREE(buffer);
|
||||
return false;
|
||||
}
|
||||
|
||||
combuf[COMMON_BUF_LEN - 1] = 0;
|
||||
stroff = get_uint32 (entry + 4);
|
||||
|
||||
FREE (topic);
|
||||
if (chm_retrieve_object (chmfile, uistrings, combuf,
|
||||
stroff, COMMON_BUF_LEN - 1) == 0) {
|
||||
topic = strdup ("Untitled in index");
|
||||
|
||||
} else {
|
||||
combuf[COMMON_BUF_LEN - 1] = 0;
|
||||
|
||||
topic = strdup ((char *)combuf);
|
||||
}
|
||||
|
||||
urloff = get_uint32 (entry + 8);
|
||||
|
||||
if(chm_retrieve_object (chmfile, uitbl, combuf,
|
||||
urloff, 12) == 0) {
|
||||
FREE(buffer);
|
||||
return false;
|
||||
}
|
||||
|
||||
urloff = get_uint32 (combuf + 8);
|
||||
|
||||
if (chm_retrieve_object (chmfile, urlstr, combuf,
|
||||
urloff + 8, COMMON_BUF_LEN - 1) == 0) {
|
||||
FREE(topic);
|
||||
FREE(url);
|
||||
FREE(buffer);
|
||||
return false;
|
||||
}
|
||||
|
||||
combuf[COMMON_BUF_LEN - 1] = 0;
|
||||
|
||||
FREE (url);
|
||||
url = strdup ((char *)combuf);
|
||||
|
||||
if (url && topic) {
|
||||
#ifdef __PYTHON__
|
||||
PyDict_SetItem(dict,
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
PyBytes_FromStringAndSize(topic, strlen(topic)),
|
||||
PyBytes_FromStringAndSize(url, strlen(url))
|
||||
#else
|
||||
PyString_FromString (topic),
|
||||
PyString_FromString (url)
|
||||
#endif
|
||||
);
|
||||
#else
|
||||
printf ("%s ==> %s\n", url, topic);
|
||||
#endif
|
||||
}
|
||||
|
||||
count = sr_int (buffer + off, &wlc_bit, cs, cr, &length);
|
||||
off += length;
|
||||
|
||||
for (j = 0; j < count; ++j) {
|
||||
sr_int (buffer + off, &wlc_bit, ls, lr, &length);
|
||||
off += length;
|
||||
}
|
||||
}
|
||||
|
||||
FREE(topic);
|
||||
FREE(url);
|
||||
FREE(buffer);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int
|
||||
chm_search (struct chmFile *chmfile,
|
||||
const char *text, int whole_words,
|
||||
int titles_only, PyObject *dict)
|
||||
{
|
||||
unsigned char header[FTS_HEADER_LEN];
|
||||
unsigned char doc_index_s;
|
||||
unsigned char doc_index_r;
|
||||
unsigned char code_count_s;
|
||||
unsigned char code_count_r;
|
||||
unsigned char loc_codes_s;
|
||||
unsigned char loc_codes_r;
|
||||
unsigned char word_len, pos;
|
||||
unsigned char *buffer;
|
||||
char *word = NULL;
|
||||
uint32_t node_offset;
|
||||
uint32_t node_len;
|
||||
uint16_t tree_depth;
|
||||
uint32_t i;
|
||||
uint16_t free_space;
|
||||
uint64_t wlc_count, wlc_size;
|
||||
uint32_t wlc_offset;
|
||||
char *wrd_buf;
|
||||
unsigned char title;
|
||||
size_t encsz;
|
||||
struct chmUnitInfo ui, uitopics, uiurltbl, uistrings, uiurlstr;
|
||||
int partial = false;
|
||||
|
||||
if (NULL == text)
|
||||
return -1;
|
||||
|
||||
if (chm_resolve_object (chmfile, "/$FIftiMain", &ui) !=
|
||||
CHM_RESOLVE_SUCCESS ||
|
||||
chm_resolve_object (chmfile, "/#TOPICS", &uitopics) !=
|
||||
CHM_RESOLVE_SUCCESS ||
|
||||
chm_resolve_object (chmfile, "/#STRINGS", &uistrings) !=
|
||||
CHM_RESOLVE_SUCCESS ||
|
||||
chm_resolve_object (chmfile, "/#URLTBL", &uiurltbl) !=
|
||||
CHM_RESOLVE_SUCCESS ||
|
||||
chm_resolve_object (chmfile, "/#URLSTR", &uiurlstr) !=
|
||||
CHM_RESOLVE_SUCCESS)
|
||||
return false;
|
||||
|
||||
if(chm_retrieve_object(chmfile, &ui, header, 0, FTS_HEADER_LEN) == 0)
|
||||
return false;
|
||||
|
||||
doc_index_s = header[0x1E];
|
||||
doc_index_r = header[0x1F];
|
||||
code_count_s = header[0x20];
|
||||
code_count_r = header[0x21];
|
||||
loc_codes_s = header[0x22];
|
||||
loc_codes_r = header[0x23];
|
||||
|
||||
if(doc_index_s != 2 || code_count_s != 2 || loc_codes_s != 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
node_offset = get_uint32 (header + 0x14);
|
||||
node_len = get_uint32 (header + 0x2e);
|
||||
tree_depth = get_uint16 (header + 0x18);
|
||||
|
||||
i = sizeof(uint16_t);
|
||||
|
||||
buffer = malloc (node_len);
|
||||
|
||||
node_offset = get_leaf_node_offset (chmfile, text, node_offset, node_len,
|
||||
tree_depth, &ui);
|
||||
|
||||
if (!node_offset) {
|
||||
FREE(buffer);
|
||||
return false;
|
||||
}
|
||||
|
||||
do {
|
||||
|
||||
if (chm_retrieve_object (chmfile, &ui, buffer,
|
||||
node_offset, node_len) == 0) {
|
||||
FREE(word);
|
||||
FREE(buffer);
|
||||
return false;
|
||||
}
|
||||
|
||||
free_space = get_uint16 (buffer + 6);
|
||||
|
||||
i = sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint16_t);
|
||||
|
||||
encsz = 0;
|
||||
|
||||
while (i < node_len - free_space) {
|
||||
word_len = *(buffer + i);
|
||||
pos = *(buffer + i + 1);
|
||||
|
||||
wrd_buf = malloc (word_len);
|
||||
memcpy (wrd_buf, buffer + i + 2, word_len - 1);
|
||||
wrd_buf[word_len - 1] = 0;
|
||||
|
||||
if (pos == 0) {
|
||||
FREE(word);
|
||||
word = (char *) strdup (wrd_buf);
|
||||
} else {
|
||||
word = realloc (word, word_len + pos + 1);
|
||||
strcpy (word + pos, wrd_buf);
|
||||
}
|
||||
|
||||
FREE(wrd_buf);
|
||||
|
||||
i += 2 + word_len;
|
||||
title = *(buffer + i - 1);
|
||||
|
||||
wlc_count = be_encint (buffer + i, &encsz);
|
||||
i += encsz;
|
||||
|
||||
wlc_offset = get_uint32 (buffer + i);
|
||||
|
||||
i += sizeof(uint32_t) + sizeof(uint16_t);
|
||||
wlc_size = be_encint (buffer + i, &encsz);
|
||||
i += encsz;
|
||||
|
||||
node_offset = get_uint32 (buffer);
|
||||
|
||||
if (!title && titles_only)
|
||||
continue;
|
||||
|
||||
if (whole_words && !strcasecmp(text, word)) {
|
||||
partial = pychm_process_wlc (chmfile, wlc_count, wlc_size,
|
||||
wlc_offset, doc_index_s,
|
||||
doc_index_r,code_count_s,
|
||||
code_count_r, loc_codes_s,
|
||||
loc_codes_r, &ui, &uiurltbl,
|
||||
&uistrings, &uitopics,
|
||||
&uiurlstr, dict);
|
||||
FREE(word);
|
||||
FREE(buffer);
|
||||
return partial;
|
||||
}
|
||||
|
||||
if (!whole_words) {
|
||||
if (!strncasecmp (word, text, strlen(text))) {
|
||||
partial = true;
|
||||
pychm_process_wlc (chmfile, wlc_count, wlc_size,
|
||||
wlc_offset, doc_index_s,
|
||||
doc_index_r,code_count_s,
|
||||
code_count_r, loc_codes_s,
|
||||
loc_codes_r, &ui, &uiurltbl,
|
||||
&uistrings, &uitopics,
|
||||
&uiurlstr, dict);
|
||||
|
||||
} else if (strncasecmp (text, word, strlen(text)) < -1)
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
} while (!whole_words &&
|
||||
!strncmp (word, text, strlen(text)) &&
|
||||
node_offset);
|
||||
|
||||
FREE(word);
|
||||
FREE(buffer);
|
||||
|
||||
return partial;
|
||||
}
|
||||
|
||||
/* One place where the language id (LCID) of an archive may be stored. */
typedef struct {
    const char *file;   /* path of the object inside the archive */
    int offset;         /* byte offset of the 32-bit LCID in that object */
} Langrec;

/*
 * Candidate LCID locations, tried in this order by chm_get_lcid().
 * Object paths are given with a leading '/', like every other path passed
 * to chm_resolve_object() in this file; the two BTree entries used to lack
 * it.
 */
static Langrec lang_files[] = {
    {"/$FIftiMain",               0x7E},
    {"/$WWKeywordLinks/BTree",     0x34},
    {"/$WWAssociativeLinks/BTree", 0x34}
};

/* Number of entries in lang_files. */
#define LANG_FILES_SIZE (sizeof(lang_files)/sizeof(Langrec))
|
||||
|
||||
static int
|
||||
chm_get_lcid (struct chmFile *chmfile) {
|
||||
struct chmUnitInfo ui;
|
||||
uint32_t lang;
|
||||
int i;
|
||||
|
||||
for (i=0; i<LANG_FILES_SIZE; i++) {
|
||||
|
||||
if (chm_resolve_object (chmfile, lang_files[i].file, &ui) ==
|
||||
CHM_RESOLVE_SUCCESS) {
|
||||
|
||||
if (chm_retrieve_object (chmfile, &ui, (unsigned char *) &lang,
|
||||
lang_files[i].offset, sizeof(uint32_t)) != 0)
|
||||
return lang;
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
#ifdef __PYTHON__
|
||||
|
||||
/*
 * Python: is_searchable(chmfile) -> int
 *
 * Returns 1 when the archive contains every object the full-text search
 * needs, 0 as soon as one of them cannot be resolved. The argument is the
 * SWIG wrapper of a struct chmFile pointer; a TypeError is raised when the
 * argument tuple cannot be parsed.
 */
static PyObject *
is_searchable (PyObject *self, PyObject *args) {
    /* Objects required by the search, checked in this order. */
    static const char *const needed[] = {
        "/$FIftiMain", "/#TOPICS", "/#STRINGS", "/#URLTBL", "/#URLSTR"
    };
    struct chmFile *archive;
    PyObject *handle;
    struct chmUnitInfo info;
    size_t k;

    if (!PyArg_ParseTuple (args, "O:is_searchable", &handle)) {
        PyErr_SetString(PyExc_TypeError, "Expected chmfile (not CHMFile!)");
        return NULL;
    }

    archive = (struct chmFile *) ((SwigPyObject*)(handle))->ptr;

    for (k = 0; k < sizeof(needed) / sizeof(needed[0]); k++) {
        if (chm_resolve_object (archive, needed[k], &info) !=
            CHM_RESOLVE_SUCCESS)
            return Py_BuildValue ("i", 0);
    }

    return Py_BuildValue ("i", 1);
}
|
||||
|
||||
/*
 * Python: search(chmfile, text, whole_words, titles_only) -> (int, dict)
 *
 * Runs chm_search() on the archive wrapped by chmfile and returns a tuple
 * holding its result and a new dict filled by the search. text is a bytes
 * object on Python 3 ("S" format) and a string on Python 2 ("s" format).
 *
 * Raises TypeError when the arguments cannot be parsed and MemoryError
 * when the result dict cannot be created.
 */
static PyObject *
search (PyObject *self, PyObject *args) {
    char *text;
    int whole_words = 0;
    int titles_only = 0;
    int partial;
    int parsed;
    struct chmFile *file;
    PyObject *obj0;
    PyObject *dict;
#if PY_MAJOR_VERSION >= 3
    PyObject *obj1;

    parsed = PyArg_ParseTuple (args, "OSii:search", &obj0, &obj1,
                               &whole_words, &titles_only);
#else
    parsed = PyArg_ParseTuple (args, "Osii:search", &obj0, &text,
                               &whole_words, &titles_only);
#endif

    if (!parsed) {
        PyErr_SetString(PyExc_TypeError,
                        "Expected chmfile (not CHMFile!), string, int, int");
        return NULL;
    }

#if PY_MAJOR_VERSION >= 3
    text = PyBytes_AsString(obj1);
    if (text == NULL)
        return NULL;
#endif

    dict = PyDict_New();
    if (dict == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    file = (struct chmFile *) ((SwigPyObject*)(obj0))->ptr;

    partial = chm_search (file,
                          text, whole_words, titles_only, dict);

    /* "N" hands our only reference to dict over to the tuple. "O" would
       add a second reference that nobody releases, leaking the dict. */
    return Py_BuildValue ("(iN)", partial, dict);
}
|
||||
|
||||
/*
 * Python: get_lcid(chmfile) -> int or None
 *
 * Returns the value of chm_get_lcid() for the archive wrapped by chmfile,
 * or None when that value is -1. Raises TypeError when the argument tuple
 * cannot be parsed.
 */
static PyObject *
get_lcid (PyObject *self, PyObject *args) {
    PyObject *handle;
    struct chmFile *archive;
    int lcid;

    if (!PyArg_ParseTuple (args, "O:get_lcid", &handle)) {
        PyErr_SetString(PyExc_TypeError,"Expected a chmfile (not a CHMFile!)");
        return NULL;
    }

    archive = (struct chmFile *) ((SwigPyObject*)(handle))->ptr;
    lcid = chm_get_lcid (archive);

    if (lcid == -1) {
        Py_INCREF(Py_None);
        return Py_None;
    }

    return Py_BuildValue ("i", lcid);
}
|
||||
|
||||
static PyMethodDef
|
||||
IndexMethods[] = {
|
||||
{"get_lcid", get_lcid, METH_VARARGS,
|
||||
"Returns LCID (Locale ID) for archive."},
|
||||
{"search", search, METH_VARARGS,
|
||||
"Perform Full-Text search."},
|
||||
{"is_searchable", is_searchable, METH_VARARGS,
|
||||
"Return 1 if it is possible to search the archive, 0 otherwise."},
|
||||
{NULL, NULL, 0, NULL}
|
||||
};
|
||||
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3

/* Python 3 module definition for "extra". */
static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "extra",        /* module name */
    NULL,           /* no module docstring */
    -1,             /* per-module state size */
    IndexMethods,   /* method table */
    NULL,
    NULL,
    NULL,
    NULL
};

/* Python 3 init functions return the module object: NULL signals failure. */
#define INITERROR return NULL

#else /* python < 3 */

/* Python 2 init functions return nothing. */
#define INITERROR return

#endif /* python 3/2 */
|
||||
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
PyObject* PyInit_extra(void)
|
||||
#else
|
||||
void initextra (void)
|
||||
#endif
|
||||
{
|
||||
PyObject *module;
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
module = PyModule_Create(&moduledef);
|
||||
#else
|
||||
module = Py_InitModule ("extra", IndexMethods);
|
||||
#endif
|
||||
if (module == NULL)
|
||||
INITERROR;
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
return module;
|
||||
#endif
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Stand-alone test driver (built when __PYTHON__ is not defined).
 *
 * Usage: <program> <filename>
 * Prints the LCID of the archive, then repeatedly reads
 * "<whole_words> <titles_only> <string>" from standard input and runs
 * chm_search() with it, until end of input or a line that does not parse.
 *
 * Returns 0 normally (also when only the usage line is printed), -1 when
 * the archive cannot be opened.
 */
int
main (int argc, char **argv) {
    struct chmFile *file;
    char text[255];
    int whole_words, titles_only;
    int partial;
    int lcid;

    if (argc != 2) {
        printf ("\n%s <filename>\n", argv[0]);
        return 0;
    }

    file = chm_open (argv[1]);
    if (!file)
        return -1;

    lcid = chm_get_lcid (file);
    printf ("\nLCID= %d (%08X)\n", lcid, (unsigned int) lcid);

    for (;;) {
        printf ("\n<whole_words> <titles_only> <string>\n");
        printf ("> ");

        /* All three fields must convert. scanf() returns EOF (non-zero) at
           end of input, so testing for "non-zero" would loop forever.
           The field width keeps the word inside text[]. */
        if (scanf ("%d %d %254s", &whole_words, &titles_only, text) != 3)
            break;

        partial = chm_search (file,
                              text, whole_words, titles_only, NULL);
        printf ("Partial = %d\n", partial);
    }

    chm_close (file);
    return 0;
}
|
||||
|
||||
#endif
|
||||
4962
src/python/pychm/recollchm/swig_chm.c
Normal file
4962
src/python/pychm/recollchm/swig_chm.c
Normal file
File diff suppressed because it is too large
Load Diff
225
src/python/pychm/recollchm/swig_chm.i
Normal file
225
src/python/pychm/recollchm/swig_chm.i
Normal file
@ -0,0 +1,225 @@
|
||||
%module chmlib
|
||||
%begin %{
|
||||
#define SWIG_PYTHON_STRICT_BYTE_CHAR
|
||||
%}
|
||||
|
||||
%include "typemaps.i"
|
||||
%include "cstring.i"
|
||||
|
||||
%{
|
||||
/*
|
||||
Copyright (C) 2003 Rubens Ramos <rubensr@users.sourceforge.net>
|
||||
|
||||
Based on code by:
|
||||
Copyright (C) 2003 Razvan Cojocaru <razvanco@gmx.net>
|
||||
|
||||
pychm is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License as
|
||||
published by the Free Software Foundation; either version 2 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public
|
||||
License along with this program; see the file COPYING. If not,
|
||||
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA
|
||||
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "chm_lib.h"
|
||||
#include <stdio.h>
|
||||
|
||||
static PyObject *my_callback = NULL;
|
||||
|
||||
/*
 * Remember arg as the Python callable invoked by dummy_enumerator().
 *
 * Returns None (new reference) on success. When arg is not callable a
 * TypeError is set, NULL is returned and the previous callback is kept.
 */
static PyObject *
my_set_callback(PyObject *dummy, PyObject *arg)
{
    if (!PyCallable_Check(arg)) {
        PyErr_SetString(PyExc_TypeError, "parameter must be callable");
        return NULL;
    }

    /* Take the new reference before dropping the old one. */
    Py_XINCREF(arg);
    Py_XDECREF(my_callback);
    my_callback = arg;

    Py_INCREF(Py_None);
    return Py_None;
}
|
||||
|
||||
int dummy_enumerator (struct chmFile *h,
|
||||
struct chmUnitInfo *ui,
|
||||
void *context) {
|
||||
PyObject *arglist;
|
||||
PyObject *result;
|
||||
PyObject *py_h;
|
||||
PyObject *py_ui;
|
||||
PyObject *py_c;
|
||||
|
||||
py_h = SWIG_NewPointerObj((void *) h, SWIGTYPE_p_chmFile, 0);
|
||||
py_ui = SWIG_NewPointerObj((void *) ui, SWIGTYPE_p_chmUnitInfo, 0);
|
||||
/* The following was: py_c = PyCObject_AsVoidPtr(context); which did
|
||||
not make sense because the function takes a PyObject * and returns a
|
||||
void *, not the reverse. This was probably never used?? In doubt,
|
||||
replace with a call which makes sense and hope for the best... */
|
||||
py_c = PyCapsule_New(context, "context", NULL);
|
||||
|
||||
/* Time to call the callback */
|
||||
arglist = Py_BuildValue("(OOO)", py_h, py_ui, py_c);
|
||||
if (arglist) {
|
||||
result = PyEval_CallObject(my_callback, arglist);
|
||||
Py_DECREF(arglist);
|
||||
Py_DECREF(result);
|
||||
|
||||
Py_DECREF(py_h);
|
||||
Py_DECREF(py_ui);
|
||||
Py_DECREF(py_c);
|
||||
|
||||
if (result == NULL) {
|
||||
return 0; /* Pass error back */
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
} else
|
||||
return 0;
|
||||
}
|
||||
%}
|
||||
|
||||
%typemap(in) CHM_ENUMERATOR {
|
||||
if (!my_set_callback(self, $input)) goto fail;
|
||||
$1 = dummy_enumerator;
|
||||
}
|
||||
|
||||
%typemap(in) void *context {
|
||||
if (!($1 = PyCapsule_New($input, "context", NULL))) goto fail;
|
||||
}
|
||||
|
||||
%typemap(in, numinputs=0) struct chmUnitInfo *OutValue (struct chmUnitInfo *temp = (struct chmUnitInfo *) calloc(1, sizeof(struct chmUnitInfo))) {
|
||||
$1 = temp;
|
||||
}
|
||||
|
||||
%typemap(argout) struct chmUnitInfo *OutValue {
|
||||
PyObject *o, *o2, *o3;
|
||||
o = SWIG_NewPointerObj((void *) $1, SWIGTYPE_p_chmUnitInfo, 1);
|
||||
if ((!$result) || ($result == Py_None)) {
|
||||
$result = o;
|
||||
} else {
|
||||
if (!PyTuple_Check($result)) {
|
||||
PyObject *o2 = $result;
|
||||
$result = PyTuple_New(1);
|
||||
PyTuple_SetItem($result,0,o2);
|
||||
}
|
||||
o3 = PyTuple_New(1);
|
||||
PyTuple_SetItem(o3,0,o);
|
||||
o2 = $result;
|
||||
$result = PySequence_Concat(o2,o3);
|
||||
Py_DECREF(o2);
|
||||
Py_DECREF(o3);
|
||||
}
|
||||
}
|
||||
|
||||
%typemap(check) unsigned char *OUTPUT {
|
||||
/* nasty hack */
|
||||
#ifdef __cplusplus
|
||||
$1 = ($1_ltype) new char[arg5];
|
||||
#else
|
||||
$1 = ($1_ltype) malloc(arg5);
|
||||
#endif
|
||||
if ($1 == NULL) SWIG_fail;
|
||||
}
|
||||
|
||||
%typemap(argout,fragment="t_output_helper") unsigned char *OUTPUT {
|
||||
PyObject *o;
|
||||
o = SWIG_FromCharPtrAndSize((const char*)$1, arg5);
|
||||
/* o = PyString_FromStringAndSize($1, arg5);*/
|
||||
$result = t_output_helper($result,o);
|
||||
#ifdef __cplusplus
|
||||
delete [] $1;
|
||||
#else
|
||||
free($1);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef WIN32
|
||||
typedef unsigned __int64 LONGUINT64;
|
||||
typedef __int64 LONGINT64;
|
||||
#else
|
||||
typedef unsigned long long LONGUINT64;
|
||||
typedef long long LONGINT64;
|
||||
#endif
|
||||
|
||||
/* the two available spaces in a CHM file */
|
||||
/* N.B.: The format supports arbitrarily many spaces, but only */
|
||||
/* two appear to be used at present. */
|
||||
#define CHM_UNCOMPRESSED (0)
|
||||
#define CHM_COMPRESSED (1)
|
||||
|
||||
/* structure representing an ITS (CHM) file stream */
|
||||
struct chmFile;
|
||||
|
||||
/* structure representing an element from an ITS file stream */
|
||||
#define CHM_MAX_PATHLEN 256
|
||||
struct chmUnitInfo
|
||||
{
|
||||
LONGUINT64 start;
|
||||
LONGUINT64 length;
|
||||
int space;
|
||||
char path[CHM_MAX_PATHLEN+1];
|
||||
};
|
||||
|
||||
/* open an ITS archive */
|
||||
struct chmFile* chm_open(const char *filename);
|
||||
|
||||
/* close an ITS archive */
|
||||
void chm_close(struct chmFile *h);
|
||||
|
||||
/* methods for setting tuning parameters for a particular file */
|
||||
#define CHM_PARAM_MAX_BLOCKS_CACHED 0
|
||||
void chm_set_param(struct chmFile *h,
|
||||
int paramType,
|
||||
int paramVal);
|
||||
|
||||
/* resolve a particular object from the archive */
|
||||
#define CHM_RESOLVE_SUCCESS (0)
|
||||
#define CHM_RESOLVE_FAILURE (1)
|
||||
int chm_resolve_object(struct chmFile *h,
|
||||
const char *objPath,
|
||||
struct chmUnitInfo *OutValue);
|
||||
|
||||
/* retrieve part of an object from the archive */
|
||||
LONGINT64 chm_retrieve_object(struct chmFile *h,
|
||||
struct chmUnitInfo *ui,
|
||||
unsigned char *OUTPUT,
|
||||
LONGUINT64 addr,
|
||||
LONGINT64 len);
|
||||
|
||||
/* enumerate the objects in the .chm archive */
|
||||
typedef int (*CHM_ENUMERATOR)(struct chmFile *h,
|
||||
struct chmUnitInfo *ui,
|
||||
void *context);
|
||||
#define CHM_ENUMERATE_NORMAL (1)
|
||||
#define CHM_ENUMERATE_META (2)
|
||||
#define CHM_ENUMERATE_SPECIAL (4)
|
||||
#define CHM_ENUMERATE_FILES (8)
|
||||
#define CHM_ENUMERATE_DIRS (16)
|
||||
#define CHM_ENUMERATE_ALL (31)
|
||||
#define CHM_ENUMERATOR_FAILURE (0)
|
||||
#define CHM_ENUMERATOR_CONTINUE (1)
|
||||
#define CHM_ENUMERATOR_SUCCESS (2)
|
||||
int chm_enumerate(struct chmFile *h,
|
||||
int what,
|
||||
CHM_ENUMERATOR e,
|
||||
void *context);
|
||||
|
||||
int chm_enumerate_dir(struct chmFile *h,
|
||||
const char *prefix,
|
||||
int what,
|
||||
CHM_ENUMERATOR e,
|
||||
void *context);
|
||||
36
src/python/pychm/setup.py.in
Normal file
36
src/python/pychm/setup.py.in
Normal file
@ -0,0 +1,36 @@
|
||||
from setuptools import setup, Extension
|
||||
|
||||
long_description = '''
|
||||
Version of the chm package modified to support Python 3 and bundled with Recoll.
|
||||
The chm package provides three modules, chm, chmlib and extra, which provide
|
||||
access to the API implemented by the C library chmlib and some additional
|
||||
classes and functions. They are used to access MS-ITSS encoded files -
|
||||
Compressed Html Help files (.chm).
|
||||
'''
|
||||
|
||||
# For shadow builds: references to the source tree
|
||||
import os
|
||||
top = os.path.join('@srcdir@', '..', '..')
|
||||
pytop = '@srcdir@'
|
||||
|
||||
setup(name="recollchm",
|
||||
version="0.8.4.1+git",
|
||||
description="Python package to handle CHM files",
|
||||
author="Rubens Ramos",
|
||||
author_email="rubensr@users.sourceforge.net",
|
||||
maintainer="Mikhail Gusarov",
|
||||
maintainer_email="dottedmag@dottedmag.net",
|
||||
url="https://github.com/dottedmag/pychm",
|
||||
license="GPL",
|
||||
long_description=long_description,
|
||||
package_dir = {'' : os.path.join(top, 'python', 'pychm')},
|
||||
py_modules=["recollchm.chm", "recollchm.chmlib"],
|
||||
ext_modules=[Extension("recollchm._chmlib",
|
||||
[os.path.join(pytop, "recollchm/swig_chm.c")],
|
||||
libraries=["chm"],
|
||||
extra_compile_args=["-DSWIG_COBJECT_TYPES"]),
|
||||
Extension("recollchm.extra",
|
||||
[os.path.join(pytop, "recollchm/extra.c")],
|
||||
extra_compile_args=["-D__PYTHON__"],
|
||||
libraries=["chm"])]
|
||||
)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user