more filter conversion to python: svg and xml. Get rid of rclnull

--HG--
branch : WINDOWSPORT
This commit is contained in:
Jean-Francois Dockes 2015-09-14 09:51:11 +02:00
parent 07e2aa68a3
commit 24c77d2984
6 changed files with 304 additions and 10 deletions

View File

@ -1,9 +0,0 @@
#!/bin/sh
# Null filter: produces no output and exits with a success status.
#
# It may make sense in some cases to set this null filter (no output)
# instead of using recoll_noindex or leaving the default filter in
# place when one doesn't want to install it: this will avoid endless
# retries to reindex the affected files, as recoll will think it has
# succeeded indexing them. Downside: the files won't be indexed when one
# actually installs the real filter; a -z will be needed then.
exit 0

140
src/filters/rclsvg.py Executable file
View File

@ -0,0 +1,140 @@
#!/usr/bin/env python
# Copyright (C) 2014 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
######################################
import sys
import rclexecm
import rclxslt
# XSLT sheet turning an SVG document into a small HTML page:
#  - svg:title becomes the HTML <title>,
#  - svg:desc and dc:subject each become a "keywords" <meta>,
#  - dc:creator becomes an "author" <meta>,
#  - dc:description becomes a "description" <meta>,
#  - every svg:text element becomes a <p> in the body.
stylesheet_all = '''<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns:dc="http://purl.org/dc/elements/1.1/"
exclude-result-prefixes="svg"
>
<xsl:output method="html" encoding="UTF-8"/>
<xsl:template match="/">
<html>
<head>
<xsl:apply-templates select="svg:svg/svg:title"/>
<xsl:apply-templates select="svg:svg/svg:desc"/>
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:creator"/>
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:subject"/>
<xsl:apply-templates select="svg:svg/svg:metadata/descendant::dc:description"/>
</head>
<body>
<xsl:apply-templates select="//svg:text"/>
</body>
</html>
</xsl:template>
<xsl:template match="svg:desc">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:creator">
<meta>
<xsl:attribute name="name">author</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:subject">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:description">
<meta>
<xsl:attribute name="name">description</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="svg:title">
<title><xsl:value-of select="."/></title><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="svg:text">
<p><xsl:value-of select="."/></p><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
'''
class SVGExtractor:
    """File handler converting an SVG file to HTML through an XSLT sheet.

    The handler is driven by rclexecm through the openfile() / getipath() /
    getnext() methods. Each file yields exactly one document, obtained by
    applying the module-level ``stylesheet_all`` sheet to the file contents.
    """

    def __init__(self, em):
        """Remember the protocol object *em* (used here for logging)."""
        self.em = em
        # Number of documents already returned for the current file.
        self.currentindex = 0

    def extractone(self, params):
        """Transform the file named by ``params["filename:"]``.

        Returns a 4-tuple ``(ok, data, "", rclexecm.RclExecM.eofnow)``.
        ``ok`` is False and ``data`` is empty when the file name is missing
        or when reading/transforming the file fails; the failure is logged
        through ``self.em.rclog``. The third element is always the empty
        string here (presumably the ipath -- confirm against rclexecm).
        """
        # 'in' works on both Python 2 and 3; dict.has_key() is Python 2 only.
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]
        try:
            # Read raw bytes so that the XML parser sees the document exactly
            # as stored (no newline translation or locale decoding), and make
            # sure the file is closed even if the read fails.
            with open(fn, 'rb') as f:
                data = f.read()
            docdata = rclxslt.apply_sheet_data(stylesheet_all, data)
        except Exception as err:
            # Both values need a placeholder: with only one, the formatting
            # itself raises TypeError and the error escapes unreported.
            self.em.rclog("%s: bad data: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)
        return (True, docdata, "", rclexecm.RclExecM.eofnow)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Start processing a new file: reset the document counter."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Return the document for the file; same result as extractone()."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the single document on the first call, a failure after."""
        if self.currentindex >= 1:
            # The only document was already delivered.
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret
if __name__ == '__main__':
    # Script entry point: create the protocol object, then hand it and an
    # SVG handler built around it over to rclexecm.main().
    proto = rclexecm.RclExecM()
    rclexecm.main(proto, SVGExtractor(proto))

98
src/filters/rclxml.py Executable file
View File

@ -0,0 +1,98 @@
#!/usr/bin/env python
# Copyright (C) 2014 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
######################################
import sys
import rclexecm
import rclxslt
# XSLT sheet turning any XML document into a small HTML page:
#  - if some element has the local name 'title', its text is used as the
#    HTML <title>,
#  - every text node which is not only white space becomes a <p> in the
#    body; elements themselves are just traversed.
stylesheet_all = '''<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="html" encoding="UTF-8"/>
<xsl:template match="/">
<html>
<head>
<xsl:if test="//*[local-name() = 'title']">
<title>
<xsl:value-of select="//*[local-name() = 'title'][1]"/>
</title>
</xsl:if>
</head>
<body>
<xsl:apply-templates/>
</body>
</html>
</xsl:template>
<xsl:template match="text()">
<xsl:if test="string-length(normalize-space(.)) &gt; 0">
<p><xsl:value-of select="."/></p>
<xsl:text>
</xsl:text>
</xsl:if>
</xsl:template>
<xsl:template match="*">
<xsl:apply-templates/>
</xsl:template>
</xsl:stylesheet>
'''
class XMLExtractor:
    """File handler converting a generic XML file to HTML through XSLT.

    The handler is driven by rclexecm through the openfile() / getipath() /
    getnext() methods. Each file yields exactly one document, obtained by
    applying the module-level ``stylesheet_all`` sheet to the file contents.
    """

    def __init__(self, em):
        """Remember the protocol object *em* (used here for logging)."""
        self.em = em
        # Number of documents already returned for the current file.
        self.currentindex = 0

    def extractone(self, params):
        """Transform the file named by ``params["filename:"]``.

        Returns a 4-tuple ``(ok, data, "", rclexecm.RclExecM.eofnow)``.
        ``ok`` is False and ``data`` is empty when the file name is missing
        or when reading/transforming the file fails; the failure is logged
        through ``self.em.rclog``. The third element is always the empty
        string here (presumably the ipath -- confirm against rclexecm).
        """
        # 'in' works on both Python 2 and 3; dict.has_key() is Python 2 only.
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]
        try:
            # Read raw bytes so that the XML parser sees the document exactly
            # as stored (no newline translation or locale decoding), and make
            # sure the file is closed even if the read fails.
            with open(fn, 'rb') as f:
                data = f.read()
            docdata = rclxslt.apply_sheet_data(stylesheet_all, data)
        except Exception as err:
            # Both values need a placeholder: with only one, the formatting
            # itself raises TypeError and the error escapes unreported.
            self.em.rclog("%s: bad data: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)
        return (True, docdata, "", rclexecm.RclExecM.eofnow)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Start processing a new file: reset the document counter."""
        self.currentindex = 0
        return True

    def getipath(self, params):
        """Return the document for the file; same result as extractone()."""
        return self.extractone(params)

    def getnext(self, params):
        """Return the single document on the first call, a failure after."""
        if self.currentindex >= 1:
            # The only document was already delivered.
            return (False, "", "", rclexecm.RclExecM.eofnow)
        ret = self.extractone(params)
        self.currentindex += 1
        return ret
if __name__ == '__main__':
    # Script entry point: create the protocol object, then hand it and an
    # XML handler built around it over to rclexecm.main().
    proto = rclexecm.RclExecM()
    rclexecm.main(proto, XMLExtractor(proto))

59
src/internfile/mh_null.h Normal file
View File

@ -0,0 +1,59 @@
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _MH_NULL_H_INCLUDED_
#define _MH_NULL_H_INCLUDED_
// It may make sense in some cases to set this null filter (no output)
// instead of using recoll_noindex or leaving the default filter in
// place when one doesn't want to install it: this will avoid endless
// retries to reindex the affected files, as recoll will think it has
// succeeded indexing them. Downside: the files won't be indexed when
// one actually installs the real filter; a -z will be needed then.
//
// In practice, this is used for empty files. The handler is associated
// with application/x-zerosize, so, to use it for another type, set
//   <mimetype> = internal application/x-zerosize
// in mimeconf.
#include <string>
#include "cstr.h"
#include "mimehandler.h"
/**
 * Handler which does not read the file: for each file set with
 * set_document_file(), next_document() succeeds exactly once, storing
 * cstr_null as the content and cstr_textplain as the MIME type.
 */
class MimeHandlerNull : public RecollFilter {
public:
    MimeHandlerNull(RclConfig *cnf, const std::string& id)
        : RecollFilter(cnf, id)
    {
    }
    virtual ~MimeHandlerNull()
    {
    }

    /** Let the base class record the file, then arm the single document.
     *  Always returns true, whatever the base class call returned. */
    virtual bool set_document_file(const std::string& mt,
                                   const std::string& fn)
    {
        // std::string is spelled out so that this header does not depend
        // on a using-directive being in effect in the including file.
        RecollFilter::set_document_file(mt, fn);
        return m_havedoc = true;
    }

    /** Deliver the one document; return false on any further call. */
    virtual bool next_document()
    {
        if (m_havedoc == false)
            return false;
        // Only one document per file: disarm before filling the metadata.
        m_havedoc = false;
        m_metaData[cstr_dj_keycontent] = cstr_null;
        m_metaData[cstr_dj_keymt] = cstr_textplain;
        return true;
    }
};
#endif /* _MH_NULL_H_INCLUDED_ */

View File

@ -39,6 +39,7 @@ using namespace std;
#include "mh_text.h"
#include "mh_symlink.h"
#include "mh_unknown.h"
#include "mh_null.h"
#include "ptmutex.h"
// Performance help: we use a pool of already known and created
@ -162,6 +163,10 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
LOGDEB2(("mhFactory(%s): ret MimeHandlerSymlink\n", mime.c_str()));
MD5String("MimeHandlerSymlink", id);
return nobuild ? 0 : new MimeHandlerSymlink(config, id);
} else if ("application/x-zerosize" == lmime) {
LOGDEB(("mhFactory(%s): ret MimeHandlerNull\n", mime.c_str()));
MD5String("MimeHandlerNull", id);
return nobuild ? 0 : new MimeHandlerNull(config, id);
} else if (lmime.find("text/") == 0) {
// Try to handle unknown text/xx as text/plain. This
// only happen if the text/xx was defined as "internal" in

View File

@ -133,7 +133,8 @@ image/vnd.djvu = exec rcldjvu
image/svg+xml = exec rclsvg
image/x-xcf = execm rclimg
inode/symlink = internal
inode/x-empty = exec rclnull
application/x-zerosize = internal
inode/x-empty = internal application/x-zerosize
message/rfc822 = internal
text/calendar = execm rclics;mimetype=text/plain
text/html = internal