Added handler for Hancom .hwp format
This commit is contained in:
parent
15d40dc81f
commit
2cbd9ad79c
@ -674,6 +674,7 @@ filters/rclexecm.py \
|
|||||||
filters/rclfb2.py \
|
filters/rclfb2.py \
|
||||||
filters/rclgaim \
|
filters/rclgaim \
|
||||||
filters/rclgenxslt.py \
|
filters/rclgenxslt.py \
|
||||||
|
filters/rclhwp.py \
|
||||||
filters/rclics \
|
filters/rclics \
|
||||||
filters/rclimg \
|
filters/rclimg \
|
||||||
filters/rclimg.py \
|
filters/rclimg.py \
|
||||||
|
|||||||
115
src/filters/rclhwp.py
Executable file
115
src/filters/rclhwp.py
Executable file
@ -0,0 +1,115 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# Copyright (C) 2020 J.F.Dockes
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
#########################################
|
||||||
|
# Recoll Hanword .hwp handler
|
||||||
|
#
|
||||||
|
# The real work is done by pyhwp:
|
||||||
|
# https://github.com/mete0r/pyhwp
|
||||||
|
# https://pypi.org/project/pyhwp/
|
||||||
|
# pip3 install pyhwp
|
||||||
|
#
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
import rclexecm
|
||||||
|
from rclbasehandler import RclBaseHandler
|
||||||
|
|
||||||
|
from hwp5.filestructure import Hwp5File as fs_Hwp5File
|
||||||
|
from hwp5.transforms import BaseTransform
|
||||||
|
from hwp5.xmlmodel import Hwp5File as xml_Hwp5File
|
||||||
|
from hwp5.utils import cached_property
|
||||||
|
|
||||||
|
|
||||||
|
# This was duplicated from hwp5 hwp5text.py and I don't really
|
||||||
|
# understand what it does...
|
||||||
|
RESOURCE_PATH_XSL_TEXT = 'xsl/plaintext.xsl'
|
||||||
|
class TextTransform(BaseTransform):
    """hwp5 transform set producing plain text via RESOURCE_PATH_XSL_TEXT."""

    @property
    def transform_hwp5_to_text(self):
        """hwp5 -> text transform, wrapped around the xhwp5 -> text one."""
        return self.make_transform_hwp5(self.transform_xhwp5_to_text)

    @cached_property
    def transform_xhwp5_to_text(self):
        """xhwp5 -> text XSL transform built from the style sheet (cached)."""
        return self.make_xsl_transform(RESOURCE_PATH_XSL_TEXT)
|
||||||
|
|
||||||
|
|
||||||
|
# Associate HTML meta names and hwp summaryinfo values
|
||||||
|
def metafields(summaryinfo):
    """Yield (HTML meta name, value) pairs taken from an hwp summaryinfo.

    The pairs come out in the order Description, Author, Keywords, Date.
    Each attribute is only read when its pair is requested, so a problem
    with a late field does not prevent the earlier ones from being used.
    """
    # The description is the subject and the comments, space-separated.
    description = summaryinfo.subject + " " + summaryinfo.comments
    yield 'Description', description
    yield 'Author', summaryinfo.author
    yield 'Keywords', summaryinfo.keywords
    yield 'Date', summaryinfo.lastSavedTime
|
||||||
|
|
||||||
|
|
||||||
|
# Extractor class. We use hwp summaryinfo to extract metadata and code
|
||||||
|
# extracted from hwp.hwp5txt.py to extract the text.
|
||||||
|
class HWP5Dump(RclBaseHandler):
    """Recoll handler turning a Hanword .hwp file into an HTML document.

    The metadata is read from the hwp summaryinfo, and the body text is
    produced by the plain text transform (TextTransform).
    """

    def __init__(self, em):
        super().__init__(em)

    def html_text(self, fn):
        """Return the HTML rendition, as bytes, of the .hwp file named fn.

        fn is the file name as received from the caller (bytes; str is
        accepted too). Errors raised while reading the metadata or while
        extracting the text are logged through self.em.rclog() and the
        output is built from whatever was obtained. Errors raised while
        opening the file propagate to the caller.
        """
        # Local import: os is not imported at the top of this file.
        import os

        # hwp wants str filenames. os.fsdecode() also copes with names
        # which are not valid UTF-8, where decode('utf-8') would raise.
        fn = os.fsdecode(fn)

        parts = [
            b'<html><head>\n',
            b'<meta http-equiv="content-type" '
            b'content="text/html; charset=utf-8">\n',
        ]

        # Metadata: best effort, a failure here must not lose the text.
        hwpfile = fs_Hwp5File(fn)
        try:
            summaryinfo = hwpfile.summaryinfo
            tt = summaryinfo.title.strip()
            if tt:
                tt = self.em.htmlescape(tt.encode('utf-8'))
                parts.append(b'<title>' + tt + b'</title>\n')

            for k, v in metafields(summaryinfo):
                # Values are not all strings (e.g. the date): format first.
                v = "{0}".format(v).strip()
                if v:
                    v = self.em.htmlescape(v.encode('utf-8'))
                    k = k.encode('utf-8')
                    parts.append(b'<meta name="' + k + b'" content="' +
                                 v + b'">\n')
        except Exception as e:
            self.em.rclog("Exception: %s" % e)
        finally:
            hwpfile.close()

        parts.append(b'</head><body><pre>\n')

        # Text: build the transform before opening the file, so that a
        # failure to set it up cannot leave the file open.
        transform = TextTransform().transform_hwp5_to_text
        dest = BytesIO()
        hwpfile = xml_Hwp5File(fn)
        try:
            transform(hwpfile, dest)
        except Exception as e:
            self.em.rclog("Exception: %s" % e)
        finally:
            hwpfile.close()

        # Whatever was written before a possible error is still used.
        parts.append(self.em.htmlescape(dest.getvalue()))
        parts.append(b'</pre></body></html>')
        return b''.join(parts)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Create the execm protocol object, attach the .hwp extractor to it,
    # and hand both over to rclexecm.main().
    em = rclexecm.RclExecM()
    rclexecm.main(em, HWP5Dump(em))
|
||||||
@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# Copyright (C) 2014 J.F.Dockes
|
# Copyright (C) 2014-2020 J.F.Dockes
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License as published by
|
# it under the terms of the GNU General Public License as published by
|
||||||
# the Free Software Foundation; either version 2 of the License, or
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
|||||||
@ -68,6 +68,8 @@ application/msword = execm rcldoc.py
|
|||||||
# You can also use wvware directly but it's much slower.
|
# You can also use wvware directly but it's much slower.
|
||||||
# application/msword = exec wvWare --charset=utf-8 --nographics
|
# application/msword = exec wvWare --charset=utf-8 --nographics
|
||||||
|
|
||||||
|
application/x-hwp = execm rclhwp.py
|
||||||
|
|
||||||
# Also Handle the mime type returned by "file -i" for a suffix-less word
|
# Also Handle the mime type returned by "file -i" for a suffix-less word
|
||||||
# file. This could probably just as well be an excel file, but we have to
|
# file. This could probably just as well be an excel file, but we have to
|
||||||
# choose one.
|
# choose one.
|
||||||
|
|||||||
@ -98,6 +98,8 @@
|
|||||||
.tbz = application/x-tar
|
.tbz = application/x-tar
|
||||||
.tar.bz2 = application/x-tar
|
.tar.bz2 = application/x-tar
|
||||||
|
|
||||||
|
.hwp = application/x-hwp
|
||||||
|
|
||||||
.doc = application/msword
|
.doc = application/msword
|
||||||
.dot = application/msword
|
.dot = application/msword
|
||||||
.ppt = application/vnd.ms-powerpoint
|
.ppt = application/vnd.ms-powerpoint
|
||||||
|
|||||||
@ -52,6 +52,8 @@ application/vnd.ms-powerpoint = execm python rclppt.py
|
|||||||
|
|
||||||
application/pdf = execm python rclpdf.py
|
application/pdf = execm python rclpdf.py
|
||||||
|
|
||||||
|
application/x-hwp = execm python rclhwp.py
|
||||||
|
|
||||||
application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
||||||
application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
||||||
application/vnd.oasis.opendocument.presentation = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
application/vnd.oasis.opendocument.presentation = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user