Added handler for Hancom .hwp format
This commit is contained in:
parent
15d40dc81f
commit
2cbd9ad79c
@ -674,6 +674,7 @@ filters/rclexecm.py \
|
||||
filters/rclfb2.py \
|
||||
filters/rclgaim \
|
||||
filters/rclgenxslt.py \
|
||||
filters/rclhwp.py \
|
||||
filters/rclics \
|
||||
filters/rclimg \
|
||||
filters/rclimg.py \
|
||||
|
||||
115
src/filters/rclhwp.py
Executable file
115
src/filters/rclhwp.py
Executable file
@ -0,0 +1,115 @@
|
||||
#!/usr/bin/python3
|
||||
# Copyright (C) 2020 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
#########################################
|
||||
# Recoll Hanword .hwp handler
|
||||
#
|
||||
# The real work is done by pyhwp:
|
||||
# https://github.com/mete0r/pyhwp
|
||||
# https://pypi.org/project/pyhwp/
|
||||
# pip3 install pyhwp
|
||||
#
|
||||
|
||||
import sys
|
||||
from io import BytesIO
|
||||
|
||||
import rclexecm
|
||||
from rclbasehandler import RclBaseHandler
|
||||
|
||||
from hwp5.filestructure import Hwp5File as fs_Hwp5File
|
||||
from hwp5.transforms import BaseTransform
|
||||
from hwp5.xmlmodel import Hwp5File as xml_Hwp5File
|
||||
from hwp5.utils import cached_property
|
||||
|
||||
|
||||
# This was duplicated from hwp5 hwp5txt.py and I don't really
|
||||
# understand what it does...
|
||||
# Path of the XSL stylesheet handed to make_xsl_transform() below.
RESOURCE_PATH_XSL_TEXT = 'xsl/plaintext.xsl'


class TextTransform(BaseTransform):
    """Transform built on the 'xsl/plaintext.xsl' stylesheet.

    Both attributes are obtained from helpers inherited from
    BaseTransform (make_xsl_transform / make_transform_hwp5); see the
    hwp5 package for what those helpers actually return.
    """

    @property
    def transform_hwp5_to_text(self):
        """Transform called by the handler as transform(hwpfile, dest)."""
        return self.make_transform_hwp5(self.transform_xhwp5_to_text)

    @cached_property
    def transform_xhwp5_to_text(self):
        """XSL transform for the plain text stylesheet (cached_property)."""
        return self.make_xsl_transform(RESOURCE_PATH_XSL_TEXT)
|
||||
|
||||
|
||||
# Associate HTML meta names and hwp summaryinfo values
|
||||
def metafields(summaryinfo):
    """Yield (HTML meta name, value) pairs taken from an hwp summaryinfo.

    Reads the subject, comments, author, keywords and lastSavedTime
    attributes of summaryinfo and yields, in this order, the
    'Description', 'Author', 'Keywords' and 'Date' pairs.

    The Description value is always a str. The other values are passed
    through unchanged: converting them to text is left to the caller.
    """
    # A missing (None) subject or comments must not abort the whole
    # generator with a TypeError, which would also lose the fields that
    # follow: treat it as empty text. For two str values the result is
    # still exactly subject + " " + comments.
    subject = summaryinfo.subject or ""
    comments = summaryinfo.comments or ""
    yield ('Description', subject + " " + comments)
    yield ('Author', summaryinfo.author)
    yield ('Keywords', summaryinfo.keywords)
    yield ('Date', summaryinfo.lastSavedTime)
|
||||
|
||||
|
||||
# Extractor class. We use hwp summaryinfo to extract metadata and code
|
||||
# extracted from hwp5/hwp5txt.py to extract the text.
|
||||
class HWP5Dump(RclBaseHandler):
    """Handler producing an HTML version of a .hwp document.

    The HTML head is filled from the hwp summaryinfo (title and the
    fields listed by metafields()); the body is the output of the
    TextTransform text transform, wrapped in a <pre> element.
    """

    def __init__(self, em):
        super(HWP5Dump, self).__init__(em)

    def html_text(self, fn):
        """Return the HTML document (bytes) built from the .hwp file fn.

        fn is the file name as UTF-8 encoded bytes.

        Errors raised while reading the summary information or while
        running the text transform are logged through self.em.rclog()
        and do not abort the conversion: whatever was produced up to
        that point is kept. Errors raised while opening the file or
        decoding its name are propagated to the caller.
        """
        # hwp wants str filenames. This is unfortunate
        fn = fn.decode('utf-8')

        # Adjacent literals instead of a backslash continuation inside
        # the literal, so that no source indentation ends up in the
        # emitted tag.
        parts = [
            b'<html><head>\n',
            b'<meta http-equiv="content-type" '
            b'content="text/html; charset=utf-8">\n',
        ]

        hwpfile = fs_Hwp5File(fn)
        try:
            summaryinfo = hwpfile.summaryinfo

            # A missing title must not prevent the other fields from
            # being processed.
            title = summaryinfo.title
            tt = title.strip() if title else ""
            if tt:
                tt = self.em.htmlescape(tt.encode('utf-8'))
                parts.append(b'<title>' + tt + b'</title>\n')

            for k, v in metafields(summaryinfo):
                if v is None:
                    # Do not emit the literal text "None" as content.
                    continue
                v = "{0}".format(v).strip()
                if v:
                    v = self.em.htmlescape(v.encode('utf-8'))
                    k = k.encode('utf-8')
                    parts.append(b'<meta name="' + k + b'" content="' +
                                 v + b'">\n')
        except Exception as e:
            # Metadata is best effort: log and go on with the text.
            self.em.rclog("Exception: %s" % e)
        finally:
            hwpfile.close()

        parts.append(b'</head><body><pre>\n')

        # Build the transform before opening the file, so that a failure
        # here cannot leave an open file object behind.
        transform = TextTransform().transform_hwp5_to_text
        dest = BytesIO()
        hwpfile = xml_Hwp5File(fn)
        try:
            transform(hwpfile, dest)
        except Exception as e:
            # Keep whatever text was written to dest before the failure.
            self.em.rclog("Exception: %s" % e)
        finally:
            hwpfile.close()

        parts.append(self.em.htmlescape(dest.getvalue()))
        parts.append(b'</pre></body></html>')
        return b''.join(parts)
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: create the rclexecm protocol object, attach the
    # .hwp extractor to it and hand both to the rclexecm main routine.
    executor = rclexecm.RclExecM()
    handler = HWP5Dump(executor)
    rclexecm.main(executor, handler)
|
||||
@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# Copyright (C) 2014-2020 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
|
||||
@ -68,6 +68,8 @@ application/msword = execm rcldoc.py
|
||||
# You can also use wvware directly but it's much slower.
|
||||
# application/msword = exec wvWare --charset=utf-8 --nographics
|
||||
|
||||
application/x-hwp = execm rclhwp.py
|
||||
|
||||
# Also Handle the mime type returned by "file -i" for a suffix-less word
|
||||
# file. This could probably just as well be an excel file, but we have to
|
||||
# choose one.
|
||||
|
||||
@ -98,6 +98,8 @@
|
||||
.tbz = application/x-tar
|
||||
.tar.bz2 = application/x-tar
|
||||
|
||||
.hwp = application/x-hwp
|
||||
|
||||
.doc = application/msword
|
||||
.dot = application/msword
|
||||
.ppt = application/vnd.ms-powerpoint
|
||||
|
||||
@ -52,6 +52,8 @@ application/vnd.ms-powerpoint = execm python rclppt.py
|
||||
|
||||
application/pdf = execm python rclpdf.py
|
||||
|
||||
application/x-hwp = execm python rclhwp.py
|
||||
|
||||
application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
||||
application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
||||
application/vnd.oasis.opendocument.presentation = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user