Added handler for Hancom .hwp format

This commit is contained in:
Jean-Francois Dockes 2020-03-10 14:38:52 +01:00
parent 15d40dc81f
commit 2cbd9ad79c
6 changed files with 123 additions and 1 deletions

View File

@ -674,6 +674,7 @@ filters/rclexecm.py \
filters/rclfb2.py \
filters/rclgaim \
filters/rclgenxslt.py \
filters/rclhwp.py \
filters/rclics \
filters/rclimg \
filters/rclimg.py \

115
src/filters/rclhwp.py Executable file
View File

@ -0,0 +1,115 @@
#!/usr/bin/python3
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#########################################
# Recoll Hanword .hwp handler
#
# The real work is done by pyhwp:
# https://github.com/mete0r/pyhwp
# https://pypi.org/project/pyhwp/
# pip3 install pyhwp
#
import os
import sys
from io import BytesIO

import rclexecm
from rclbasehandler import RclBaseHandler

from hwp5.filestructure import Hwp5File as fs_Hwp5File
from hwp5.transforms import BaseTransform
from hwp5.xmlmodel import Hwp5File as xml_Hwp5File
from hwp5.utils import cached_property
# This was duplicated from pyhwp's hwp5/hwp5txt.py and I don't really
# understand what it does...
RESOURCE_PATH_XSL_TEXT = 'xsl/plaintext.xsl'


class TextTransform(BaseTransform):
    """Transform an hwp5 document to plain text.

    Both transforms are obtained from helpers inherited from
    BaseTransform, using the stylesheet named by RESOURCE_PATH_XSL_TEXT.
    """

    @property
    def transform_hwp5_to_text(self):
        """Return the hwp5-to-text transform.

        It is built by handing the xhwp5-to-text transform to
        make_transform_hwp5().
        """
        xhwp5_to_text = self.transform_xhwp5_to_text
        return self.make_transform_hwp5(xhwp5_to_text)

    @cached_property
    def transform_xhwp5_to_text(self):
        """Return the XSL transform for the plain text stylesheet.

        Decorated with cached_property, so presumably only computed once
        per instance.
        """
        return self.make_xsl_transform(RESOURCE_PATH_XSL_TEXT)
# Associate HTML meta names and hwp summaryinfo values
def metafields(summaryinfo):
    """Yield (HTML meta name, value) pairs taken from *summaryinfo*.

    The description is the subject and the comments joined by one
    space. The other values are yielded exactly as read from the
    attribute. Each attribute is only read when its pair is requested.
    """
    yield 'Description', summaryinfo.subject + " " + summaryinfo.comments
    remaining = (
        ('Author', 'author'),
        ('Keywords', 'keywords'),
        ('Date', 'lastSavedTime'),
    )
    for meta_name, attribute in remaining:
        yield meta_name, getattr(summaryinfo, attribute)
# Extractor class. We use the hwp summaryinfo to extract the metadata,
# and code taken from pyhwp's hwp5/hwp5txt.py to extract the text.
class HWP5Dump(RclBaseHandler):
    """Recoll handler converting a Hancom .hwp document to HTML.

    The metadata is read from the document summaryinfo and emitted as
    <title> and <meta> elements. The text is produced by TextTransform
    and emitted, HTML-escaped, inside a <pre> element.
    """

    def __init__(self, em):
        super(HWP5Dump, self).__init__(em)

    def html_text(self, fn):
        """Return the HTML rendition of the document, as bytes.

        fn: name of the file to convert, received as bytes (a str is
            accepted too).

        Errors while reading the metadata or while running the text
        transform are logged through self.em.rclog() and whatever was
        gathered up to that point is kept. Errors while opening the
        file or building the transform propagate to the caller.
        """
        # hwp5 wants str file names. os.fsdecode() also copes with names
        # which are not valid UTF-8, where a strict decode would raise
        # UnicodeDecodeError.
        fn = os.fsdecode(fn)

        parts = [
            b'<html><head>\n',
            b'<meta http-equiv="content-type" '
            b'content="text/html; charset=utf-8">\n',
        ]

        hwpfile = fs_Hwp5File(fn)
        try:
            summary = hwpfile.summaryinfo
            title = summary.title.strip()
            if title:
                title = self.em.htmlescape(title.encode('utf-8'))
                parts.append(b'<title>' + title + b'</title>\n')
            for name, value in metafields(summary):
                if value is None:
                    # Formatting None would emit the literal text "None"
                    # as the field value.
                    continue
                value = "{0}".format(value).strip()
                if value:
                    value = self.em.htmlescape(value.encode('utf-8'))
                    parts.append(b'<meta name="' + name.encode('utf-8') +
                                 b'" content="' + value + b'">\n')
        except Exception as e:
            # Best effort: keep the metadata gathered so far.
            self.em.rclog("Exception: %s" % e)
        finally:
            hwpfile.close()

        parts.append(b'</head><body><pre>\n')

        # Build the transform before opening the file, so that a failure
        # here cannot leave the file open.
        transform = TextTransform().transform_hwp5_to_text
        hwpfile = xml_Hwp5File(fn)
        dest = BytesIO()
        try:
            transform(hwpfile, dest)
        except Exception as e:
            # Best effort: keep the text written before the failure.
            self.em.rclog("Exception: %s" % e)
        finally:
            hwpfile.close()

        parts.append(self.em.htmlescape(dest.getvalue()))
        parts.append(b'</pre></body></html>')
        return b''.join(parts)
if __name__ == '__main__':
    # Create the rclexecm protocol object, bind the extractor to it and
    # hand both to the rclexecm main function.
    executor = rclexecm.RclExecM()
    handler = HWP5Dump(executor)
    rclexecm.main(executor, handler)

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python3
# Copyright (C) 2014 J.F.Dockes
# Copyright (C) 2014-2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or

View File

@ -68,6 +68,8 @@ application/msword = execm rcldoc.py
# You can also use wvware directly but it's much slower.
# application/msword = exec wvWare --charset=utf-8 --nographics
application/x-hwp = execm rclhwp.py
# Also handle the MIME type returned by "file -i" for a suffix-less Word
# file. This could probably just as well be an Excel file, but we have to
# choose one.

View File

@ -98,6 +98,8 @@
.tbz = application/x-tar
.tar.bz2 = application/x-tar
.hwp = application/x-hwp
.doc = application/msword
.dot = application/msword
.ppt = application/vnd.ms-powerpoint

View File

@ -52,6 +52,8 @@ application/vnd.ms-powerpoint = execm python rclppt.py
application/pdf = execm python rclpdf.py
application/x-hwp = execm python rclhwp.py
application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.oasis.opendocument.presentation = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl