diff --git a/src/Makefile.am b/src/Makefile.am index 940218e4..73f2f415 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -674,6 +674,7 @@ filters/rclexecm.py \ filters/rclfb2.py \ filters/rclgaim \ filters/rclgenxslt.py \ +filters/rclhwp.py \ filters/rclics \ filters/rclimg \ filters/rclimg.py \ diff --git a/src/filters/rclhwp.py b/src/filters/rclhwp.py new file mode 100755 index 00000000..9e8bbf26 --- /dev/null +++ b/src/filters/rclhwp.py @@ -0,0 +1,115 @@ +#!/usr/bin/python3 +# Copyright (C) 2020 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +######################################### +# Recoll Hanword .hwp handler +# +# The real work is done by pyhwp: +# https://github.com/mete0r/pyhwp +# https://pypi.org/project/pyhwp/ +# pip3 install pyhwp +# + +import sys +from io import BytesIO + +import rclexecm +from rclbasehandler import RclBaseHandler + +from hwp5.filestructure import Hwp5File as fs_Hwp5File +from hwp5.transforms import BaseTransform +from hwp5.xmlmodel import Hwp5File as xml_Hwp5File +from hwp5.utils import cached_property + + +# This was duplicated from hwp5 hwp5text.py and I don't really +# understand what it does... 
+RESOURCE_PATH_XSL_TEXT = 'xsl/plaintext.xsl' +class TextTransform(BaseTransform): + @property + def transform_hwp5_to_text(self): + transform_xhwp5 = self.transform_xhwp5_to_text + return self.make_transform_hwp5(transform_xhwp5) + @cached_property + def transform_xhwp5_to_text(self): + resource_path = RESOURCE_PATH_XSL_TEXT + return self.make_xsl_transform(resource_path) + + +# Associate HTML meta names and hwp summaryinfo values +def metafields(summaryinfo): + yield(('Description', summaryinfo.subject + " " + + summaryinfo.comments)) + yield(('Author', summaryinfo.author)) + yield(('Keywords', summaryinfo.keywords)) + yield(('Date', summaryinfo.lastSavedTime)) + + +# Extractor class. We use hwp summaryinfo to extract metadata and code +# extracted from hwp.hwp5txt.py to extract the text. +class HWP5Dump(RclBaseHandler): + def __init__(self, em): + super(HWP5Dump, self).__init__(em) + + def html_text(self, fn): + # hwp wants str filenames. This is unfortunate + fn = fn.decode('utf-8') + + html = b'
\n' + \ + b'\n' + + hwpfile = fs_Hwp5File(fn) + try: + tt = hwpfile.summaryinfo.title.strip() + if tt: + tt = self.em.htmlescape(tt.encode('utf-8')) + html += b'\n'
+
+ hwpfile = xml_Hwp5File(fn)
+ text_transform = TextTransform()
+ transform = text_transform.transform_hwp5_to_text
+ dest = BytesIO()
+ try:
+ transform(hwpfile, dest)
+ except Exception as e:
+ self.em.rclog("Exception: %s" % e)
+ finally:
+ hwpfile.close()
+ dest.seek(0)
+ html += self.em.htmlescape(dest.read())
+ html += b''
+ return html
+
+if __name__ == '__main__':  # script entry point: hand the extractor to rclexecm.main()
+    em = rclexecm.RclExecM()
+    handler = HWP5Dump(em)
+    rclexecm.main(em, handler)
diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py
index 830b8e6c..98f1e39a 100755
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
-# Copyright (C) 2014 J.F.Dockes
+# Copyright (C) 2014-2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf
index 2c91201f..b72f96b7 100644
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@@ -68,6 +68,8 @@ application/msword = execm rcldoc.py
# You can also use wvware directly but it's much slower.
# application/msword = exec wvWare --charset=utf-8 --nographics
+application/x-hwp = execm rclhwp.py
+
# Also Handle the mime type returned by "file -i" for a suffix-less word
# file. This could probably just as well be an excel file, but we have to
# chose one.
diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap
index 90db7ecd..e1c37d57 100644
--- a/src/sampleconf/mimemap
+++ b/src/sampleconf/mimemap
@@ -98,6 +98,8 @@
.tbz = application/x-tar
.tar.bz2 = application/x-tar
+.hwp = application/x-hwp
+
.doc = application/msword
.dot = application/msword
.ppt = application/vnd.ms-powerpoint
diff --git a/src/windows/mimeconf b/src/windows/mimeconf
index 20ab73dc..2be6d7f6 100644
--- a/src/windows/mimeconf
+++ b/src/windows/mimeconf
@@ -52,6 +52,8 @@ application/vnd.ms-powerpoint = execm python rclppt.py
application/pdf = execm python rclpdf.py
+application/x-hwp = execm python rclhwp.py
+
application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.oasis.opendocument.presentation = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl