diff --git a/src/Makefile.am b/src/Makefile.am index 940218e4..73f2f415 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -674,6 +674,7 @@ filters/rclexecm.py \ filters/rclfb2.py \ filters/rclgaim \ filters/rclgenxslt.py \ +filters/rclhwp.py \ filters/rclics \ filters/rclimg \ filters/rclimg.py \ diff --git a/src/filters/rclhwp.py b/src/filters/rclhwp.py new file mode 100755 index 00000000..9e8bbf26 --- /dev/null +++ b/src/filters/rclhwp.py @@ -0,0 +1,115 @@ +#!/usr/bin/python3 +# Copyright (C) 2020 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the +# Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +######################################### +# Recoll Hanword .hwp handler +# +# The real work is done by pyhwp: +# https://github.com/mete0r/pyhwp +# https://pypi.org/project/pyhwp/ +# pip3 install pyhwp +# + +import sys +from io import BytesIO + +import rclexecm +from rclbasehandler import RclBaseHandler + +from hwp5.filestructure import Hwp5File as fs_Hwp5File +from hwp5.transforms import BaseTransform +from hwp5.xmlmodel import Hwp5File as xml_Hwp5File +from hwp5.utils import cached_property + + +# This was duplicated from hwp5 hwp5text.py and I don't really +# understand what it does... 
+RESOURCE_PATH_XSL_TEXT = 'xsl/plaintext.xsl'
+
+
+class TextTransform(BaseTransform):
+    """hwp5 transform bound to the plain text XSL stylesheet.
+
+    Both attributes only wire RESOURCE_PATH_XSL_TEXT into factory
+    methods inherited from BaseTransform.
+    """
+
+    @property
+    def transform_hwp5_to_text(self):
+        """Return the callable used as transform(hwpfile, dest)."""
+        return self.make_transform_hwp5(self.transform_xhwp5_to_text)
+
+    @cached_property
+    def transform_xhwp5_to_text(self):
+        """Return the XSL transform built from RESOURCE_PATH_XSL_TEXT."""
+        return self.make_xsl_transform(RESOURCE_PATH_XSL_TEXT)
+
+
+# Associate HTML meta names and hwp summaryinfo values
+def metafields(summaryinfo):
+    """Yield (meta name, value) pairs read from an hwp summaryinfo.
+
+    Values are yielded as found on the object; the caller is in charge
+    of converting them to text and of dropping the empty ones.
+    """
+    yield ('Description',
+           summaryinfo.subject + " " + summaryinfo.comments)
+    yield ('Author', summaryinfo.author)
+    yield ('Keywords', summaryinfo.keywords)
+    yield ('Date', summaryinfo.lastSavedTime)
+
+
+# Extractor class. We use hwp summaryinfo to extract metadata and code
+# extracted from hwp.hwp5txt.py to extract the text.
+class HWP5Dump(RclBaseHandler):
+    """Recoll handler turning an .hwp file into an HTML document."""
+
+    def __init__(self, em):
+        super(HWP5Dump, self).__init__(em)
+
+    def html_text(self, fn):
+        """Build the HTML rendition of the document stored in fn.
+
+        fn is the file name as bytes; it is decoded as UTF-8 because
+        hwp5 wants str file names.
+
+        Returns bytes: an HTML head holding the title and one meta
+        element per non-empty summaryinfo field, then the extracted
+        text, HTML-escaped, inside a pre element.
+
+        Exceptions raised while reading the metadata or while running
+        the text transform are logged with self.em.rclog and whatever
+        was gathered up to that point is still returned.
+        """
+        # hwp wants str filenames. This is unfortunate
+        fn = fn.decode('utf-8')
+        escape = self.em.htmlescape
+
+        parts = [
+            b'<html><head>\n',
+            b'<meta http-equiv="Content-Type" '
+            b'content="text/html;charset=UTF-8">\n',
+        ]
+
+        hwpfile = fs_Hwp5File(fn)
+        try:
+            title = hwpfile.summaryinfo.title.strip()
+            if title:
+                parts.append(b'<title>' + escape(title.encode('utf-8'))
+                             + b'</title>\n')
+
+            for name, value in metafields(hwpfile.summaryinfo):
+                value = "{0}".format(value).strip()
+                if not value:
+                    continue
+                parts.append(b'<meta name="' + name.encode('utf-8')
+                             + b'" content="'
+                             + escape(value.encode('utf-8'))
+                             + b'">\n')
+        except Exception as e:
+            self.em.rclog("Exception: %s" % e)
+        finally:
+            hwpfile.close()
+
+        parts.append(b'</head><body><pre>\n')
+
+        # Build the transform before opening the file, so that a failure
+        # here cannot leave an open Hwp5File behind.
+        transform = TextTransform().transform_hwp5_to_text
+        dest = BytesIO()
+        hwpfile = xml_Hwp5File(fn)
+        try:
+            transform(hwpfile, dest)
+        except Exception as e:
+            self.em.rclog("Exception: %s" % e)
+        finally:
+            hwpfile.close()
+        # getvalue() returns the whole buffer whatever the stream position
+        parts.append(escape(dest.getvalue()))
+        parts.append(b'</pre></body></html>')
+        return b''.join(parts)
+
+
+if __name__ == '__main__':
+    proto = rclexecm.RclExecM()
+    extract = HWP5Dump(proto)
+    rclexecm.main(proto, extract)
diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 830b8e6c..98f1e39a 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright (C) 2014 J.F.Dockes +# Copyright (C) 2014-2020 J.F.Dockes # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index 2c91201f..b72f96b7 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -68,6 +68,8 @@ application/msword = execm rcldoc.py # You can also use wvware directly but it's much slower. # application/msword = exec wvWare --charset=utf-8 --nographics +application/x-hwp = execm rclhwp.py + # Also Handle the mime type returned by "file -i" for a suffix-less word # file. This could probably just as well be an excel file, but we have to # chose one. 
diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap index 90db7ecd..e1c37d57 100644 --- a/src/sampleconf/mimemap +++ b/src/sampleconf/mimemap @@ -98,6 +98,8 @@ .tbz = application/x-tar .tar.bz2 = application/x-tar +.hwp = application/x-hwp + .doc = application/msword .dot = application/msword .ppt = application/vnd.ms-powerpoint diff --git a/src/windows/mimeconf b/src/windows/mimeconf index 20ab73dc..2be6d7f6 100644 --- a/src/windows/mimeconf +++ b/src/windows/mimeconf @@ -52,6 +52,8 @@ application/vnd.ms-powerpoint = execm python rclppt.py application/pdf = execm python rclpdf.py +application/x-hwp = execm python rclhwp.py + application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl application/vnd.oasis.opendocument.presentation = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl