diff --git a/src/filters/msodump.zip b/src/filters/msodump.zip
new file mode 100644
index 00000000..9e038b99
Binary files /dev/null and b/src/filters/msodump.zip differ
diff --git a/src/filters/ppt-dump.py b/src/filters/ppt-dump.py
new file mode 100755
index 00000000..e94ce291
--- /dev/null
+++ b/src/filters/ppt-dump.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python2
+########################################################################
+#
+# Copyright (c) 2010 Kohei Yoshida, Thorsten Behrens
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+########################################################################
+
+import sys, os.path, getopt
+sys.path.append(sys.path[0]+"/msodump.zip/src")
+import ole, pptstream, globals, olestream
+
+from globals import error
+
+def usage (exname):
+    """Print the command line help to stdout.
+
+    exname -- path the script was invoked as; only its basename is shown.
+    """
+    exname = os.path.basename(exname)
+    msg = """Usage: %s [options] [ppt file]
+
+Options:
+  --help        displays this help message.
+  --no-struct-output suppress normal disassembly output
+  --dump-text   print the textual content
+"""%exname
+    print(msg)
+
+
+class PPTDumper(object):
+    """Dump the directory streams of one PowerPoint file."""
+
+    def __init__ (self, filepath, params):
+        """Store the input file path and the parsed options object."""
+        self.filepath = filepath
+        self.params = params
+
+    def __printDirHeader (self, dirname, byteLen):
+        """Output a banner with the stream name and its size in bytes."""
+        dirname = globals.encodeName(dirname)
+        globals.outputln("")
+        globals.outputln("="*68)
+        globals.outputln("%s (size: %d bytes)"%(dirname, byteLen))
+        globals.outputln("-"*68)
+
+    def dump (self):
+        """Dump every directory stream of the file.
+
+        Iteration stops at the first stream that cannot be fetched.
+        Returns False if readRecords() gave a false value for the
+        "PowerPoint Document" or "Current User" stream, True otherwise.
+        """
+        # 'with' closes the input even if the parser raises.
+        with open(self.filepath, 'rb') as infile:
+            strm = pptstream.PPTFile(infile.read(), self.params)
+        strm.printStreamInfo()
+        strm.printHeader()
+        strm.printDirectory()
+        result = True
+        for dirname in strm.getDirectoryNames():
+            if not dirname or dirname == 'Root Entry':
+                continue
+
+            try:
+                dirstrm = strm.getDirectoryStreamByName(dirname)
+            except Exception as err:
+                error("getDirectoryStreamByName(%s): %s\n" % (dirname,str(err)))
+                # The previous version was killed by the exception
+                # here, so the equivalent is to break, but maybe there
+                # is no reason to do so.
+                break
+            self.__printDirHeader(dirname, len(dirstrm.bytes))
+            if dirname in ("PowerPoint Document", "Current User"):
+                if not self.__readSubStream(dirstrm):
+                    result = False
+            elif dirname == "\x05DocumentSummaryInformation":
+                # Use a separate name: rebinding 'strm' here would send the
+                # next getDirectoryStreamByName() call to the property set
+                # stream object instead of the PPTFile.
+                propstrm = olestream.PropertySetStream(dirstrm.bytes)
+                propstrm.read()
+            else:
+                globals.dumpBytes(dirstrm.bytes, 512)
+        return result
+
+    def __readSubStream (self, strm):
+        """Read all records in the substream; return readRecords()'s value."""
+        return strm.readRecords()
+
+
+def main (args):
+    """Command line entry point.
+
+    args -- the full argument vector, program name first (sys.argv).
+    """
+    exname, args = args[0], args[1:]
+    if len(args) < 1:
+        print("takes at least one argument")
+        usage(exname)
+        return
+
+    params = globals.Params()
+    try:
+        opts, args = getopt.getopt(args, "h",
+                                   ["help", "debug", "show-sector-chain",
+                                    "no-struct-output", "dump-text"])
+        for opt, arg in opts:
+            if opt in ['-h', '--help']:
+                usage(exname)
+                return
+            elif opt in ['--debug']:
+                params.debug = True
+            elif opt in ['--show-sector-chain']:
+                params.showSectorChain = True
+            elif opt in ['--no-struct-output']:
+                globals.muteOutput(1)
+                params.noStructOutput = True
+            elif opt in ['--dump-text']:
+                params.dumpText = True
+            else:
+                # Defensive only: getopt rejects unknown options itself.
+                error("unknown option %s\n"%opt)
+                usage(exname)
+                return
+
+    except getopt.GetoptError:
+        error("error parsing input options\n")
+        usage(exname)
+        return
+
+    if not args:
+        # Only options were given: there is no input file left to dump.
+        print("takes at least one argument")
+        usage(exname)
+        return
+
+    dumper = PPTDumper(args[0], params)
+    if not dumper.dump():
+        error("FAILURE\n")
+    if params.dumpText:
+        print(globals.textdump.replace("\r", "\n"))
+
+if __name__ == '__main__':
+    main(sys.argv)
diff --git a/src/filters/rclppt b/src/filters/rclppt index ba5640b4..467acab6 100755 --- a/src/filters/rclppt +++ b/src/filters/rclppt @@ -17,11 +17,9 @@ #================================================================ # Handle powerpoint files for recoll. 
-# Uses catppt from the catdoc utilities -# (http://ftp.45.free.net/~vitus/software/catdoc/) -# In my experience, this sometimes fail to extract text, printing "Default -# Design" ou "format par defaut" instead and only. -# +# Uses the bundled ppt-dump.py script (msodump) to extract the text: catppt +# just can't handle the majority of semi-modern ppt files + #================================================================ # set variables @@ -96,72 +94,17 @@ umask 77 # !! Leave the following line unmodified ! #ENDRECFILTCOMMONCODE -havecappt=no -iscmd cappt && havecappt=yes -haveunoconv=no -iscmd unoconv && haveunoconv=yes -iscmd pdftotext || haveunoconv=no +filtersdir=`dirname $0` +checkcmds $filtersdir/ppt-dump.py -if test X$havecatppt = Xno -a X$haveunoconv = Xno ; then - # checkcmds will exit with the appropriate salutations - checkcmds catppt unoconv pdftotext -fi +mso="$filtersdir/ppt-dump.py --no-struct-output --dump-text" -# This needs a temp dir because we first output pdf (outputting html -# would produce one file per page), and pdftotext can't read from -# stdin -if test z"$RECOLL_TMPDIR" != z; then - ttdir=$RECOLL_TMPDIR -elif test z"$TMPDIR" != z ; then - ttdir=$TMPDIR -else - ttdir=/tmp -fi +cat < + +
+EOF
 
-tmpdir=$ttdir/rclppt_tmp$$
-mkdir $tmpdir || exit 1
-mkdir $tmpdir/rclppttmp || exit 1
+$mso "$infile"| sed -e 's/ $cattxt
-lines=`wc -l < $cattxt`
-
-if test $lines -lt 5 -a X$haveunoconv = Xyes; then
-    unoconv -f pdf -o "$unopdf" "$infile"
-    sinfile=`basename "$infile"`
-    `dirname $0`/rclpdf "$unopdf/${sinfile%.*}.pdf"
-else
-    # output the catppt result
-    echo ''
-    #echo '' "$title" ''
-    echo ''
-    echo ''
-    echo '
'
-
-    catppt -d utf-8 "$infile" | \
-        sed -e 's/'
-    echo ''
-fi
+echo '
'