recoll/src/filters/rclkar
2011-01-31 20:23:56 +01:00

323 lines
11 KiB
Python
Executable File

#!/usr/bin/env python
# Read a .kar midi karaoke file and translate to recoll indexable format
import rclexecm
import sys
import os.path
import string
import re
import codecs
# Optional helper: stopwords-based classifier used to refine chardet's
# guesses for 8 bit European encodings. The filter works without it, so
# a failed import is tolerated and the name is left bound to None.
try:
    import rcllatinclass
except Exception:
    rcllatinclass = None

# Mandatory helper: nothing can be extracted without the midi module.
# Report the missing helper on stdout and give up.
try:
    import midi
except Exception:
    print("RECFILTERROR HELPERNOTFOUND python:midi")
    sys.exit(1)

# Optional helper: character encoding detection. has_chardet tells the
# rest of the file whether chardet may be used.
try:
    import chardet
    has_chardet = True
except Exception:
    has_chardet = False
# Prototype for the html document we're returning. The four %s
# placeholders are filled, in this order, with the title, the author,
# the language and the body text (see KarTextExtractor.extractone).
# The values are substituted as is: no escaping is done at this point.
htmltemplate = '''
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>%s</title>
<meta name="author" content="%s">
<meta name="language" content="%s">
</head>
<body>
%s
</body>
</html>
'''
class KarTextExtractor:
    '''File type handler turning a .kar midi karaoke file into an HTML
    document for indexing.

    The text found in the midi text, lyrics and track name events is
    collected, its character encoding is determined (from the file
    name, from chardet, or from a locale-based default), and the result
    is returned re-encoded as UTF-8 inside htmltemplate.

    The 'em' object given to the constructor is used for logging
    (rclog), HTML escaping (htmlescape) and for setting the output mime
    type (setmimetype).'''

    # Afaik, the only charset encodings with null bytes are variations on
    # utf-16 and utf-32 and iso relatives. A hopefully comprehensive
    # list follows, compiled from iconv and python values. This is used for
    # stripping garbage from some files.
    acceptnullencodings = \
        set(('csucs4', 'csunicode', 'csunicode11', 'iso-10646-ucs-2',
             'iso-10646-ucs-4', 'u16', 'u32', 'ucs-2', 'ucs-2-internal',
             'ucs-2-swapped', 'ucs-2be', 'ucs-2le', 'ucs-4', 'ucs-4-internal',
             'ucs-4-swapped', 'ucs-4be', 'ucs-4le', 'unicode-1-1', 'unicodebig',
             'unicodelittle', 'utf-16', 'utf-16be', 'utf-16le', 'utf-32',
             'utf-32be', 'utf-32le', 'utf16', 'utf32', 'utf_16', 'utf_16_be',
             'utf_16_le', 'utf_32', 'utf_32_be', 'utf_32_le'))

    def __init__(self, em):
        '''Remember the execm object and compute the fallback encoding'''
        self.em = em
        self.currentindex = 0
        self.encoding = ""
        self.defaultencoding = ""
        self.hadnulls = False
        # Stopwords-based classifier, built on first use by chardet_detect()
        self.classifier = None

        # Compute the fallback encoding to use if we can't determine
        # one when processing the file. Based on the nls environment
        try:
            self.defaultencoding = sys.getfilesystemencoding()
        except Exception:
            pass
        if self.defaultencoding is None:
            self.defaultencoding = sys.getdefaultencoding()
        # Plain ascii is useless as a fallback for 8 bit text
        if not self.defaultencoding or \
               self.defaultencoding.lower().find('ascii') != -1:
            self.defaultencoding = 'cp1252'
        # Make sure that python actually knows the chosen encoding
        try:
            codecs.lookup(self.defaultencoding)
        except Exception:
            self.defaultencoding = 'cp1252'

    def nulltrunc(self, data):
        '''Truncate data after 1st null byte. For messages with garbage after
        a null byte. Must not be done for utf-16/32 of course.

        Sets self.hadnulls when a null byte was found. Returns the
        possibly shortened data.'''
        if not data:
            return data
        firstnull = data.find(chr(0))
        if firstnull != -1:
            self.hadnulls = True
            data = data[0 : firstnull]
        return data

    def reencode(self, data):
        '''Decode from whatever encoding we think this file is using
        (self.encoding) and reencode as UTF-8. The result is also
        HTML-escaped and its newlines are turned into <br>.

        Returns an empty string if the decoding or encoding fails, and
        empty input unchanged.'''
        if data:
            try:
                data = data.decode(self.encoding, 'ignore')
            except Exception as err:
                self.em.rclog("Decode failed: " + str(err))
                return ""
            try:
                data = data.encode('utf-8')
            except Exception as err:
                self.em.rclog("Encode failed: " + str(err))
                return ""
            data = self.em.htmlescape(data).replace("\n", "<br>\n")
        return data

    def encodingfromfilename(self, fn):
        '''Compute encoding from file name: some karaoke files have the
        encoding as part of the file name as 'some title
        (encoding).xxx'. This is not an established convention though,
        just one our users could use if there is trouble with guessing
        encodings.

        Returns the text found between the parentheses, or an empty
        string. The value is not checked to be a known encoding.'''
        rexp = r'\(([^\)]+)\)\.[a-zA-Z]+$'
        m = re.search(rexp, fn)
        if m:
            return m.group(1)
        else:
            return ""

    def chardet_detect(self, text):
        '''Guess the encoding of text with chardet, refining the answer
        with the stopwords-based classifier when chardet says iso-8859-2.

        Returns an (encoding, confidence) pair. The encoding is an empty
        string, with a 0.0 confidence, when chardet has no answer.'''
        encodconf = chardet.detect(text)
        encoding = encodconf['encoding']
        confidence = encodconf['confidence']
        if encoding is None:
            # chardet could not make any guess (e.g. no usable input)
            return ("", 0.0)

        # chardet is awfully bad at detecting 8bit european
        # encodings/languages and will mostly return iso-8859-2 for
        # everything, which is a bad default (iso-8859-1/cp1252 being
        # much more common). We use our own ad-hoc stopwords based
        # module to try and improve
        if encoding.lower() == 'iso-8859-2':
            if self.classifier is None:
                try:
                    # The stopwords file is looked for in the directory
                    # of the running script
                    import __main__
                    scriptdir = os.path.dirname(__main__.__file__)
                    langszip = os.path.join(scriptdir, 'rcllatinstops.zip')
                    # Open/close only to check that the file is readable
                    f = open(langszip)
                    f.close()
                    # Keep the classifier for the next calls
                    self.classifier = \
                        rcllatinclass.European8859TextClassifier(langszip)
                except Exception:
                    self.em.rclog("Can't build euroclassifier (missing stopwords zip?")
                    return (encoding, confidence)
            try:
                lang, code, count = self.classifier.classify(text)
                if count > 0:
                    # At least one stopword matched: trust the classifier
                    confidence = 1.0
                    encoding = code
            except Exception as err:
                self.em.rclog("stopwords-based classifier failed: %s" % err)
                return (encoding, confidence)

        return (encoding, confidence)

    def _lyricstext(self, data, istrackname):
        '''Return the text contributed by a non-empty lyrics or track name
        event: a leading '/' or '\\' is turned into a newline, and a
        track name is terminated by a newline.'''
        nl = "\n" if istrackname else ""
        if data[0] == '/' or data[0] == '\\':
            return "\n" + data[1:] + nl
        return data + nl

    def extractone(self, params):
        '''Process one file.

        params must hold the path of the file under the "filename:" key.

        Text events are interpreted as follows: a leading '/' or '\\'
        starts a new line, a leading '[' or ']' is dropped, '@I' data is
        added to the text, '@L' sets the language, the first '@T' sets
        the title and the second one the author. Other '@' tags are
        ignored.

        Returns an (ok, docdata, "", eofflag) tuple where docdata is the
        HTML document. On failure ok is False and docdata is empty.'''
        docdata = ""
        ok = False
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (ok, docdata, "", rclexecm.RclExecM.eofnow)
        filename = params["filename:"]

        # Character encoding from file name ? Only keep it if python
        # knows about it.
        self.encoding = self.encodingfromfilename(filename)
        if self.encoding:
            try:
                codecs.lookup(self.encoding)
            except Exception:
                self.encoding = ""

        # Mimetype not used for now
        if "mimetype:" not in params:
            mimetype = 'audio/x-midi'
        else:
            mimetype = params["mimetype:"]

        # Read in and midi-decode the file
        try:
            stream = midi.read_midifile(filename)
        except Exception as err:
            self.em.rclog("extractone: midi extract failed: [%s]" % err)
            return (ok, docdata, "", rclexecm.RclExecM.eofnow)

        title = None
        author = None
        language = None
        lyrics = ""
        # The xxxN variables hold the raw values, not truncated at the
        # first null byte. They are used if the text turns out to use
        # one of the encodings which legitimately contain nulls.
        titleN = None
        authorN = None
        languageN = None
        lyricsN = ""

        self.hadnulls = False
        for event in stream.iterevents():
            edata = ""
            if isinstance(event, midi.TextMetaEvent):
                if not event.data:
                    continue
                elif event.data[0] == '/' or event.data[0] == '\\':
                    edata = "\n" + event.data[1:]
                elif event.data[0] == '[' or event.data[0] == ']':
                    edata = event.data[1:]
                elif event.data[0] == '@':
                    if len(event.data) == 1:
                        continue
                    tag = event.data[1]
                    if tag == 'I':
                        edata = event.data[2:] + '\n'
                    elif tag == 'L':
                        language = self.nulltrunc(event.data[2:])
                        languageN = event.data[2:]
                    elif tag == 'T':
                        if title is None:
                            title = self.nulltrunc(event.data[2:])
                            titleN = event.data[2:]
                        elif author is None:
                            author = self.nulltrunc(event.data[2:])
                            authorN = event.data[2:]
                else:
                    edata = event.data
            # NOTE(review): 'LryricsEvent' is kept as spelled in the
            # original code; presumably it matches the class name in the
            # midi module. Confirm before renaming.
            elif isinstance(event, midi.LryricsEvent) or \
                     isinstance(event, midi.TrackNameEvent):
                if not event.data:
                    continue
                edata = self._lyricstext(
                    event.data, isinstance(event, midi.TrackNameEvent))

            lyrics += self.nulltrunc(edata)
            lyricsN += edata

        # Try to guess the encoding. First do it with the data
        # possibly containing nulls. If we get one of the accepted
        # nullbyte encodings, go with this, else repeat with the
        # de-nulled data
        if self.encoding == "" and has_chardet:
            if self.hadnulls:
                (encoding, confidence) = self.chardet_detect(lyricsN)
                if confidence > 0.6 and \
                       encoding.lower() in self.acceptnullencodings:
                    self.encoding = encoding
                    # The nulls are part of the text: use the raw values
                    lyrics = lyricsN
                    title = titleN
                    author = authorN
                    language = languageN
            if self.encoding == "":
                (encoding, confidence) = self.chardet_detect(lyrics)
                if confidence > 0.6:
                    self.encoding = encoding

        if self.encoding == "":
            self.em.rclog("Encoding not guessed, defaulting to [%s]" % \
                          (self.defaultencoding,))
            self.encoding = self.defaultencoding

        if title is None:
            title = ""
        if author is None:
            author = ""
        if language is None:
            language = ""

        # Everything inserted in the document must be escaped UTF-8
        title = self.reencode(title)
        author = self.reencode(author)
        language = self.reencode(language)
        lyrics = self.reencode(lyrics)

        self.em.setmimetype("text/html")
        docdata = htmltemplate % (title, author, language, lyrics)

        ok = True
        return (ok, docdata, "", rclexecm.RclExecM.eofnext)

    ###### File type handler api, used by rclexecm. Some stuff makes little
    # sense because we only have one doc per file.
    def openfile(self, params):
        '''Prepare for processing a new file: reset the document index'''
        self.currentindex = 0
        return True

    def getipath(self, params):
        '''Return the document for params: same as extractone(), as there
        is a single document per file'''
        return self.extractone(params)

    def getnext(self, params):
        '''Return the next document of the current file: the extractone()
        result on the first call after openfile(), then an end-of-file
        answer on the following ones'''
        if self.currentindex >= 1:
            return (False, "", "", rclexecm.RclExecM.eofnow)
        else:
            ret = self.extractone(params)
            self.currentindex += 1
            return ret
# Script entry point: create the rclexecm protocol object, build the
# karaoke handler on top of it (the handler uses it for logging and
# output), then pass both to rclexecm.main().
proto = rclexecm.RclExecM()
extract = KarTextExtractor(proto)
rclexecm.main(proto, extract)