diff --git a/.hgignore b/.hgignore
index 39a601d9..4c5d3123 100644
--- a/.hgignore
+++ b/.hgignore
@@ -52,6 +52,7 @@ src/doc/user/usermanual.html
 src/doc/user/usermanual.html-text
 src/doc/user/usermanual.txt
 src/filters/rclexecm.pyc
+src/filters/eulangclass.pyc
 src/index/alldeps
 src/index/recollindex
 src/lib/alldeps
diff --git a/src/filters/eulangclass.py b/src/filters/eulangclass.py
new file mode 100755
index 00000000..242f02d1
--- /dev/null
+++ b/src/filters/eulangclass.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+'''Guess the language and encoding of 8-bit european text.
+
+The most frequent words of the text are compared with per-language
+stop word lists. The lists live in a zip archive with one member per
+language, named LANGUAGE_ENCODING.EXTENSION and containing
+whitespace-separated stop words.'''
+
+import sys
+import string
+import glob
+import os
+import os.path
+from zipfile import ZipFile
+
+class European8859TextClassifier:
+    '''Stop words based language and encoding guesser'''
+
+    def __init__(self, langzip):
+        '''langzip is the path of the stop word lists zip archive'''
+        self.langtables = self.readlanguages(langzip)
+
+        # Table to translate from punctuation to spaces
+        punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
+        self.spacetable = string.maketrans(punct, " " * len(punct))
+
+    def readlanguages(self, langzip):
+        '''Read the languages stopwords lists from the zip archive.
+
+        Returns a list of (language, encoding, stopwords) tuples.
+        Members which are not named like LANGUAGE_ENCODING.EXTENSION
+        are ignored'''
+        langs = []
+        archive = ZipFile(langzip)
+        try:
+            for fn in archive.namelist():
+                langcode = os.path.splitext(os.path.basename(fn))[0]
+                parts = langcode.split('_')
+                if len(parts) != 2:
+                    # Directory entry or stray file: not a word list
+                    continue
+                # Sets make for cheap membership tests in classify()
+                words = frozenset(archive.read(fn).split())
+                langs.append((parts[0], parts[1], words))
+        finally:
+            # Don't leak the archive file descriptor
+            archive.close()
+        return langs
+
+    def classify(self, rawtext, ntest=10, verbose=False):
+        '''Guess the language and encoding of rawtext (a byte string).
+
+        The ntest most frequent words are looked up in each stop
+        words list and the list with the most hits wins. If verbose
+        is set, the per-language counts are traced to stderr.
+
+        Returns (language, encoding, matchcount), which is
+        ('english', 'cp1252', 0) if no stop word was found at all'''
+
+        # Remove punctuation and split into words
+        words = rawtext.translate(self.spacetable).split()
+        # Count frequencies
+        freqs = {}
+        for w in words:
+            freqs[w] = freqs.get(w, 0) + 1
+        # Order word list by frequency and keep the ntest first
+        lfreq = sorted(freqs.items(),
+                       key=lambda entry: entry[1], reverse=True)
+        topwords = [w for w, c in lfreq[0:ntest]]
+        # Check them against the language lists and choose the best match
+        maxcount = 0
+        maxlang = ""
+        maxcode = ""
+        for lang, code, lwords in self.langtables:
+            count = len([w for w in topwords if w in lwords])
+            if verbose:
+                # Never trace to stdout: it may be carrying the
+                # protocol of the filter which is using us
+                sys.stderr.write("Lang %s code %s count %d\n" %
+                                 (lang, code, count))
+            if maxcount < count:
+                maxlang = lang
+                maxcount = count
+                maxcode = code
+        # If match too bad, default to most common
+        if maxcount == 0:
+            maxlang, maxcode = ('english', 'cp1252')
+        return (maxlang, maxcode, maxcount)
+
+
+if __name__ == "__main__":
+    f = open(sys.argv[1], 'rb')
+    try:
+        rawtext = f.read()
+    finally:
+        f.close()
+
+    langszip = os.path.join(os.path.dirname(__file__), 'iso8859stops.zip')
+
+    classifier = European8859TextClassifier(langszip)
+
+    lang, code, count = classifier.classify(rawtext, verbose=True)
+    print "Chosen lang/code/matchcount: %s %s %d" % (lang, code, count)
diff --git a/src/filters/iso8859stops.zip b/src/filters/iso8859stops.zip
new file mode 100644
index 00000000..0b0c6440
Binary files /dev/null and b/src/filters/iso8859stops.zip differ
diff --git a/src/filters/rclkar b/src/filters/rclkar
index 52974898..bcdf82bc 100755
--- a/src/filters/rclkar
+++ b/src/filters/rclkar
@@ -9,12 +9,20 @@
 import string
 import re
 import codecs
 
+import eulangclass
+
 try:
     import midi
 except:
     print "RECFILTERROR HELPERNOTFOUND python:midi"
     sys.exit(1);
 
+try:
+    import chardet
+    has_chardet = True
+except:
+    has_chardet = False
+
 # Prototype for the html document we're returning
 htmltemplate = '''
@@ -49,53 +57,72 @@ class KarTextExtractor:
         self.currentindex = 0
         self.encoding = ""
         self.defaultencoding = ""
-        self.acceptnulls = False
+        self.hadnulls = False
 
+        # Compute the fallback encoding to use if we can't determine
+        # one when processing the file. Based on the nls environment
         try:
             self.defaultencoding = sys.getfilesystemencoding()
         except:
             pass
+
         if self.defaultencoding is None:
             self.defaultencoding = sys.getdefaultencoding()
 
         if not self.defaultencoding or \
                self.defaultencoding.lower().find('ascii') != -1:
-            self.defaultencoding = 'latin_1'
+            self.defaultencoding = 'cp1252'
+
         try:
             codecs.lookup(self.defaultencoding)
         except:
-            self.defaultencoding = 'latin_1'
+            self.defaultencoding = 'cp1252'
+
+
+    def nulltrunc(self, data):
+        '''Truncate data after 1st null byte. For messages with garbage after
+        a null byte. Must not be done for utf-16/32 of course'''
 
-    # Try to decode input binary string then encode to utf-8 for output
-    def reencode(self, data):
-        text = ""
         if not data:
-            return text
+            return data
 
-        # Some files have garbage data after a null byte.
-        if not self.acceptnulls:
-            firstnull = data.find(chr(0))
-            if firstnull != -1:
-                data = data[0 : firstnull]
-
-        try:
-            text = data.decode(self.encoding, 'ignore')
-        except Exception, err:
-            self.em.rclog("Decode failed: " + str(err))
-            return ""
-        try:
-            text = text.encode('utf-8')
-        except Exception, err:
-            self.em.rclog("Encode failed: " + str(err))
-            return ""
+        firstnull = data.find(chr(0))
+        if firstnull != -1:
+            self.hadnulls = True
+            data = data[0 : firstnull]
+        return data
+
+
+    def reencode(self, data):
+        '''Decode from whatever encoding we think this file is using
+        and reencode as UTF-8'''
+
+        # self.em.rclog("Reencoding from [%s] to UTF-8" % self.encoding)
+
+        if data:
+            try:
+                data = data.decode(self.encoding, 'ignore')
+            except Exception, err:
+                self.em.rclog("Decode failed: " + str(err))
+                return ""
+            try:
+                data = data.encode('utf-8')
+            except Exception, err:
+                self.em.rclog("Encode failed: " + str(err))
+                return ""
 
-        text = self.em.htmlescape(text).replace("\n", "<br>\n")
-        return text
+            data = self.em.htmlescape(data).replace("\n", "<br>\n")
+
+        return data
+
+
 
-    # Some karaoke files have the encoding as part of the file name
-    # as 'some title (encoding).xxx' Not sure the whitespace before
-    # the '(' has to be there, so not relying on this
     def encodingfromfilename(self, fn):
+        '''Compute encoding from file name: some karaoke files have the
+        encoding as part of the file name as 'some title
+        (encoding).xxx'. This is not an established convention though,
+        just one our users could use if there is trouble with guessing
+        encodings'''
+
         rexp = r'\(([^\)]+)\)\.[a-zA-Z]+$'
         m = re.search(rexp, fn)
         if m:
@@ -103,7 +130,47 @@ class KarTextExtractor:
         else:
             return ""
 
+    def chardet_detect(self, text):
+        '''Guess the encoding of text, a raw byte string.
+
+        Returns an (encoding, confidence) pair. encoding can be None
+        if chardet could not make a guess'''
+        encodconf = chardet.detect(text)
+        encoding = encodconf['encoding']
+        confidence = encodconf['confidence']
+        self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence))
+        # chardet is awfully bad at detecting 8bit european
+        # encodings/languages and will mostly return iso-8859-2 for
+        # everything, which is a bad default (iso-8859-1/cp1252 being
+        # much more common). We use our own ad-hoc stopwords based
+        # module to try and improve
+        if encoding and encoding.lower() == 'iso-8859-2':
+            try:
+                import __main__
+                dir = os.path.dirname(__main__.__file__)
+                langszip = os.path.join(dir, 'iso8859stops.zip')
+                f = open(langszip)
+                f.close()
+            except:
+                self.em.rclog("Can't the find the language stopwords zipfile")
+                return (encoding, confidence)
+            try:
+                classifier = eulangclass.European8859TextClassifier(langszip)
+                lang,code,count = classifier.classify(text)
+                self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \
+                              (lang, code, count))
+                if count > 0:
+                    confidence = 1.0
+                    encoding = code
+            except Exception, err:
+                self.em.rclog("stopwords-based classifier failed: %s" % err)
+                return (encoding, confidence)
+
+        return (encoding, confidence)
+
+
     def extractone(self, params):
+        '''Process one file'''
         docdata = ""
         ok = False
 
@@ -112,15 +179,13 @@ class KarTextExtractor:
             return (ok, docdata, "", rclexecm.RclExecM.eofnow)
         filename = params["filename:"]
 
+        # Character encoding from file name ?
         self.encoding = self.encodingfromfilename(filename)
-        try:
-            codecs.lookup(self.encoding)
-        except:
-            self.em.rclog("Encoding [%s] not found, defaulting to [%s]" % \
-                          (self.encoding, self.defaultencoding))
-            self.encoding = self.defaultencoding
-
-        self.acceptnulls = self.encoding.lower() in self.acceptnullencodings
+        if self.encoding:
+            try:
+                codecs.lookup(self.encoding)
+            except:
+                self.encoding = ""
 
         # Mimetype not used for now
         if not params.has_key("mimetype:"):
@@ -128,41 +193,49 @@ class KarTextExtractor:
         else:
             mimetype = params["mimetype:"]
 
+        # Read in and midi-decode the file
         try:
             stream = midi.read_midifile(filename)
         except Exception, err:
-            self.em.rclog("extractone: extract failed: [%s]" % err)
+            self.em.rclog("extractone: midi extract failed: [%s]" % err)
             return (ok, docdata, "", rclexecm.RclExecM.eofnow)
 
         title = None
         author = None
         language = None
         lyrics = ""
-
+        lyricsN = ""
+        # Untruncated copies of the fields, must exist even if unset
+        titleN = authorN = languageN = None
+        self.hadnulls = False
+
         for event in stream.iterevents():
             edata = ""
             if isinstance(event, midi.TextMetaEvent):
                 if not event.data:
                     continue
                 elif event.data[0] == '/' or event.data[0] == '\\':
-                    edata += "\n" + event.data[1:]
+                    edata = "\n" + event.data[1:]
                 elif event.data[0] == '[' or event.data[0] == ']':
-                    edata += event.data[1:]
+                    edata = event.data[1:]
                 elif event.data[0] == '@':
                     if len(event.data) == 1:
                         continue
                     else:
                         if event.data[1] == 'I':
-                            edata += event.data[2:] + '\n'
+                            edata = event.data[2:] + '\n'
                         elif event.data[1] == 'L':
-                            language = self.reencode(event.data[2:])
+                            language = self.nulltrunc(event.data[2:])
+                            languageN = event.data[2:]
                         elif event.data[1] == 'T':
                             if title is None:
-                                title = self.reencode(event.data[2:])
+                                title = self.nulltrunc(event.data[2:])
+                                titleN = event.data[2:]
                             elif author is None:
-                                author = self.reencode(event.data[2:])
+                                author = self.nulltrunc(event.data[2:])
+                                authorN = event.data[2:]
                 else:
-                    edata += event.data
+                    edata = event.data
             elif isinstance(event, midi.LryricsEvent) or \
                  isinstance(event, midi.TrackNameEvent):
                 space = ""
@@ -171,13 +244,45 @@ class KarTextExtractor:
                 if not event.data:
                     continue
                 elif event.data[0] == '/' or event.data[0] == '\\':
-                    edata += "\n" + event.data[1:] + nl
+                    edata = "\n" + event.data[1:] + nl
                 else:
-                    edata += event.data + nl
+                    edata = event.data + nl
 
-            lyrics += self.reencode(edata)
+            lyrics += self.nulltrunc(edata)
+            lyricsN += edata
 
-
+
+        # Try to guess the encoding. First do it with the data
+        # possibly containing nulls. If we get one of the accepted
+        # nullbyte encodings, go with this, else repeat with the
+        # de-nulled data
+
+        # self.em.rclog("Lyrics length %d" % len(lyrics))
+
+        if self.encoding == "" and has_chardet:
+            if self.hadnulls:
+                (encoding, confidence) = self.chardet_detect(lyricsN)
+                # self.em.rclog("With nulls: chardet: enc [%s], conf %.2f" % \
+                #              (encoding, confidence))
+                if confidence > 0.6 and \
+                       encoding.lower() in self.acceptnullencodings:
+                    self.encoding = encoding
+                    lyrics = lyricsN
+                    title = titleN
+                    author = authorN
+                    language = languageN
+            if self.encoding == "":
+                (encoding, confidence) = self.chardet_detect(lyrics)
+                self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \
+                              (encoding, confidence))
+                if confidence > 0.6:
+                    self.encoding = encoding
+
+        if self.encoding == "":
+            self.em.rclog("Encoding not guessed, defaulting to [%s]" % \
+                          (self.defaultencoding,))
+            self.encoding = self.defaultencoding
+
         if title is None:
             title = ""
         if author is None:
@@ -185,6 +290,11 @@ class KarTextExtractor:
         if language is None:
             language = ""
 
+        title = self.reencode(title)
+        author = self.reencode(author)
+        language = self.reencode(language)
+        lyrics = self.reencode(lyrics)
+
         self.em.setmimetype("text/html")
         docdata = htmltemplate % (title, author, language, lyrics)
 