diff --git a/.hgignore b/.hgignore
index 39a601d9..4c5d3123 100644
--- a/.hgignore
+++ b/.hgignore
@@ -52,6 +52,7 @@ src/doc/user/usermanual.html
src/doc/user/usermanual.html-text
src/doc/user/usermanual.txt
src/filters/rclexecm.pyc
+src/filters/eulangclass.pyc
src/index/alldeps
src/index/recollindex
src/lib/alldeps
diff --git a/src/filters/eulangclass.py b/src/filters/eulangclass.py
new file mode 100755
index 00000000..242f02d1
--- /dev/null
+++ b/src/filters/eulangclass.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+import sys
+import string
+import glob
+import os
+import os.path
+from zipfile import ZipFile
+
+class European8859TextClassifier:
+    '''Guess the language and 8 bit encoding of a text from stopword lists'''
+    def __init__(self, langzip):
+        '''langzip: path to a zip archive of lang_code.ext stopword files'''
+        self.langtables = self.readlanguages(langzip)
+        # Table to translate from punctuation to spaces
+        punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
+        self.spacetable = string.maketrans(punct, " " * len(punct))
+
+    def readlanguages(self, langzip):
+        '''Read the stopword lists, return a list of (lang, code, wordset)'''
+        langs = []
+        zf = ZipFile(langzip)
+        try:
+            for fn in zf.namelist():
+                langcode = os.path.splitext(os.path.basename(fn))[0]
+                parts = langcode.split('_')
+                # Skip directory entries and anything not named lang_code
+                if len(parts) != 2:
+                    continue
+                # Sets keep the membership tests in classify() cheap
+                langs.append((parts[0], parts[1], set(zf.read(fn).split())))
+        finally:
+            zf.close()
+        return langs
+
+    def classify(self, rawtext):
+        '''Return (lang, code, matchcount) for the best matching language,
+        or ('english', 'cp1252', 0) when no stopword matched at all'''
+        # Remove punctuation and split into words
+        words = rawtext.translate(self.spacetable).split()
+        # Count frequencies
+        freqs = {}
+        for w in words:
+            freqs[w] = freqs.get(w, 0) + 1
+        # Order word list by frequency
+        lfreq = sorted(freqs.iteritems(),
+                       key=lambda entry: entry[1], reverse=True)
+        # Check the ntest most frequent words against the language lists
+        # and choose the best match
+        ntest = 10
+        topwords = [w for w, c in lfreq[0:ntest]]
+        maxcount = 0
+        maxlang = ""
+        maxcode = ""
+        for lang, code, lwords in self.langtables:
+            count = len([w for w in topwords if w in lwords])
+            # Trace on stderr: stdout may be carrying the filter protocol
+            sys.stderr.write("Lang %s code %s count %d\n" %
+                             (lang, code, count))
+            if maxcount < count:
+                maxlang = lang
+                maxcount = count
+                maxcode = code
+        # If match too bad, default to most common
+        if maxcount == 0:
+            maxlang, maxcode = ('english', 'cp1252')
+        return (maxlang, maxcode, maxcount)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("Usage: %s <textfile>" % sys.argv[0])
+    f = open(sys.argv[1], 'rb')
+    try:
+        rawtext = f.read()
+    finally:
+        f.close()
+    # The stopwords archive is looked up in the directory of this script
+    langszip = os.path.join(os.path.dirname(__file__), 'iso8859stops.zip')
+    lang, code, count = European8859TextClassifier(langszip).classify(rawtext)
+    print "Chosen lang/code/matchcount: %s %s %d" % (lang, code, count)
diff --git a/src/filters/iso8859stops.zip b/src/filters/iso8859stops.zip
new file mode 100644
index 00000000..0b0c6440
Binary files /dev/null and b/src/filters/iso8859stops.zip differ
diff --git a/src/filters/rclkar b/src/filters/rclkar
index 52974898..bcdf82bc 100755
--- a/src/filters/rclkar
+++ b/src/filters/rclkar
@@ -9,12 +9,20 @@ import string
import re
import codecs
+import eulangclass
+
try:
import midi
except:
print "RECFILTERROR HELPERNOTFOUND python:midi"
sys.exit(1);
+try:
+ import chardet
+ has_chardet = True
+except:
+ has_chardet = False
+
# Prototype for the html document we're returning
htmltemplate = '''
@@ -49,53 +57,72 @@ class KarTextExtractor:
self.currentindex = 0
self.encoding = ""
self.defaultencoding = ""
- self.acceptnulls = False
+ self.hadnulls = False
+ # Compute the fallback encoding to use if we can't determine
+ # one when processing the file. Based on the nls environment
try:
self.defaultencoding = sys.getfilesystemencoding()
except:
pass
+
if self.defaultencoding is None:
self.defaultencoding = sys.getdefaultencoding()
if not self.defaultencoding or \
self.defaultencoding.lower().find('ascii') != -1:
- self.defaultencoding = 'latin_1'
+ self.defaultencoding = 'cp1252'
+
try:
codecs.lookup(self.defaultencoding)
except:
- self.defaultencoding = 'latin_1'
+ self.defaultencoding = 'cp1252'
+
+
+ def nulltrunc(self, data):
+ '''Truncate data after 1st null byte. For messages with garbage after
+ a null byte. Must not be done for utf-16/32 of course'''
- # Try to decode input binary string then encode to utf-8 for output
- def reencode(self, data):
- text = ""
if not data:
- return text
+ return data
- # Some files have garbage data after a null byte.
- if not self.acceptnulls:
- firstnull = data.find(chr(0))
- if firstnull != -1:
- data = data[0 : firstnull]
-
- try:
- text = data.decode(self.encoding, 'ignore')
- except Exception, err:
- self.em.rclog("Decode failed: " + str(err))
- return ""
- try:
- text = text.encode('utf-8')
- except Exception, err:
- self.em.rclog("Encode failed: " + str(err))
- return ""
+ firstnull = data.find(chr(0))
+ if firstnull != -1:
+ self.hadnulls = True
+ data = data[0 : firstnull]
+ return data
+
+
+ def reencode(self, data):
+ '''Decode from whatever encoding we think this file is using
+ and reencode as UTF-8'''
+
+ # self.em.rclog("Reencoding from [%s] to UTF-8" % self.encoding)
+
+ if data:
+ try:
+ data = data.decode(self.encoding, 'ignore')
+ except Exception, err:
+ self.em.rclog("Decode failed: " + str(err))
+ return ""
+ try:
+ data = data.encode('utf-8')
+ except Exception, err:
+ self.em.rclog("Encode failed: " + str(err))
+ return ""
- text = self.em.htmlescape(text).replace("\n", "<br>\n")
- return text
+ data = self.em.htmlescape(data).replace("\n", "<br>\n")
+
+ return data
+
- # Some karaoke files have the encoding as part of the file name
- # as 'some title (encoding).xxx' Not sure the whitespace before
- # the '(' has to be there, so not relying on this
def encodingfromfilename(self, fn):
+ '''Compute encoding from file name: some karaoke files have the
+ encoding as part of the file name as 'some title
+ (encoding).xxx'. This is not an established convention though,
+ just one our users could use if there is trouble with guessing
+ encodings'''
+
rexp = r'\(([^\)]+)\)\.[a-zA-Z]+$'
m = re.search(rexp, fn)
if m:
@@ -103,7 +130,43 @@ class KarTextExtractor:
else:
return ""
+    def chardet_detect(self, text):
+        '''Guess encoding of text: return (encoding or None, confidence)'''
+        encodconf = chardet.detect(text)
+        encoding = encodconf['encoding']
+        confidence = encodconf['confidence']
+        self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence))
+        # chardet is awfully bad at detecting 8bit european
+        # encodings/languages and will mostly return iso-8859-2 for
+        # everything, which is a bad default (iso-8859-1/cp1252 being
+        # much more common). We use our own ad-hoc stopwords based
+        # module to try and improve. encoding is None if chardet gave up
+        if not encoding or encoding.lower() != 'iso-8859-2':
+            return (encoding, confidence)
+        try:
+            import __main__
+            here = os.path.dirname(__main__.__file__)
+            langszip = os.path.join(here, 'iso8859stops.zip')
+            f = open(langszip)
+            f.close()
+        except Exception:
+            self.em.rclog("Can't find the language stopwords zipfile")
+            return (encoding, confidence)
+        try:
+            classifier = eulangclass.European8859TextClassifier(langszip)
+            lang,code,count = classifier.classify(text)
+            self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \
+                          (lang, code, count))
+            if count > 0:
+                confidence = 1.0
+                encoding = code
+        except Exception, err:
+            self.em.rclog("stopwords-based classifier failed: %s" % err)
+        return (encoding, confidence)
+
+
def extractone(self, params):
+ '''Process one file'''
docdata = ""
ok = False
@@ -112,15 +175,13 @@ class KarTextExtractor:
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"]
+ # Character encoding from file name ?
self.encoding = self.encodingfromfilename(filename)
- try:
- codecs.lookup(self.encoding)
- except:
- self.em.rclog("Encoding [%s] not found, defaulting to [%s]" % \
- (self.encoding, self.defaultencoding))
- self.encoding = self.defaultencoding
-
- self.acceptnulls = self.encoding.lower() in self.acceptnullencodings
+ if self.encoding:
+ try:
+ codecs.lookup(self.encoding)
+ except:
+ self.encoding = ""
# Mimetype not used for now
if not params.has_key("mimetype:"):
@@ -128,41 +189,47 @@ class KarTextExtractor:
else:
mimetype = params["mimetype:"]
+ # Read in and midi-decode the file
try:
stream = midi.read_midifile(filename)
except Exception, err:
- self.em.rclog("extractone: extract failed: [%s]" % err)
+ self.em.rclog("extractone: midi extract failed: [%s]" % err)
return (ok, docdata, "", rclexecm.RclExecM.eofnow)
title = None
author = None
language = None
lyrics = ""
-
+ lyricsN = ""
+ self.hadnulls = False
+
for event in stream.iterevents():
edata = ""
if isinstance(event, midi.TextMetaEvent):
if not event.data:
continue
elif event.data[0] == '/' or event.data[0] == '\\':
- edata += "\n" + event.data[1:]
+ edata = "\n" + event.data[1:]
elif event.data[0] == '[' or event.data[0] == ']':
- edata += event.data[1:]
+ edata = event.data[1:]
elif event.data[0] == '@':
if len(event.data) == 1:
continue
else:
if event.data[1] == 'I':
- edata += event.data[2:] + '\n'
+ edata = event.data[2:] + '\n'
elif event.data[1] == 'L':
- language = self.reencode(event.data[2:])
+ language = self.nulltrunc(event.data[2:])
+ languageN = event.data[2:]
elif event.data[1] == 'T':
if title is None:
- title = self.reencode(event.data[2:])
+ title = self.nulltrunc(event.data[2:])
+ titleN = event.data[2:]
elif author is None:
- author = self.reencode(event.data[2:])
+ author = self.nulltrunc(event.data[2:])
+ authorN = event.data[2:]
else:
- edata += event.data
+ edata = event.data
elif isinstance(event, midi.LryricsEvent) or \
isinstance(event, midi.TrackNameEvent):
space = ""
@@ -171,13 +238,44 @@ class KarTextExtractor:
if not event.data:
continue
elif event.data[0] == '/' or event.data[0] == '\\':
- edata += "\n" + event.data[1:] + nl
+ edata = "\n" + event.data[1:] + nl
else:
- edata += event.data + nl
+ edata = event.data + nl
- lyrics += self.reencode(edata)
+ lyrics += self.nulltrunc(edata)
+ lyricsN += edata
-
+
+ # Try to guess the encoding. First do it with the data
+ # possibly containing nulls. If we get one of the accepted
+ # nullbyte encodings, go with this, else repeat with the
+ # de-nulled data
+
+ # self.em.rclog("Lyrics length %d" % len(lyrics))
+
+ if self.encoding == "" and has_chardet:
+ if self.hadnulls:
+ (encoding, confidence) = self.chardet_detect(lyricsN)
+ # self.em.rclog("With nulls: chardet: enc [%s], conf %.2f" % \
+ # (encoding, confidence))
+ if confidence > 0.6 and \
+ encoding.lower() in self.acceptnullencodings:
+ self.encoding = encoding
+ lyrics = lyricsN
+ title = titleN
+ author = authorN
+ if self.encoding == "":
+ (encoding, confidence) = self.chardet_detect(lyrics)
+ self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \
+ (encoding, confidence))
+ if confidence > 0.6:
+ self.encoding = encoding
+
+ if self.encoding == "":
+ self.em.rclog("Encoding not guessed, defaulting to [%s]" % \
+ (self.defaultencoding,))
+ self.encoding = self.defaultencoding
+
if title is None:
title = ""
if author is None:
@@ -185,6 +283,10 @@ class KarTextExtractor:
if language is None:
language = ""
+ title = self.reencode(title)
+ author = self.reencode(author)
+ lyrics = self.reencode(lyrics)
+
self.em.setmimetype("text/html")
docdata = htmltemplate % (title, author, language, lyrics)