Added language-based helper for classifying iso-8859-x encodings

This commit is contained in:
Jean-Francois Dockes 2011-01-31 09:32:26 +01:00
parent 91e740074e
commit 879225d687
4 changed files with 233 additions and 49 deletions

View File

@ -52,6 +52,7 @@ src/doc/user/usermanual.html
src/doc/user/usermanual.html-text src/doc/user/usermanual.html-text
src/doc/user/usermanual.txt src/doc/user/usermanual.txt
src/filters/rclexecm.pyc src/filters/rclexecm.pyc
src/filters/eulangclass.pyc
src/index/alldeps src/index/alldeps
src/index/recollindex src/index/recollindex
src/lib/alldeps src/lib/alldeps

81
src/filters/eulangclass.py Executable file
View File

@ -0,0 +1,81 @@
#!/usr/bin/env python
import sys
import string
import glob
import os
import os.path
from zipfile import ZipFile
class European8859TextClassifier:
    '''Guess the language and 8-bit encoding of a text from its stopwords.

    The classifier is loaded from a zip archive of stopword lists, one
    member per language, each named <language>_<encoding>.<ext> and
    holding whitespace-separated words. classify() compares the most
    frequent words of a text against every list and returns the
    best-matching language and encoding.

    All text is handled as byte strings (the input encoding is precisely
    what is unknown), which keeps the code usable on Python 2 and 3.
    '''

    def __init__(self, langzip):
        '''Load the stopword tables and build the punctuation table.

        langzip: path of the zip archive holding the stopword lists.
        Errors opening or reading the archive are propagated.
        '''
        self.langtables = self.readlanguages(langzip)

        # Table to translate from punctuation to spaces
        punct = b'''*?[].@+-,#_$%&={};.,:!"''' + b"\n\r"
        spaces = b" " * len(punct)
        # string.maketrans only exists on Python 2, bytes.maketrans only
        # on Python 3. Both build the same 256-byte translation table.
        maketrans = getattr(string, 'maketrans', None) or bytes.maketrans
        self.spacetable = maketrans(punct, spaces)

    # Read the languages stopwords lists
    def readlanguages(self, langzip):
        '''Return a list of (language, encoding, words) tuples.

        One tuple is produced per archive member whose base name
        (extension removed) has the form <language>_<encoding>. Other
        members (directory entries, readme files...) are skipped
        instead of aborting the whole load. words is the list of byte
        strings obtained by splitting the member data on whitespace.
        '''
        archive = ZipFile(langzip)
        langs = []
        # try/finally rather than 'with': ZipFile is only a context
        # manager from Python 2.7 on.
        try:
            for fn in archive.namelist():
                stem = os.path.splitext(os.path.basename(fn))[0]
                parts = stem.split('_')
                if len(parts) != 2 or not parts[0] or not parts[1]:
                    continue
                lang, code = parts
                words = archive.read(fn).split()
                langs.append((lang, code, words))
        finally:
            archive.close()
        return langs

    def classify(self, rawtext):
        '''Return (language, encoding, matchcount) for rawtext.

        rawtext: the undecoded text, as a byte string.

        matchcount is the number of words, among the most frequent ones
        of the text, found in the stopword list of the chosen language.
        When no language matches at all, ('english', 'cp1252', 0) is
        returned. Per-language counts are reported on stderr.
        '''
        # Remove punctuation, then split words
        words = rawtext.translate(self.spacetable).split()

        # Count frequencies
        freqs = {}
        for w in words:
            freqs[w] = freqs.get(w, 0) + 1

        # Order word list by frequency
        lfreq = sorted(freqs.items(),
                       key=lambda entry: entry[1], reverse=True)

        # Check the ntest most frequent words against the language lists
        # and choose the best match
        ntest = 10
        topwords = set([w for w, c in lfreq[0:ntest]])
        maxcount = 0
        maxlang = ""
        maxcode = ""
        for lang, code, lwords in self.langtables:
            # topwords holds distinct words, so the size of the
            # intersection is the number of them present in lwords.
            count = len(topwords.intersection(lwords))
            # Diagnostics go to stderr: this module is imported by
            # filters whose stdout carries their output.
            sys.stderr.write("Lang %s code %s count %d\n" %
                             (lang, code, count))
            if maxcount < count:
                maxlang = lang
                maxcount = count
                maxcode = code

        # If match too bad, default to most common
        if maxcount == 0:
            maxlang, maxcode = ('english', 'cp1252')
        return (maxlang, maxcode, maxcount)
if __name__ == "__main__":
    # Command line test driver: classify the file given as first argument
    # using the iso8859stops.zip archive found next to this script.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <textfile>\n" % sys.argv[0])
        sys.exit(1)

    # Binary mode: the data is 8-bit text in an unknown encoding and must
    # reach the classifier untouched.
    f = open(sys.argv[1], 'rb')
    try:
        rawtext = f.read()
    finally:
        f.close()

    scriptdir = os.path.dirname(__file__)
    langszip = os.path.join(scriptdir, 'iso8859stops.zip')

    classifier = European8859TextClassifier(langszip)

    lang, code, count = classifier.classify(rawtext)
    # Single parenthesized argument: same output on Python 2 and 3
    print("Chosen lang/code/matchcount: %s %s %d" % (lang, code, count))

Binary file not shown.

View File

@ -9,12 +9,20 @@ import string
import re import re
import codecs import codecs
import eulangclass
try: try:
import midi import midi
except: except:
print "RECFILTERROR HELPERNOTFOUND python:midi" print "RECFILTERROR HELPERNOTFOUND python:midi"
sys.exit(1); sys.exit(1);
try:
import chardet
has_chardet = True
except:
has_chardet = False
# Prototype for the html document we're returning # Prototype for the html document we're returning
htmltemplate = ''' htmltemplate = '''
<html> <html>
@ -49,53 +57,72 @@ class KarTextExtractor:
self.currentindex = 0 self.currentindex = 0
self.encoding = "" self.encoding = ""
self.defaultencoding = "" self.defaultencoding = ""
self.acceptnulls = False self.hadnulls = False
# Compute the fallback encoding to use if we can't determine
# one when processing the file. Based on the nls environment
try: try:
self.defaultencoding = sys.getfilesystemencoding() self.defaultencoding = sys.getfilesystemencoding()
except: except:
pass pass
if self.defaultencoding is None: if self.defaultencoding is None:
self.defaultencoding = sys.getdefaultencoding() self.defaultencoding = sys.getdefaultencoding()
if not self.defaultencoding or \ if not self.defaultencoding or \
self.defaultencoding.lower().find('ascii') != -1: self.defaultencoding.lower().find('ascii') != -1:
self.defaultencoding = 'latin_1' self.defaultencoding = 'cp1252'
try: try:
codecs.lookup(self.defaultencoding) codecs.lookup(self.defaultencoding)
except: except:
self.defaultencoding = 'latin_1' self.defaultencoding = 'cp1252'
def nulltrunc(self, data):
'''Truncate data after 1st null byte. For messages with garbage after
a null byte. Must not be done for utf-16/32 of course'''
# Try to decode input binary string then encode to utf-8 for output
def reencode(self, data):
text = ""
if not data: if not data:
return text return data
# Some files have garbage data after a null byte. firstnull = data.find(chr(0))
if not self.acceptnulls: if firstnull != -1:
firstnull = data.find(chr(0)) self.hadnulls = True
if firstnull != -1: data = data[0 : firstnull]
data = data[0 : firstnull] return data
try:
text = data.decode(self.encoding, 'ignore') def reencode(self, data):
except Exception, err: '''Decode from whatever encoding we think this file is using
self.em.rclog("Decode failed: " + str(err)) and reencode as UTF-8'''
return ""
try: # self.em.rclog("Reencoding from [%s] to UTF-8" % self.encoding)
text = text.encode('utf-8')
except Exception, err: if data:
self.em.rclog("Encode failed: " + str(err)) try:
return "" data = data.decode(self.encoding, 'ignore')
except Exception, err:
self.em.rclog("Decode failed: " + str(err))
return ""
try:
data = data.encode('utf-8')
except Exception, err:
self.em.rclog("Encode failed: " + str(err))
return ""
text = self.em.htmlescape(text).replace("\n", "<br>\n") data = self.em.htmlescape(data).replace("\n", "<br>\n")
return text
return data
# Some karaoke files have the encoding as part of the file name
# as 'some title (encoding).xxx' Not sure the whitespace before
# the '(' has to be there, so not relying on this
def encodingfromfilename(self, fn): def encodingfromfilename(self, fn):
'''Compute encoding from file name: some karaoke files have the
encoding as part of the file name as 'some title
(encoding).xxx'. This is not an established convention though,
just one our users could use if there is trouble with guessing
encodings'''
rexp = r'\(([^\)]+)\)\.[a-zA-Z]+$' rexp = r'\(([^\)]+)\)\.[a-zA-Z]+$'
m = re.search(rexp, fn) m = re.search(rexp, fn)
if m: if m:
@ -103,7 +130,43 @@ class KarTextExtractor:
else: else:
return "" return ""
def chardet_detect(self, text):
    '''Guess the encoding of text, return (encoding, confidence).

    The guess comes from chardet. When chardet answers iso-8859-2, the
    stopwords-based classifier from eulangclass is tried as a second
    opinion and, if it matches at least one word, its encoding is
    returned with a confidence of 1.0.

    Returns ("", 0.0) when chardet cannot propose any encoding, so that
    callers can always treat the encoding as a string.
    '''
    encodconf = chardet.detect(text)
    encoding = encodconf['encoding']
    confidence = encodconf['confidence'] or 0.0
    self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence))

    # chardet reports a None encoding when it can't guess (e.g. empty
    # input): bail out before calling string methods on it.
    if not encoding:
        return ("", 0.0)

    # chardet is awfully bad at detecting 8bit european
    # encodings/languages and will mostly return iso-8859-2 for
    # everything, which is a bad default (iso-8859-1/cp1252 being
    # much more common). We use our own ad-hoc stopwords based
    # module to try and improve
    if encoding.lower() != 'iso-8859-2':
        return (encoding, confidence)

    # The stopwords archive is looked up next to the main script. The
    # open/close is only there to check that it can be read.
    try:
        import __main__
        scriptdir = os.path.dirname(__main__.__file__)
        langszip = os.path.join(scriptdir, 'iso8859stops.zip')
        f = open(langszip)
        f.close()
    except Exception:
        self.em.rclog("Can't the find the language stopwords zipfile")
        return (encoding, confidence)

    try:
        classifier = eulangclass.European8859TextClassifier(langszip)
        lang,code,count = classifier.classify(text)
        self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \
                      (lang, code, count))
        if count > 0:
            confidence = 1.0
            encoding = code
    except Exception as err:
        self.em.rclog("stopwords-based classifier failed: %s" % err)

    return (encoding, confidence)
def extractone(self, params): def extractone(self, params):
'''Process one file'''
docdata = "" docdata = ""
ok = False ok = False
@ -112,15 +175,13 @@ class KarTextExtractor:
return (ok, docdata, "", rclexecm.RclExecM.eofnow) return (ok, docdata, "", rclexecm.RclExecM.eofnow)
filename = params["filename:"] filename = params["filename:"]
# Character encoding from file name ?
self.encoding = self.encodingfromfilename(filename) self.encoding = self.encodingfromfilename(filename)
try: if self.encoding:
codecs.lookup(self.encoding) try:
except: codecs.lookup(self.encoding)
self.em.rclog("Encoding [%s] not found, defaulting to [%s]" % \ except:
(self.encoding, self.defaultencoding)) self.encoding = ""
self.encoding = self.defaultencoding
self.acceptnulls = self.encoding.lower() in self.acceptnullencodings
# Mimetype not used for now # Mimetype not used for now
if not params.has_key("mimetype:"): if not params.has_key("mimetype:"):
@ -128,41 +189,47 @@ class KarTextExtractor:
else: else:
mimetype = params["mimetype:"] mimetype = params["mimetype:"]
# Read in and midi-decode the file
try: try:
stream = midi.read_midifile(filename) stream = midi.read_midifile(filename)
except Exception, err: except Exception, err:
self.em.rclog("extractone: extract failed: [%s]" % err) self.em.rclog("extractone: midi extract failed: [%s]" % err)
return (ok, docdata, "", rclexecm.RclExecM.eofnow) return (ok, docdata, "", rclexecm.RclExecM.eofnow)
title = None title = None
author = None author = None
language = None language = None
lyrics = "" lyrics = ""
lyricsN = ""
self.hadnulls = False
for event in stream.iterevents(): for event in stream.iterevents():
edata = "" edata = ""
if isinstance(event, midi.TextMetaEvent): if isinstance(event, midi.TextMetaEvent):
if not event.data: if not event.data:
continue continue
elif event.data[0] == '/' or event.data[0] == '\\': elif event.data[0] == '/' or event.data[0] == '\\':
edata += "\n" + event.data[1:] edata = "\n" + event.data[1:]
elif event.data[0] == '[' or event.data[0] == ']': elif event.data[0] == '[' or event.data[0] == ']':
edata += event.data[1:] edata = event.data[1:]
elif event.data[0] == '@': elif event.data[0] == '@':
if len(event.data) == 1: if len(event.data) == 1:
continue continue
else: else:
if event.data[1] == 'I': if event.data[1] == 'I':
edata += event.data[2:] + '\n' edata = event.data[2:] + '\n'
elif event.data[1] == 'L': elif event.data[1] == 'L':
language = self.reencode(event.data[2:]) language = self.nulltrunc(event.data[2:])
languageN = event.data[2:]
elif event.data[1] == 'T': elif event.data[1] == 'T':
if title is None: if title is None:
title = self.reencode(event.data[2:]) title = self.nulltrunc(event.data[2:])
titleN = event.data[2:]
elif author is None: elif author is None:
author = self.reencode(event.data[2:]) author = self.nulltrunc(event.data[2:])
authorN = event.data[2:]
else: else:
edata += event.data edata = event.data
elif isinstance(event, midi.LryricsEvent) or \ elif isinstance(event, midi.LryricsEvent) or \
isinstance(event, midi.TrackNameEvent): isinstance(event, midi.TrackNameEvent):
space = "" space = ""
@ -171,13 +238,44 @@ class KarTextExtractor:
if not event.data: if not event.data:
continue continue
elif event.data[0] == '/' or event.data[0] == '\\': elif event.data[0] == '/' or event.data[0] == '\\':
edata += "\n" + event.data[1:] + nl edata = "\n" + event.data[1:] + nl
else: else:
edata += event.data + nl edata = event.data + nl
lyrics += self.reencode(edata) lyrics += self.nulltrunc(edata)
lyricsN += edata
# Try to guess the encoding. First do it with the data
# possibly containing nulls. If we get one of the accepted
# nullbyte encodings, go with this, else repeat with the
# de-nulled data
# self.em.rclog("Lyrics length %d" % len(lyrics))
if self.encoding == "" and has_chardet:
if self.hadnulls:
(encoding, confidence) = self.chardet_detect(lyricsN)
# self.em.rclog("With nulls: chardet: enc [%s], conf %.2f" % \
# (encoding, confidence))
if confidence > 0.6 and \
encoding.lower() in self.acceptnullencodings:
self.encoding = encoding
lyrics = lyricsN
title = titleN
author = authorN
if self.encoding == "":
(encoding, confidence) = self.chardet_detect(lyrics)
self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \
(encoding, confidence))
if confidence > 0.6:
self.encoding = encoding
if self.encoding == "":
self.em.rclog("Encoding not guessed, defaulting to [%s]" % \
(self.defaultencoding,))
self.encoding = self.defaultencoding
if title is None: if title is None:
title = "" title = ""
if author is None: if author is None:
@ -185,6 +283,10 @@ class KarTextExtractor:
if language is None: if language is None:
language = "" language = ""
title = self.reencode(title)
author = self.reencode(author)
lyrics = self.reencode(lyrics)
self.em.setmimetype("text/html") self.em.setmimetype("text/html")
docdata = htmltemplate % (title, author, language, lyrics) docdata = htmltemplate % (title, author, language, lyrics)