rclkar: renamed files for compat with install script

This commit is contained in:
Jean-Francois Dockes 2011-01-31 20:23:56 +01:00
parent 879225d687
commit a4241cff6a
3 changed files with 39 additions and 25 deletions

View File

@ -9,7 +9,10 @@ import string
import re
import codecs
import eulangclass
try:
import rcllatinclass
except:
pass
try:
import midi
@ -58,6 +61,7 @@ class KarTextExtractor:
self.encoding = ""
self.defaultencoding = ""
self.hadnulls = False
self.classifier = None
# Compute the fallback encoding to use if we can't determine
# one when processing the file. Based on the nls environment
@ -134,27 +138,30 @@ class KarTextExtractor:
encodconf = chardet.detect(text)
encoding = encodconf['encoding']
confidence = encodconf['confidence']
self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence))
#self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence))
# chardet is awfully bad at detecting 8bit european
# encodings/languages and will mostly return iso-8859-2 for
# everything, which is a bad default (iso-8859-1/cp1252 being
# much more common). We use our own ad-hoc stopwords-based
# module to try to improve on chardet's guess.
if encoding.lower() == 'iso-8859-2':
if self.classifier is None:
try:
import __main__
dir = os.path.dirname(__main__.__file__)
langszip = os.path.join(dir, 'rcllatinstops.zip')
f = open(langszip)
f.close()
classifier = rcllatinclass.European8859TextClassifier(langszip)
except:
self.em.rclog("Can't build euroclassifier (missing stopwords zip?")
return (encoding, confidence)
try:
import __main__
dir = os.path.dirname(__main__.__file__)
langszip = os.path.join(dir, 'iso8859stops.zip')
f = open(langszip)
f.close()
except:
self.em.rclog("Can't the find the language stopwords zipfile")
return (encoding, confidence)
try:
classifier = eulangclass.European8859TextClassifier(langszip)
lang,code,count = classifier.classify(text)
self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \
(lang, code, count))
#self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \
# (lang, code, count))
if count > 0:
confidence = 1.0
encoding = code
@ -266,8 +273,8 @@ class KarTextExtractor:
author = authorN
if self.encoding == "":
(encoding, confidence) = self.chardet_detect(lyrics)
self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \
(encoding, confidence))
#self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \
# (encoding, confidence))
if confidence > 0.6:
self.encoding = encoding

View File

@ -12,11 +12,11 @@ class European8859TextClassifier:
self.langtables = self.readlanguages(langzip)
# Table to translate from punctuation to spaces
punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
spaces = ""
for c in punct:
for c in self.punct:
spaces += " "
self.spacetable = string.maketrans(punct, spaces)
self.spacetable = string.maketrans(self.punct, spaces)
# Read the languages stopwords lists
def readlanguages(self, langzip):
@ -33,10 +33,17 @@ class European8859TextClassifier:
return langs
def classify(self, rawtext):
# Note: we can't use an re-based method to split the data because it
# should be considered binary, not text.
# Limit to reasonable size.
if len(rawtext) > 10000:
i = rawtext.find(" ", 9000)
if i == -1:
i = 9000
rawtext = rawtext[0:i]
# Remove punctuation
rawtext = rawtext.translate(self.spacetable)
# Split words
# Split words.
words = rawtext.split()
# Count frequencies
dict = {}
@ -45,8 +52,8 @@ class European8859TextClassifier:
# Order word list by frequency
lfreq = sorted(dict.iteritems(), \
key=lambda entry: entry[1], reverse=True)
# Check the ntest most frequent words against the language lists and
# chose the best match
# Check the text's ntest most frequent words against the
# language lists and choose the best match
ntest = 10
maxcount = 0
maxlang = ""
@ -56,7 +63,7 @@ class European8859TextClassifier:
for w,c in lfreq[0:ntest]:
if w in lwords:
count += 1
print "Lang %s code %s count %d" % (lang, code, count)
#print "Lang %s code %s count %d" % (lang, code, count)
if maxcount < count:
maxlang = lang
maxcount = count
@ -73,7 +80,7 @@ if __name__ == "__main__":
f.close()
dir = os.path.dirname(__file__)
langszip = os.path.join(dir, 'iso8859stops.zip')
langszip = os.path.join(dir, 'rcllatinstops.zip')
classifier = European8859TextClassifier(langszip)