rclkar: renamed files for compat with install script
This commit is contained in:
parent
879225d687
commit
a4241cff6a
@ -9,7 +9,10 @@ import string
|
|||||||
import re
|
import re
|
||||||
import codecs
|
import codecs
|
||||||
|
|
||||||
import eulangclass
|
try:
|
||||||
|
import rcllatinclass
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import midi
|
import midi
|
||||||
@ -58,6 +61,7 @@ class KarTextExtractor:
|
|||||||
self.encoding = ""
|
self.encoding = ""
|
||||||
self.defaultencoding = ""
|
self.defaultencoding = ""
|
||||||
self.hadnulls = False
|
self.hadnulls = False
|
||||||
|
self.classifier = None
|
||||||
|
|
||||||
# Compute the fallback encoding to use if we can't determine
|
# Compute the fallback encoding to use if we can't determine
|
||||||
# one when processing the file. Based on the nls environment
|
# one when processing the file. Based on the nls environment
|
||||||
@ -134,27 +138,30 @@ class KarTextExtractor:
|
|||||||
encodconf = chardet.detect(text)
|
encodconf = chardet.detect(text)
|
||||||
encoding = encodconf['encoding']
|
encoding = encodconf['encoding']
|
||||||
confidence = encodconf['confidence']
|
confidence = encodconf['confidence']
|
||||||
self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence))
|
#self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence))
|
||||||
|
|
||||||
# chardet is awfully bad at detecting 8bit european
|
# chardet is awfully bad at detecting 8bit european
|
||||||
# encodings/languages and will mostly return iso-8859-2 for
|
# encodings/languages and will mostly return iso-8859-2 for
|
||||||
# everything, which is a bad default (iso-8859-1/cp1252 being
|
# everything, which is a bad default (iso-8859-1/cp1252 being
|
||||||
# much more common). We use our own ad-hoc stopwords based
|
# much more common). We use our own ad-hoc stopwords based
|
||||||
# module to try and improve
|
# module to try to improve on chardet's guess
|
||||||
if encoding.lower() == 'iso-8859-2':
|
if encoding.lower() == 'iso-8859-2':
|
||||||
|
if self.classifier is None:
|
||||||
|
try:
|
||||||
|
import __main__
|
||||||
|
dir = os.path.dirname(__main__.__file__)
|
||||||
|
langszip = os.path.join(dir, 'rcllatinstops.zip')
|
||||||
|
f = open(langszip)
|
||||||
|
f.close()
|
||||||
|
classifier = rcllatinclass.European8859TextClassifier(langszip)
|
||||||
|
except:
|
||||||
|
self.em.rclog("Can't build euroclassifier (missing stopwords zip?")
|
||||||
|
return (encoding, confidence)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import __main__
|
|
||||||
dir = os.path.dirname(__main__.__file__)
|
|
||||||
langszip = os.path.join(dir, 'iso8859stops.zip')
|
|
||||||
f = open(langszip)
|
|
||||||
f.close()
|
|
||||||
except:
|
|
||||||
self.em.rclog("Can't the find the language stopwords zipfile")
|
|
||||||
return (encoding, confidence)
|
|
||||||
try:
|
|
||||||
classifier = eulangclass.European8859TextClassifier(langszip)
|
|
||||||
lang,code,count = classifier.classify(text)
|
lang,code,count = classifier.classify(text)
|
||||||
self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \
|
#self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \
|
||||||
(lang, code, count))
|
# (lang, code, count))
|
||||||
if count > 0:
|
if count > 0:
|
||||||
confidence = 1.0
|
confidence = 1.0
|
||||||
encoding = code
|
encoding = code
|
||||||
@ -266,8 +273,8 @@ class KarTextExtractor:
|
|||||||
author = authorN
|
author = authorN
|
||||||
if self.encoding == "":
|
if self.encoding == "":
|
||||||
(encoding, confidence) = self.chardet_detect(lyrics)
|
(encoding, confidence) = self.chardet_detect(lyrics)
|
||||||
self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \
|
#self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \
|
||||||
(encoding, confidence))
|
# (encoding, confidence))
|
||||||
if confidence > 0.6:
|
if confidence > 0.6:
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
|
||||||
|
|||||||
@ -12,11 +12,11 @@ class European8859TextClassifier:
|
|||||||
self.langtables = self.readlanguages(langzip)
|
self.langtables = self.readlanguages(langzip)
|
||||||
|
|
||||||
# Table to translate from punctuation to spaces
|
# Table to translate from punctuation to spaces
|
||||||
punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
|
self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
|
||||||
spaces = ""
|
spaces = ""
|
||||||
for c in punct:
|
for c in self.punct:
|
||||||
spaces += " "
|
spaces += " "
|
||||||
self.spacetable = string.maketrans(punct, spaces)
|
self.spacetable = string.maketrans(self.punct, spaces)
|
||||||
|
|
||||||
# Read the languages stopwords lists
|
# Read the languages stopwords lists
|
||||||
def readlanguages(self, langzip):
|
def readlanguages(self, langzip):
|
||||||
@ -33,10 +33,17 @@ class European8859TextClassifier:
|
|||||||
return langs
|
return langs
|
||||||
|
|
||||||
def classify(self, rawtext):
|
def classify(self, rawtext):
|
||||||
|
# Note: we can't use an re-based method to split the data because it
|
||||||
|
# should be considered binary, not text.
|
||||||
|
# Limit to reasonable size.
|
||||||
|
if len(rawtext) > 10000:
|
||||||
|
i = rawtext.find(" ", 9000)
|
||||||
|
if i == -1:
|
||||||
|
i = 9000
|
||||||
|
rawtext = rawtext[0:i]
|
||||||
# Remove punctuation
|
# Remove punctuation
|
||||||
rawtext = rawtext.translate(self.spacetable)
|
rawtext = rawtext.translate(self.spacetable)
|
||||||
# Split words
|
# Split words.
|
||||||
words = rawtext.split()
|
words = rawtext.split()
|
||||||
# Count frequencies
|
# Count frequencies
|
||||||
dict = {}
|
dict = {}
|
||||||
@ -45,8 +52,8 @@ class European8859TextClassifier:
|
|||||||
# Order word list by frequency
|
# Order word list by frequency
|
||||||
lfreq = sorted(dict.iteritems(), \
|
lfreq = sorted(dict.iteritems(), \
|
||||||
key=lambda entry: entry[1], reverse=True)
|
key=lambda entry: entry[1], reverse=True)
|
||||||
# Check the ntest most frequent words against the language lists and
|
# Check the text's ntest most frequent words against the
|
||||||
# chose the best match
|
# language lists and choose the best match
|
||||||
ntest = 10
|
ntest = 10
|
||||||
maxcount = 0
|
maxcount = 0
|
||||||
maxlang = ""
|
maxlang = ""
|
||||||
@ -56,7 +63,7 @@ class European8859TextClassifier:
|
|||||||
for w,c in lfreq[0:ntest]:
|
for w,c in lfreq[0:ntest]:
|
||||||
if w in lwords:
|
if w in lwords:
|
||||||
count += 1
|
count += 1
|
||||||
print "Lang %s code %s count %d" % (lang, code, count)
|
#print "Lang %s code %s count %d" % (lang, code, count)
|
||||||
if maxcount < count:
|
if maxcount < count:
|
||||||
maxlang = lang
|
maxlang = lang
|
||||||
maxcount = count
|
maxcount = count
|
||||||
@ -73,7 +80,7 @@ if __name__ == "__main__":
|
|||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
dir = os.path.dirname(__file__)
|
dir = os.path.dirname(__file__)
|
||||||
langszip = os.path.join(dir, 'iso8859stops.zip')
|
langszip = os.path.join(dir, 'rcllatinstops.zip')
|
||||||
|
|
||||||
classifier = European8859TextClassifier(langszip)
|
classifier = European8859TextClassifier(langszip)
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user