diff --git a/src/filters/rclkar b/src/filters/rclkar index bcdf82bc..83c0207c 100755 --- a/src/filters/rclkar +++ b/src/filters/rclkar @@ -9,7 +9,10 @@ import string import re import codecs -import eulangclass +try: + import rcllatinclass +except: + pass try: import midi @@ -58,6 +61,7 @@ class KarTextExtractor: self.encoding = "" self.defaultencoding = "" self.hadnulls = False + self.classifier = None # Compute the fallback encoding to use if we can't determine # one when processing the file. Based on the nls environment @@ -134,27 +138,30 @@ class KarTextExtractor: encodconf = chardet.detect(text) encoding = encodconf['encoding'] confidence = encodconf['confidence'] - self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence)) + #self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence)) + # chardet is awfully bad at detecting 8bit european # encodings/languages and will mostly return iso-8859-2 for # everything, which is a bad default (iso-8859-1/cp1252 being # much more common). 
We use our own ad-hoc stopwords based # module to try and improve if encoding.lower() == 'iso-8859-2': + if self.classifier is None: + try: + import __main__ + dir = os.path.dirname(__main__.__file__) + langszip = os.path.join(dir, 'rcllatinstops.zip') + f = open(langszip) + f.close() + classifier = rcllatinclass.European8859TextClassifier(langszip) + except: + self.em.rclog("Can't build euroclassifier (missing stopwords zip?") + return (encoding, confidence) + try: - import __main__ - dir = os.path.dirname(__main__.__file__) - langszip = os.path.join(dir, 'iso8859stops.zip') - f = open(langszip) - f.close() - except: - self.em.rclog("Can't the find the language stopwords zipfile") - return (encoding, confidence) - try: - classifier = eulangclass.European8859TextClassifier(langszip) lang,code,count = classifier.classify(text) - self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \ - (lang, code, count)) + #self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \ + # (lang, code, count)) if count > 0: confidence = 1.0 encoding = code @@ -266,8 +273,8 @@ class KarTextExtractor: author = authorN if self.encoding == "": (encoding, confidence) = self.chardet_detect(lyrics) - self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \ - (encoding, confidence)) + #self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \ + # (encoding, confidence)) if confidence > 0.6: self.encoding = encoding diff --git a/src/filters/eulangclass.py b/src/filters/rcllatinclass.py similarity index 73% rename from src/filters/eulangclass.py rename to src/filters/rcllatinclass.py index 242f02d1..3952e0de 100755 --- a/src/filters/eulangclass.py +++ b/src/filters/rcllatinclass.py @@ -12,11 +12,11 @@ class European8859TextClassifier: self.langtables = self.readlanguages(langzip) # Table to translate from punctuation to spaces - punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r" + self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r" spaces = "" - for c in punct: + for c in self.punct: spaces 
+= " " - self.spacetable = string.maketrans(punct, spaces) + self.spacetable = string.maketrans(self.punct, spaces) # Read the languages stopwords lists def readlanguages(self, langzip): @@ -33,10 +33,17 @@ class European8859TextClassifier: return langs def classify(self, rawtext): - + # Note: we can't use an re-based method to split the data because it + # should be considered binary, not text. + # Limit to reasonable size. + if len(rawtext) > 10000: + i = rawtext.find(" ", 9000) + if i == -1: + i = 9000 + rawtext = rawtext[0:i] # Remove punctuation rawtext = rawtext.translate(self.spacetable) - # Split words + # Split words. words = rawtext.split() # Count frequencies dict = {} @@ -45,8 +52,8 @@ class European8859TextClassifier: # Order word list by frequency lfreq = sorted(dict.iteritems(), \ key=lambda entry: entry[1], reverse=True) - # Check the ntest most frequent words against the language lists and - # chose the best match + # Check the text's ntest most frequent words against the + # language lists and choose the best match ntest = 10 maxcount = 0 maxlang = "" @@ -56,7 +63,7 @@ class European8859TextClassifier: for w,c in lfreq[0:ntest]: if w in lwords: count += 1 - print "Lang %s code %s count %d" % (lang, code, count) + #print "Lang %s code %s count %d" % (lang, code, count) if maxcount < count: maxlang = lang maxcount = count @@ -73,7 +80,7 @@ if __name__ == "__main__": f.close() dir = os.path.dirname(__file__) - langszip = os.path.join(dir, 'iso8859stops.zip') + langszip = os.path.join(dir, 'rcllatinstops.zip') classifier = European8859TextClassifier(langszip) diff --git a/src/filters/iso8859stops.zip b/src/filters/rcllatinstops.zip similarity index 100% rename from src/filters/iso8859stops.zip rename to src/filters/rcllatinstops.zip