rclkar: renamed files for compat with install script
parent 879225d687
commit a4241cff6a
@@ -9,7 +9,10 @@ import string
 import re
 import codecs
 
-import eulangclass
+try:
+    import rcllatinclass
+except:
+    pass
 
 try:
     import midi
@@ -58,6 +61,7 @@ class KarTextExtractor:
         self.encoding = ""
         self.defaultencoding = ""
         self.hadnulls = False
+        self.classifier = None
 
         # Compute the fallback encoding to use if we can't determine
         # one when processing the file. Based on the nls environment
@@ -134,27 +138,30 @@ class KarTextExtractor:
         encodconf = chardet.detect(text)
         encoding = encodconf['encoding']
         confidence = encodconf['confidence']
-        self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence))
+        #self.em.rclog("Chardet returns %s %.2f" % (encoding,confidence))
 
         # chardet is awfully bad at detecting 8bit european
         # encodings/languages and will mostly return iso-8859-2 for
         # everything, which is a bad default (iso-8859-1/cp1252 being
         # much more common). We use our own ad-hoc stopwords based
        # module to try and improve
         if encoding.lower() == 'iso-8859-2':
+            if self.classifier is None:
+                try:
+                    import __main__
+                    dir = os.path.dirname(__main__.__file__)
+                    langszip = os.path.join(dir, 'rcllatinstops.zip')
+                    f = open(langszip)
+                    f.close()
+                    classifier = rcllatinclass.European8859TextClassifier(langszip)
+                except:
+                    self.em.rclog("Can't build euroclassifier (missing stopwords zip?")
+                    return (encoding, confidence)
+
-            try:
-                import __main__
-                dir = os.path.dirname(__main__.__file__)
-                langszip = os.path.join(dir, 'iso8859stops.zip')
-                f = open(langszip)
-                f.close()
-            except:
-                self.em.rclog("Can't the find the language stopwords zipfile")
-                return (encoding, confidence)
             try:
-                classifier = eulangclass.European8859TextClassifier(langszip)
                 lang,code,count = classifier.classify(text)
-                self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \
-                              (lang, code, count))
+                #self.em.rclog("euclass lang/code/matchcount: %s %s %d" % \
+                #              (lang, code, count))
                 if count > 0:
                     confidence = 1.0
                     encoding = code
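Not part of the commit, just a reading aid: the hunk above amounts to "run chardet, and when it answers iso-8859-2, second-guess it with the stopword classifier". A minimal standalone sketch of that flow, assuming the renamed rcllatinclass module and rcllatinstops.zip sit next to the calling script:

    import os
    import chardet
    import rcllatinclass

    def detect_encoding(text, scriptdir):
        # First pass: chardet's statistical guess.
        res = chardet.detect(text)
        encoding, confidence = res['encoding'], res['confidence']
        # chardet over-reports iso-8859-2, so re-check with the stopword lists.
        if encoding and encoding.lower() == 'iso-8859-2':
            langszip = os.path.join(scriptdir, 'rcllatinstops.zip')
            classifier = rcllatinclass.European8859TextClassifier(langszip)
            lang, code, count = classifier.classify(text)
            if count > 0:
                # Any stopword hit is trusted over chardet's guess.
                encoding, confidence = code, 1.0
        return encoding, confidence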
@@ -266,8 +273,8 @@ class KarTextExtractor:
                 author = authorN
         if self.encoding == "":
             (encoding, confidence) = self.chardet_detect(lyrics)
-            self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \
-                          (encoding, confidence))
+            #self.em.rclog("No nulls: chardet: enc [%s], conf %.2f" % \
+            #              (encoding, confidence))
             if confidence > 0.6:
                 self.encoding = encoding
 
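Outside the diff: what the 0.6 threshold means for a caller. A sketch in the module's Python 2 style; the default-encoding fallback and the permissive last-resort decode are assumptions, not code from this extractor:

    def to_unicode(lyrics, encoding, confidence, defaultencoding):
        # Only trust the detected encoding above the same 0.6 cutoff.
        if confidence <= 0.6 or not encoding:
            encoding = defaultencoding
        try:
            return lyrics.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Assumed last resort: decode permissively rather than fail.
            return lyrics.decode('iso-8859-1', 'replace')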
@@ -12,11 +12,11 @@ class European8859TextClassifier:
         self.langtables = self.readlanguages(langzip)
 
         # Table to translate from punctuation to spaces
-        punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
+        self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
         spaces = ""
-        for c in punct:
+        for c in self.punct:
             spaces += " "
-        self.spacetable = string.maketrans(punct, spaces)
+        self.spacetable = string.maketrans(self.punct, spaces)
 
     # Read the languages stopwords lists
     def readlanguages(self, langzip):
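Not in the commit: the table built above maps every punctuation character (and CR/LF) to a space, which is why punct and its replacement string must have the same length. A short Python 2 illustration of the same idea:

    import string

    punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r"
    spacetable = string.maketrans(punct, " " * len(punct))
    print "don't-stop,baby!".translate(spacetable)   # -> "don't stop baby "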
@@ -33,10 +33,17 @@ class European8859TextClassifier:
         return langs
 
     def classify(self, rawtext):
 
+        # Note: we can't use an re-based method to split the data because it
+        # should be considered binary, not text.
+        # Limit to reasonable size.
+        if len(rawtext) > 10000:
+            i = rawtext.find(" ", 9000)
+            if i == -1:
+                i = 9000
+            rawtext = rawtext[0:i]
         # Remove punctuation
         rawtext = rawtext.translate(self.spacetable)
-        # Split words
+        # Split words.
         words = rawtext.split()
         # Count frequencies
         dict = {}
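As a reading aid (not part of the diff): the added size cap plus the existing preprocessing boil down to the sketch below, where spacetable is the punctuation table built in __init__ and the counting loop is assumed from the "Count frequencies" comment:

    def wordfreq(rawtext, spacetable, maxlen=10000):
        # Cap very large inputs, preferring to cut at a space past offset 9000.
        if len(rawtext) > maxlen:
            i = rawtext.find(" ", 9000)
            if i == -1:
                i = 9000
            rawtext = rawtext[0:i]
        # Punctuation and line breaks become spaces; split and count words.
        freq = {}
        for w in rawtext.translate(spacetable).split():
            freq[w] = freq.get(w, 0) + 1
        return freq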
@@ -45,8 +52,8 @@ class European8859TextClassifier:
         # Order word list by frequency
         lfreq = sorted(dict.iteritems(), \
                        key=lambda entry: entry[1], reverse=True)
-        # Check the ntest most frequent words against the language lists and
-        # chose the best match
+        # Check the text's ntest most frequent words against the
+        # language lists and chose the best match
         ntest = 10
         maxcount = 0
         maxlang = ""
@@ -56,7 +63,7 @@ class European8859TextClassifier:
             for w,c in lfreq[0:ntest]:
                 if w in lwords:
                     count += 1
-            print "Lang %s code %s count %d" % (lang, code, count)
+            #print "Lang %s code %s count %d" % (lang, code, count)
             if maxcount < count:
                 maxlang = lang
                 maxcount = count
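Also outside the commit: the two hunks above implement "score each language by how many of the text's ten most frequent words appear in its stopword list, and keep the best scorer". A compact sketch, assuming langtables maps a language name to a (code, stopword set) pair as the loop variables suggest:

    def best_language(freq, langtables, ntest=10):
        # Most frequent words first (Python 2 dict API, as in the module).
        lfreq = sorted(freq.iteritems(), key=lambda entry: entry[1], reverse=True)
        maxlang, maxcode, maxcount = "", "", 0
        for lang, (code, lwords) in langtables.iteritems():
            count = 0
            for w, c in lfreq[0:ntest]:
                if w in lwords:
                    count += 1
            if maxcount < count:
                maxlang, maxcode, maxcount = lang, code, count
        return maxlang, maxcode, maxcount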
@@ -73,7 +80,7 @@ if __name__ == "__main__":
     f.close()
 
     dir = os.path.dirname(__file__)
-    langszip = os.path.join(dir, 'iso8859stops.zip')
+    langszip = os.path.join(dir, 'rcllatinstops.zip')
 
     classifier = European8859TextClassifier(langszip)
 