augment the number of test words 10->20, + comments

2012-01-03 21:17:11 +01:00 · 2012-01-03 21:17:11 +01:00 · 9aeda04ccb
commit 9aeda04ccb
parent 7f57df250a
1 changed files with 24 additions and 3 deletions
--- a/src/filters/rcllatinclass.py
+++ b/src/filters/rcllatinclass.py
@ -1,4 +1,17 @@
 #!/usr/bin/env python
+"""Try to guess a text's language and character set by checking how it matches lists of
+common words. This is not a primary method of detection because it's slow and unreliable, but it
+may help in discriminating, for example, between European languages using relatively close
+variations of iso-8859.
+This is used in association with a zip file containing a number of stop word lists: rcllatinstops.zip
+
+As a note, I am looking for a good iso-8859-7 stop word list for Greek; the only ones I found
+were utf-8 and there are errors when transcoding to iso-8859-7. I guess that there is something
+about Greek accents that I don't know and that would enable fixing this (some kind of simplification
+allowing transliteration from utf-8 to iso-8859-7). An example of the difficulty is the small letter
+epsilon with dasia (in unicode but not iso). Can this be replaced by either epsilon or epsilon
+with acute accent?
+"""

 import sys
 import string
@ -9,6 +22,9 @@ from zipfile import ZipFile

 class European8859TextClassifier:
    def __init__(self, langzip):
+        """langzip contains text files. Each text file is named like lang_code.txt
+        (e.g. french_cp1252.txt) and contains an encoded stop word list for the language"""
+        
        self.langtables = self.readlanguages(langzip)

        # Table to translate from punctuation to spaces
@ -18,8 +34,8 @@ class European8859TextClassifier:
            spaces += " " 
        self.spacetable = string.maketrans(self.punct, spaces)

-    # Read the languages stopwords lists
    def readlanguages(self, langzip):
+        """Extract the stop word lists from the zip file"""
        zip = ZipFile(langzip)
        langfiles = zip.namelist()
        langs = []
@ -54,13 +70,14 @@ class European8859TextClassifier:
                       key=lambda entry: entry[1], reverse=True)
        # Check the text's ntest most frequent words against the
        # language lists and choose the best match
-        ntest = 10
+        ntest = 20
        maxcount = 0
        maxlang = ""
        maxcode = ""
        for lang,code,lwords in self.langtables:
            count = 0
            for w,c in lfreq[0:ntest]:
+                #print "testing", w
                if w in lwords:
                    count += 1
            #print "Lang %s code %s count %d" % (lang, code, count)
@ -85,4 +102,8 @@ if __name__ == "__main__":
    classifier = European8859TextClassifier(langszip)

    lang,code,count = classifier.classify(rawtext)
-    print "Chosen lang/code/matchcount: %s %s %d" % (lang, code, count)
+    if count > 0:
+        print "%s %s %d" % (code, lang, count)
+    else:
+        print "UNKNOWN UNKNOWN 0"
+