diff --git a/src/filters/rcllatinclass.py b/src/filters/rcllatinclass.py index 3952e0de..5f31f704 100755 --- a/src/filters/rcllatinclass.py +++ b/src/filters/rcllatinclass.py @@ -1,4 +1,17 @@ #!/usr/bin/env python +"""Try to guess a text's language and character set by checking how it matches lists of +common words. This is not a primary method of detection because it's slow and unreliable, but it +may help in discriminating, for example, between European languages using relatively close +variations of iso-8859. +This is used in association with a zip file containing a number of stop word lists: rcllatinstops.zip + +As a note, I am looking for a good iso-8859-7 stop words list for Greek; the only ones I found +were utf-8 and there are errors when transcoding to iso-8859-7. I guess that there is something +about Greek accents that I don't know and that would enable fixing this (some kind of simplification +allowing transliteration from utf-8 to iso-8859-7). An example of the difficulty is the small letter +epsilon with dasia (in Unicode but not in iso-8859-7). Can this be replaced by either epsilon or epsilon +with acute accent? +""" import sys import string @@ -9,6 +22,9 @@ from zipfile import ZipFile class European8859TextClassifier: def __init__(self, langzip): + """langzip contains text files. 
Each text file is named like lang_code.txt + (e.g. french_cp1252.txt) and contains an encoded stop word list for the language""" + self.langtables = self.readlanguages(langzip) # Table to translate from punctuation to spaces @@ -18,8 +34,8 @@ class European8859TextClassifier: spaces += " " self.spacetable = string.maketrans(self.punct, spaces) - # Read the languages stopwords lists def readlanguages(self, langzip): + """Extract the stop word lists from the zip file""" zip = ZipFile(langzip) langfiles = zip.namelist() langs = [] @@ -54,13 +70,14 @@ class European8859TextClassifier: key=lambda entry: entry[1], reverse=True) # Check the text's ntest most frequent words against the # language lists and chose the best match - ntest = 10 + ntest = 20 maxcount = 0 maxlang = "" maxcode = "" for lang,code,lwords in self.langtables: count = 0 for w,c in lfreq[0:ntest]: + #print "testing", w if w in lwords: count += 1 #print "Lang %s code %s count %d" % (lang, code, count) @@ -85,4 +102,8 @@ if __name__ == "__main__": classifier = European8859TextClassifier(langszip) lang,code,count = classifier.classify(rawtext) - print "Chosen lang/code/matchcount: %s %s %d" % (lang, code, count) + if count > 0: + print "%s %s %d" % (code, lang, count) + else: + print "UNKNOWN UNKNOWN 0" +