augment the number of test words 10->20, + comments

This commit is contained in:
Jean-Francois Dockes 2012-01-03 21:17:11 +01:00
parent 7f57df250a
commit 9aeda04ccb

View File

@ -1,4 +1,17 @@
#!/usr/bin/env python
"""Try to guess a text's language and character set by checking how it matches lists of
common words. This is not a primary method of detection because it's slow and unreliable, but it
may be a help in discriminating, for example, between European languages using relatively close
variations of iso-8859.
This is used in association with a zip file containing a number of stop word lists: rcllatinstops.zip
As a note, I am looking for a good iso-8859-7 stop words list for Greek, the only ones I found
were utf-8 and there are errors when transcoding to iso-8859-7. I guess that there is something
about Greek accents that I don't know and would enable fixing this (some kind of simplification
allowing transliteration from utf-8 to iso-8859-7). An example of the difficulty is the small letter
epsilon with dasia (in Unicode but not in iso-8859-7). Can this be replaced by either epsilon or
epsilon with acute accent?
"""
import sys
import string
@ -9,6 +22,9 @@ from zipfile import ZipFile
class European8859TextClassifier:
def __init__(self, langzip):
"""langzip contains text files. Each text file is named like lang_code.txt
(e.g. french_cp1252.txt) and contains an encoded stop word list for the language"""
self.langtables = self.readlanguages(langzip)
# Table to translate from punctuation to spaces
@ -18,8 +34,8 @@ class European8859TextClassifier:
spaces += " "
self.spacetable = string.maketrans(self.punct, spaces)
# Read the languages stopwords lists
def readlanguages(self, langzip):
"""Extract the stop words lists from the zip file"""
zip = ZipFile(langzip)
langfiles = zip.namelist()
langs = []
@ -54,13 +70,14 @@ class European8859TextClassifier:
key=lambda entry: entry[1], reverse=True)
# Check the text's ntest most frequent words against the
# language lists and choose the best match
ntest = 10
ntest = 20
maxcount = 0
maxlang = ""
maxcode = ""
for lang,code,lwords in self.langtables:
count = 0
for w,c in lfreq[0:ntest]:
#print "testing", w
if w in lwords:
count += 1
#print "Lang %s code %s count %d" % (lang, code, count)
@ -85,4 +102,8 @@ if __name__ == "__main__":
classifier = European8859TextClassifier(langszip)
lang,code,count = classifier.classify(rawtext)
print "Chosen lang/code/matchcount: %s %s %d" % (lang, code, count)
if count > 0:
print "%s %s %d" % (code, lang, count)
else:
print "UNKNOWN UNKNOWN 0"