augment the number of test words 10->20, + comments
This commit is contained in:
parent
7f57df250a
commit
9aeda04ccb
@ -1,4 +1,17 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
"""Try to guess a text's language and character set by checking how it matches lists of
|
||||||
|
common words. This is not a primary method of detection because it's slow and unreliable, but it
|
||||||
|
may be a help in discriminating, for example, between European languages using relatively close
|
||||||
|
variations of iso-8859.
|
||||||
|
This is used in association with a zip file containing a number of stopword lists: rcllatinstops.zip
|
||||||
|
|
||||||
|
As a note, I am looking for a good iso-8859-7 stop words list for Greek; the only ones I found
|
||||||
|
were utf-8 and there are errors when transcoding to iso-8859-7. I guess that there is something
|
||||||
|
about Greek accents that I don't know and would enable fixing this (some kind of simplification
|
||||||
|
allowing transliteration from utf-8 to iso-8859-7). An example of the difficulty is the small letter
|
||||||
|
epsilon with dasia (in unicode but not iso). Can this be replaced by either epsilon or epsilon
|
||||||
|
with acute accent?
|
||||||
|
"""
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import string
|
import string
|
||||||
@ -9,6 +22,9 @@ from zipfile import ZipFile
|
|||||||
|
|
||||||
class European8859TextClassifier:
|
class European8859TextClassifier:
|
||||||
def __init__(self, langzip):
|
def __init__(self, langzip):
|
||||||
|
"""langzip contains text files. Each text file is named like lang_code.txt
|
||||||
|
(e.g. french_cp1252.txt) and contains an encoded stop word list for the language"""
|
||||||
|
|
||||||
self.langtables = self.readlanguages(langzip)
|
self.langtables = self.readlanguages(langzip)
|
||||||
|
|
||||||
# Table to translate from punctuation to spaces
|
# Table to translate from punctuation to spaces
|
||||||
@ -18,8 +34,8 @@ class European8859TextClassifier:
|
|||||||
spaces += " "
|
spaces += " "
|
||||||
self.spacetable = string.maketrans(self.punct, spaces)
|
self.spacetable = string.maketrans(self.punct, spaces)
|
||||||
|
|
||||||
# Read the languages stopwords lists
|
|
||||||
def readlanguages(self, langzip):
|
def readlanguages(self, langzip):
|
||||||
|
"""Extract the stop words lists from the zip file"""
|
||||||
zip = ZipFile(langzip)
|
zip = ZipFile(langzip)
|
||||||
langfiles = zip.namelist()
|
langfiles = zip.namelist()
|
||||||
langs = []
|
langs = []
|
||||||
@ -54,13 +70,14 @@ class European8859TextClassifier:
|
|||||||
key=lambda entry: entry[1], reverse=True)
|
key=lambda entry: entry[1], reverse=True)
|
||||||
# Check the text's ntest most frequent words against the
|
# Check the text's ntest most frequent words against the
|
||||||
# language lists and chose the best match
|
# language lists and chose the best match
|
||||||
ntest = 10
|
ntest = 20
|
||||||
maxcount = 0
|
maxcount = 0
|
||||||
maxlang = ""
|
maxlang = ""
|
||||||
maxcode = ""
|
maxcode = ""
|
||||||
for lang,code,lwords in self.langtables:
|
for lang,code,lwords in self.langtables:
|
||||||
count = 0
|
count = 0
|
||||||
for w,c in lfreq[0:ntest]:
|
for w,c in lfreq[0:ntest]:
|
||||||
|
#print "testing", w
|
||||||
if w in lwords:
|
if w in lwords:
|
||||||
count += 1
|
count += 1
|
||||||
#print "Lang %s code %s count %d" % (lang, code, count)
|
#print "Lang %s code %s count %d" % (lang, code, count)
|
||||||
@ -85,4 +102,8 @@ if __name__ == "__main__":
|
|||||||
classifier = European8859TextClassifier(langszip)
|
classifier = European8859TextClassifier(langszip)
|
||||||
|
|
||||||
lang,code,count = classifier.classify(rawtext)
|
lang,code,count = classifier.classify(rawtext)
|
||||||
print "Chosen lang/code/matchcount: %s %s %d" % (lang, code, count)
|
if count > 0:
|
||||||
|
print "%s %s %d" % (code, lang, count)
|
||||||
|
else:
|
||||||
|
print "UNKNOWN UNKNOWN 0"
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user