augment the number of test words 10->20, + comments
This commit is contained in:
parent
7f57df250a
commit
9aeda04ccb
@ -1,4 +1,17 @@
|
||||
#!/usr/bin/env python
|
||||
"""Try to guess a text's language and character set by checking how it matches lists of
|
||||
common words. This is not a primary method of detection because it's slow and unreliable, but it
|
||||
may be a help in discriminating, for example, between European languages using relatively close
|
||||
variations of iso-8859.
|
||||
This is used in association with a zip file containing a number of stop word lists: rcllatinstops.zip
|
||||
|
||||
As a note, I am looking for a good iso-8859-7 stop word list for Greek; the only ones I found
|
||||
were utf-8 and there are errors when transcoding to iso-8859-7. I guess that there is something
|
||||
about Greek accents that I don't know and would enable fixing this (some kind of simplification
|
||||
allowing transliteration from utf-8 to iso-8859-7). An example of the difficulty is the small letter
|
||||
epsilon with dasia (in unicode but not iso). Can this be replaced by either epsilon or epsilon
|
||||
with acute accent?
|
||||
"""
|
||||
|
||||
import sys
|
||||
import string
|
||||
@ -9,6 +22,9 @@ from zipfile import ZipFile
|
||||
|
||||
class European8859TextClassifier:
|
||||
def __init__(self, langzip):
|
||||
"""langzip contains text files. Each text file is named like lang_code.txt
|
||||
(e.g. french_cp1252.txt) and contains an encoded stop word list for the language"""
|
||||
|
||||
self.langtables = self.readlanguages(langzip)
|
||||
|
||||
# Table to translate from punctuation to spaces
|
||||
@ -18,8 +34,8 @@ class European8859TextClassifier:
|
||||
spaces += " "
|
||||
self.spacetable = string.maketrans(self.punct, spaces)
|
||||
|
||||
# Read the languages stopwords lists
|
||||
def readlanguages(self, langzip):
|
||||
"""Extract the stop words lists from the zip file"""
|
||||
zip = ZipFile(langzip)
|
||||
langfiles = zip.namelist()
|
||||
langs = []
|
||||
@ -54,13 +70,14 @@ class European8859TextClassifier:
|
||||
key=lambda entry: entry[1], reverse=True)
|
||||
# Check the text's ntest most frequent words against the
|
||||
# language lists and choose the best match
|
||||
ntest = 10
|
||||
ntest = 20
|
||||
maxcount = 0
|
||||
maxlang = ""
|
||||
maxcode = ""
|
||||
for lang,code,lwords in self.langtables:
|
||||
count = 0
|
||||
for w,c in lfreq[0:ntest]:
|
||||
#print "testing", w
|
||||
if w in lwords:
|
||||
count += 1
|
||||
#print "Lang %s code %s count %d" % (lang, code, count)
|
||||
@ -85,4 +102,8 @@ if __name__ == "__main__":
|
||||
classifier = European8859TextClassifier(langszip)
|
||||
|
||||
lang,code,count = classifier.classify(rawtext)
|
||||
print "Chosen lang/code/matchcount: %s %s %d" % (lang, code, count)
|
||||
if count > 0:
|
||||
print "%s %s %d" % (code, lang, count)
|
||||
else:
|
||||
print "UNKNOWN UNKNOWN 0"
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user