124 lines
4.4 KiB
Python
Executable File
124 lines
4.4 KiB
Python
Executable File
#!/usr/bin/env python
|
|
"""Try to guess a text's language and character set by checking how it matches lists of
|
|
common words. This is not a primary method of detection because it's slow and unreliable, but it
|
|
may be a help in discriminating, for example, between European languages using relatively close
|
|
variations of iso-8859.
|
|
This is used in association with a zip file containing a number of stop word lists: rcllatinstops.zip
|
|
|
|
As a note, I am looking for a good iso-8859-7 stop word list for Greek; the only ones I found
|
|
were utf-8 and there are errors when transcoding to iso-8859-7. I guess that there is something
|
|
about Greek accents that I don't know and would enable fixing this (some kind of simplification
|
|
allowing transliteration from utf-8 to iso-8859-7). An example of the difficulty is the small letter
|
|
epsilon with dasia (in unicode but not iso). Can this be replaced by either epsilon or epsilon
|
|
with acute accent ?
|
|
"""
|
|
|
|
import sys
|
|
import string
|
|
import glob
|
|
import os
|
|
import os.path
|
|
from zipfile import ZipFile
|
|
|
|
class European8859TextClassifier(object):
    """Guess a text's language and 8-bit character set from stop words.

    The classifier loads encoded stop word lists from a zip archive and
    counts how many of the most frequent words of a text appear in each
    list. All text is handled as byte strings, never decoded.

    Attributes set by the constructor:
      allwords   -- dict mapping an encoded word to the list of
                    (lang, code) pairs whose stop word file contains it.
      punct      -- the punctuation bytes which are replaced by spaces.
      spacetable -- 256-entry translation table built from punct.
    """

    def __init__(self, langzip=""):
        """langzip contains text files. Each text file is named like
        lang_code.txt (ie: french_cp1252.txt) and contains an encoded stop
        word list for the language.

        If langzip is empty, 'rcllatinstops.zip' located in the directory
        of this module is used.
        """
        if not langzip:
            langzip = os.path.join(os.path.dirname(__file__),
                                   'rcllatinstops.zip')

        self.readlanguages(langzip)

        # Table to translate from punctuation to spaces. The data is
        # binary, so both the punctuation set and the table are bytes.
        self.punct = b'''*?[].@+-,#_$%&={};.,:!"''' + b"'\n\r"
        spaces = len(self.punct) * b" "
        try:
            # Python 3: the table builder is a static method of bytes.
            maketrans = bytes.maketrans
        except AttributeError:
            # Python 2: bytes is str, which has no maketrans.
            maketrans = string.maketrans
        self.spacetable = maketrans(self.punct, spaces)

    def readlanguages(self, langzip):
        """Extract the stop word lists from the zip file.

        We build a merged dictionary (self.allwords) from the lists.
        The keys are the (encoded) words from all the files. The values
        are lists of the (lang, code) origin(s) of each word.

        Archive entries whose base name is not of the form lang_code.ext
        (directories, stray files) are skipped instead of aborting the
        load. The language part may itself contain underscores: the
        character set code is what follows the last one.
        """
        self.allwords = {}
        # The context manager closes the archive even if a read fails.
        with ZipFile(langzip) as archive:
            for fn in archive.namelist():
                langcode = os.path.splitext(os.path.basename(fn))[0]
                lang, sep, code = langcode.rpartition('_')
                if not (lang and sep and code):
                    continue
                for word in archive.read(fn).split():
                    self.allwords.setdefault(word, []).append((lang, code))

    def classify(self, rawtext):
        """Guess the language and character set of rawtext.

        rawtext is expected to be a byte string in an unknown 8-bit
        encoding.

        Returns a (lang, code, count) tuple where count is the number of
        frequent words of the text found in the best matching stop word
        list. When nothing matches, ('english', 'cp1252', 0) is returned:
        the caller can test the count to detect this case.
        """
        # Note: we can't use an re-based method to split the data because it
        # should be considered binary, not text.

        # Limit to reasonable size: cut at the first space found between
        # offsets 9000 and 10000, or at 9000 if there is none. The search
        # is bounded so that a far away space can't defeat the limit.
        if len(rawtext) > 10000:
            i = rawtext.find(b" ", 9000, 10000)
            if i == -1:
                i = 9000
            rawtext = rawtext[0:i]

        # Remove punctuation
        rawtext = rawtext.translate(self.spacetable)

        # Count the occurrences of each word of the text. We only use
        # the ntest most frequent words.
        ntest = 20
        wordcounts = {}
        for w in rawtext.split():
            wordcounts[w] = wordcounts.get(w, 0) + 1
        byfreq = sorted(wordcounts.items(),
                        key=lambda entry: entry[1], reverse=True)
        lfreq = [entry[0] for entry in byfreq[0:ntest]]

        # Build a dict (lang,code)->matchcount
        langstats = {}
        for w in lfreq:
            for lc in self.allwords.get(w, ()):
                langstats[lc] = langstats.get(lc, 0) + 1

        # If nothing matched, default to most common. Maybe we should
        # generate an error instead, but the caller can look at the count
        # anyway.
        if not langstats:
            return ('english', 'cp1252', 0)

        # Every recorded count is at least 1, so the best entry is always
        # a real match. On ties, the first entry encountered wins.
        (maxlang, maxcode), maxcount = max(langstats.items(),
                                           key=lambda entry: entry[1])
        return (maxlang, maxcode, maxcount)
|
|
|
|
|
|
if __name__ == "__main__":
    # Command line use: classify the file named by the first argument and
    # print "code lang count", or "UNKNOWN UNKNOWN 0" if nothing matched.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <file>\n" % sys.argv[0])
        sys.exit(1)

    # The data is in an unknown encoding: read it as raw bytes. The with
    # statement closes the file even if the read fails.
    with open(sys.argv[1], 'rb') as f:
        rawtext = f.read()

    classifier = European8859TextClassifier()

    lang, code, count = classifier.classify(rawtext)
    # A single parenthesized argument prints identically whether print is
    # a statement or a function.
    if count > 0:
        print("%s %s %d" % (code, lang, count))
    else:
        print("UNKNOWN UNKNOWN 0")
|
|
|