Stopword-based charset guessing: use a single merged dictionary for all words instead of one dictionary per language/charset. The speed improvement is very marginal, but the code is somewhat cleaner.

This commit is contained in:
Jean-Francois Dockes 2012-01-20 14:45:34 +01:00
parent f9a6be302b
commit dc3aa5d564

View File

@ -28,69 +28,83 @@ class European8859TextClassifier:
if langzip == "": if langzip == "":
langzip = os.path.join(os.path.dirname(__file__), 'rcllatinstops.zip') langzip = os.path.join(os.path.dirname(__file__), 'rcllatinstops.zip')
self.langtables = self.readlanguages(langzip) self.readlanguages(langzip)
# Table to translate from punctuation to spaces # Table to translate from punctuation to spaces
self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "\n\r" self.punct = '''*?[].@+-,#_$%&={};.,:!"''' + "'\n\r"
spaces = "" spaces = len(self.punct) * " "
for c in self.punct:
spaces += " "
self.spacetable = string.maketrans(self.punct, spaces) self.spacetable = string.maketrans(self.punct, spaces)
def readlanguages(self, langzip):
    """Load the stop word lists from a zip archive into self.allwords.

    Every archive member must be named <lang>_<code>.<ext> (any directory
    part is ignored); its content is split on whitespace to get the words.

    The result is a single merged dictionary stored in self.allwords:
    the keys are the words from all the files, and each value is the list
    of (lang, code) pairs naming the file(s) the word came from.

    langzip: path of the zip archive holding the stop word lists.
    Returns nothing; raises ValueError if a member name does not split
    into exactly two '_'-separated parts.
    """
    self.allwords = {}
    archive = ZipFile(langzip)
    try:
        for fn in archive.namelist():
            langcode = os.path.splitext(os.path.basename(fn))[0]
            (lang, code) = langcode.split('_')
            # Deduplicate through a set: a word repeated inside one list
            # must be recorded only once for that (lang, code), else it
            # would be counted several times when matching a text.
            for word in set(archive.read(fn).split()):
                self.allwords.setdefault(word, []).append((lang, code))
    finally:
        # Release the archive file handle even if a member is malformed.
        archive.close()
def classify(self, rawtext):
    """Guess the language and charset of rawtext from its stop words.

    The most frequent words of the text are looked up in self.allwords
    (word -> list of (lang, code) pairs) and the (lang, code) pair with
    the most matching words wins.

    rawtext: the data to classify; it is treated as binary, not text.
    Returns a (lang, code, matchcount) tuple. When no word matches, the
    result is ('english', 'cp1252', 0): maybe an error should be
    generated instead, but the caller can look at the count anyway.
    """
    # Note: we can't use an re-based method to split the data because it
    # should be considered binary, not text.

    # Limit to reasonable size. The cut is made on a space found between
    # offsets 9000 and 10000, else at 9000. The search is bounded so that
    # data with no space in that window is still truncated instead of
    # being kept up to some far away space.
    if len(rawtext) > 10000:
        i = rawtext.find(" ", 9000, 10000)
        if i == -1:
            i = 9000
        rawtext = rawtext[0:i]

    # Remove punctuation
    rawtext = rawtext.translate(self.spacetable)

    # Count the frequency of each word of the text
    wordcounts = {}
    for w in rawtext.split():
        wordcounts[w] = wordcounts.get(w, 0) + 1

    # We only use the ntest most frequent words.
    ntest = 20
    lfreq = sorted(wordcounts, key=wordcounts.get, reverse=True)[0:ntest]

    # Build a dict (lang,code)->matchcount
    langstats = {}
    for w in lfreq:
        for lc in self.allwords.get(w, ()):
            langstats[lc] = langstats.get(lc, 0) + 1

    # No match at all: default to the most common language and charset.
    if not langstats:
        return ('english', 'cp1252', 0)

    # Best match; on equal counts the first pair met while iterating wins.
    best = max(langstats, key=langstats.get)
    (maxlang, maxcode) = best
    return (maxlang, maxcode, langstats[best])