karaoke filter/language guesser: use sets to store common words

This commit is contained in:
Jean-Francois Dockes 2012-01-04 16:16:29 +01:00
parent 9aeda04ccb
commit 6d651cf043

View File

@@ -21,10 +21,13 @@ import os.path
from zipfile import ZipFile from zipfile import ZipFile
class European8859TextClassifier: class European8859TextClassifier:
def __init__(self, langzip): def __init__(self, langzip=""):
"""langzip contains text files. Each text file is named like lang_code.txt """langzip contains text files. Each text file is named like lang_code.txt
(ie: french_cp1252.txt) and contains an encoded stop word list for the language""" (ie: french_cp1252.txt) and contains an encoded stop word list for the language"""
if langzip == "":
langzip = os.path.join(os.path.dirname(__file__), 'rcllatinstops.zip')
self.langtables = self.readlanguages(langzip) self.langtables = self.readlanguages(langzip)
# Table to translate from punctuation to spaces # Table to translate from punctuation to spaces
@@ -41,7 +44,7 @@ class European8859TextClassifier:
langs = [] langs = []
for fn in langfiles: for fn in langfiles:
text = zip.read(fn) text = zip.read(fn)
words = text.split() words = set(text.split())
langcode = os.path.basename(fn) langcode = os.path.basename(fn)
langcode = os.path.splitext(langcode)[0] langcode = os.path.splitext(langcode)[0]
(lang,code) = langcode.split('_') (lang,code) = langcode.split('_')
@@ -96,10 +99,7 @@ if __name__ == "__main__":
rawtext = f.read() rawtext = f.read()
f.close() f.close()
dir = os.path.dirname(__file__) classifier = European8859TextClassifier()
langszip = os.path.join(dir, 'rcllatinstops.zip')
classifier = European8859TextClassifier(langszip)
lang,code,count = classifier.classify(rawtext) lang,code,count = classifier.classify(rawtext)
if count > 0: if count > 0: