Name: Anonymous 2011-06-24 18:25
#!/usr/bin/python2
import sys
import bz2
def classify(text, langs=('english', 'german', 'french')):
    """Rank candidate languages by how cheaply ``text`` compresses with each.

    For every name in ``langs`` a reference corpus is read from the file
    ``<name>.txt`` (the name is used verbatim as the path prefix).  A
    language's score is the number of extra bytes bz2 needs to compress
    ``corpus + text`` compared with the corpus alone; a smaller score means
    the sample shares more structure with that corpus.

    Args:
        text: The sample to classify.  Byte strings are used as-is; text
            (unicode) strings are encoded as UTF-8 first.
        langs: Corpus names to compare against.

    Returns:
        A list of the distinct names in ``langs``, sorted by ascending
        score, i.e. the best match comes first.

    Raises:
        IOError (OSError on Python 3): if a corpus file cannot be opened.
    """
    # bz2 works on bytes only, so normalise the sample once, outside the loop.
    if not isinstance(text, bytes):
        text = text.encode('utf-8')

    results = {}
    for lang in langs:
        # Binary mode keeps the corpus bytes untouched and yields the bytes
        # object that bz2.compress requires on every Python version.
        with open(lang + '.txt', 'rb') as f:
            corpus = f.read()
        baseline = len(bz2.compress(corpus))
        # Incremental cost of the sample given this corpus as context.
        results[lang] = len(bz2.compress(corpus + text)) - baseline
    return sorted(results, key=results.get)
if __name__ == '__main__':
    # Classify whatever arrives on stdin and report the top-ranked language.
    ranking = classify(sys.stdin.read())
    best_guess = ranking[0]
    print("Most likely %s." % best_guess.capitalize())
$ wget -qO - http://www.gutenberg.org/ebooks/31469.txt.utf8 | ./classific.py
Most likely English.
$ wget -qO - http://www.gutenberg.org/ebooks/22367.txt.utf8 | ./classific.py
Most likely German.
$ wget -qO - http://www.gutenberg.org/ebooks/4968.txt.utf8 | ./classific.py
Most likely French.