1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
|
"""Build a language detector model
The goal of this exercise is to train a linear classifier on text features
that represent sequences of up to 3 consecutive characters so as to be
recognize natural languages by using the frequencies of short character
sequences as 'fingerprints'.
"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
# The training data folder must be passed as first argument
languages_data_folder = sys.argv[1]
dataset = load_files(languages_data_folder)
# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
dataset.data, dataset.target, test_size=0.5)
# TASK: Build a vectorizer that splits strings into sequence of 1 to 3
# characters instead of word tokens
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char',
use_idf=False)
# TASK: Build a vectorizer / classifier pipeline using the previous analyzer
# the pipeline instance should stored in a variable named clf
clf = Pipeline([
('vec', vectorizer),
('clf', Perceptron()),
])
# TASK: Fit the pipeline on the training set
clf.fit(docs_train, y_train)
# TASK: Predict the outcome on the testing set in a variable named y_predicted
y_predicted = clf.predict(docs_test)
# Print the classification report
print(metrics.classification_report(y_test, y_predicted,
target_names=dataset.target_names))
# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)
#import matlotlib.pyplot as plt
#plt.matshow(cm, cmap=plt.cm.jet)
#plt.show()
# Predict the result on some short new sentences:
sentences = [
'This is a language detection test.',
'Ceci est un test de d\xe9tection de la langue.',
'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)
for s, p in zip(sentences, predicted):
print('The language of "%s" is "%s"' % (s, dataset.target_names[p]))
|