Skip to content

ENZE'S PRACTICAL #32

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
НовостиМосковской патриархии мощи царственных мучеников без надобности!Почему до сих пор не могут похоронить последних Романовых?Игра в царские кости: почему похороны Алексея и Марии отложили Вдова Децла заявила, что его родители объявили ей войнуБоец UFC Даррен Тилл восхитился российской продавщицей, нокаутировавшей покупателяBloomberg: США нашли способ отговорить Турцию от закупки российских С-400СМИ сообщили об аресте в России экс-министра ДНР ТимофееваВ квартире московского убийцы жены и сына обнаружили кровавые надписиУкраинский адмирал поведал, как ВСУ отказались стрелять по участникам «Крымской весны»Божене Рынски потребовали санитарный самолет в ИспаниюУстановлено время убийства семьи в Медведково: женщина успела отправить СМСДруг Жореса Алферова рассказал о событиях перед его смертьюВнезапное помешательство: подробности убийства мужем жены и 5-летнего сынаВ Москве мужчина зверски зарезал жену и сынаВрачи сообщили причину смерти Жореса АлфероваВселенская родительская суббота: значение и традицииВ Сети ответили экс-жене бывшего депутата, пожелавшей не видеть быдло в ресторанахЛукашенко объяснил, чем Медведев помешал признанию Белоруссией АбхазииЕвросоюз запретил Польше строить канал рядом с российской границейКриминалист оценил версию «инсценировки смерти» Игоря МалашенкоБывшая супруга экс-депутата Госдумы потребовала не пускать в рестораны быдлоВенесуэла предложила России свою нефтьСтали известны детали жизни Скрипаля после отравленияПьяный Александр Емельяненко устроил ДТП на "Мерседесе" с чеченскими номерамиВдова Кобзона решилась показать семейные сокровищаПартнерыАрхиерейский собор Русской православной церкви на заседании в среду принял решение о канонизации доктора Евгения Боткина, передает "Интерфакс". Глава синодального Отдела внешних церковных связей митрополит Волоколамский Иларион заявил, что это давно желанное решение, поскольку Боткин почитается как святой не только в Русской зарубежной церкви, но и во многих епархиях РПЦ и в медицинском сообществе.
373 changes: 373 additions & 0 deletions 2018-komp-ling/practicals/Practical 5/Pronounciation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,373 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from sklearn.linear_model import perceptron\n",
"\n",
"words = [] # The word, correct label and pronunciation\n",
"data = [] # Training examples, e.g. feature vectors\n",
"labels = [] # Correct labels"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"for line in open('pronunciation_data.tsv').readlines():\n",
" row = line.strip().split('\\t')\n",
" vec = []\n",
" for i in row[3].split(','):\n",
" vec.append(int(i))\n",
" data.append(vec)\n",
" labels.append(int(row[0]))\n",
" words.append((row[1], row[2], int(row[0])))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/enzeg/.local/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.py:152: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
" DeprecationWarning)\n"
]
},
{
"data": {
"text/plain": [
"Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=0.002,\n",
" fit_intercept=True, max_iter=None, n_iter=100, n_iter_no_change=5,\n",
" n_jobs=None, penalty=None, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"net = perceptron.Perceptron(n_iter=100, verbose=0, random_state=None, fit_intercept=True, eta0=0.002)\n",
"net.fit(data,labels)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"result = net.predict(data)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.995517705065\n"
]
}
],
"source": [
"total = 0\n",
"correct = 0\n",
"for i in range(0, len(words)):\n",
" \n",
" if result[i] == words[i][2]:\n",
" correct = correct + 1\n",
" else:\n",
" pass\n",
" total = total + 1\n",
"print(float(correct)/total)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next we split the dataset into a training part and a testing part."
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.3)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"StandardScaler(copy=True, with_mean=True, with_std=True)"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"sc = StandardScaler()\n",
"sc.fit(data_train)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"data_train_std = sc.transform(data_train)\n",
"data_test_std = sc.transform(data_test)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/enzeg/.local/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.py:152: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
" DeprecationWarning)\n"
]
},
{
"data": {
"text/plain": [
"Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=0.002,\n",
" fit_intercept=True, max_iter=None, n_iter=100, n_iter_no_change=5,\n",
" n_jobs=None, penalty=None, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"net_new = perceptron.Perceptron(n_iter=100, verbose=0, random_state=None, fit_intercept=True, eta0=0.002)\n",
"net_new.fit(data_train_std,labels_train)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"new_result = net_new.predict(data_test_std)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.99552238806\n"
]
}
],
"source": [
"total = 0\n",
"correct = 0\n",
"wrong = []\n",
"for i in range(0, len(data_test_std)):\n",
" \n",
" if new_result[i] == labels_test[i]:\n",
" correct = correct + 1\n",
" else:\n",
" wrong.append(data_test_std[i])\n",
" total = total + 1\n",
"print(float(correct)/total)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[array([ 1.09056903, -0.46242985, -0.26440669, -0.50879364, -1.12645255,\n",
" -0.33558281, -0.17811527, -0.20337381]),\n",
" array([ 1.09056903, -0.46242985, -0.26440669, -0.50879364, -1.12645255,\n",
" -0.33558281, -0.17811527, -0.20337381]),\n",
" array([-0.9169525 , -0.46242985, -0.26440669, -0.50879364, -1.12645255,\n",
" -0.33558281, -0.17811527, -0.20337381])]"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wrong"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The accuracy is basically the same."
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/enzeg/.local/lib/python2.7/site-packages/sklearn/linear_model/stochastic_gradient.py:152: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
" DeprecationWarning)\n"
]
},
{
"data": {
"text/plain": [
"Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=0.002,\n",
" fit_intercept=True, max_iter=None, n_iter=2, n_iter_no_change=5,\n",
" n_jobs=None, penalty=None, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"net_new2 = perceptron.Perceptron(n_iter=2, verbose=0, random_state=None, fit_intercept=True, eta0=0.002)\n",
"net_new2.fit(data_train_std,labels_train)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"new_result2 = net_new2.predict(data_test_std)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.994029850746\n"
]
}
],
"source": [
"total = 0\n",
"correct = 0\n",
"wrong = []\n",
"for i in range(0, len(data_test_std)):\n",
" \n",
" if new_result2[i] == labels_test[i]:\n",
" correct = correct + 1\n",
" else:\n",
" wrong.append(data_test_std[i])\n",
" total = total + 1\n",
"print(float(correct)/total)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[array([ 1.07798417, -0.4736177 , -0.25740402, -0.50180109, -1.16380818,\n",
" -0.33321469, -0.17228865, -0.20676598]),\n",
" array([ 1.07798417, -0.4736177 , -0.25740402, -0.50180109, -1.16380818,\n",
" -0.33321469, -0.17228865, -0.20676598]),\n",
" array([ 1.07798417, -0.4736177 , -0.25740402, -0.50180109, -1.16380818,\n",
" -0.33321469, -0.17228865, -0.20676598]),\n",
" array([ 1.07798417, -0.4736177 , -0.25740402, -0.50180109, -1.16380818,\n",
" -0.33321469, -0.17228865, -0.20676598])]"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wrong"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When the iteration is reduced from 100 to 2, there is one more false prediction."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
3 changes: 3 additions & 0 deletions 2018-komp-ling/practicals/Practical 5/Readme
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sklearn and unigram tagging

Since I do not have a tagged corpus to train on, Yandex's mystem package is used to tag the part of speech information.
Loading