diff --git a/SentimentAnalysis_RNN.ipynb b/SentimentAnalysis_RNN.ipynb new file mode 100644 index 0000000..93e7f5a --- /dev/null +++ b/SentimentAnalysis_RNN.ipynb @@ -0,0 +1,878 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "efyTtfc_oJkh" + }, + "source": [ + "## 1. Adding imports & installing necessary packages ##" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GaVIBWlyoKz3", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1fbc85d8-fc75-4df9-dd99-b9d2968fcf46" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/gdrive\n" + ] + } + ], + "source": [ + "### run this if using google colab to mount google drive as local storage\n", + "\n", + "from google.colab import drive\n", + "import os\n", + "drive.mount('/content/gdrive')\n", + "\n", + "repo_path = '/content/gdrive/My Drive/colab/NLP-Bootcamp/'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sdBgdze84r8s" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import collections\n", + "%matplotlib inline\n", + "\n", + "# Import modules to calculate accuracy and confusion matrix\n", + "from sklearn.metrics import confusion_matrix, accuracy_score" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YLttTMckfNa_" + }, + "source": [ + "## 2. 
Loading Data ##" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "w9EA4jMv4ywO", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "outputId": "4590a838-52eb-45f6-b056-bb8685147946" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " target ids user \\\n", + "0 p 1978186076 ceruleanbreeze \n", + "1 p 1994697891 enthusiasticjen \n", + "2 p 2191885992 LifeRemixed \n", + "3 p 1753662211 lovemandy \n", + "4 p 2177442789 _LOVELYmanu \n", + "\n", + " text \n", + "0 @nocturnalie Anyway, and now Abby and I share ... \n", + "1 @JoeGigantino Few times I'm trying to leave co... \n", + "2 @AngieGriffin Good Morning Angie I'll be in t... \n", + "3 had a good day driving up mountains, visiting ... \n", + "4 downloading some songs i love lady GaGa. " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetidsusertext
0p1978186076ceruleanbreeze@nocturnalie Anyway, and now Abby and I share ...
1p1994697891enthusiasticjen@JoeGigantino Few times I'm trying to leave co...
2p2191885992LifeRemixed@AngieGriffin Good Morning Angie I'll be in t...
3p1753662211lovemandyhad a good day driving up mountains, visiting ...
4p2177442789_LOVELYmanudownloading some songs i love lady GaGa.
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "### Resolve the data directory automatically: the Drive-backed repo when the Colab mount cell\n", + "### above has defined repo_path, otherwise ./data/ next to the notebook (local run).\n", + "import os  # imported here as well because the Colab-only cell is skipped on a local run\n", + "\n", + "DATA = os.path.join(repo_path, 'data') if 'repo_path' in globals() else './data/'\n", + "trainData = os.path.join(DATA, 'sentiment140_160k_tweets_train.csv')\n", + "testData = os.path.join(DATA, 'sentiment140_test.csv')\n", + "\n", + "train = pd.read_csv(trainData)\n", + "# NOTE(review): `test` is loaded here but no later cell in this notebook reads it\n", + "test = pd.read_csv(testData)\n", + "\n", + "train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DE0NVFR9s4o4" + }, + "source": [ + "Looking at distribution of *'positives'* & *'negatives'* samples in train dataset " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MF2-MSXFoJkr", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1ba63640-3a77-48e5-f620-11b8260ada21" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Counter({'p': 80000, 'n': 79985})" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "collections.Counter(train['target'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vwyLXx_moJks", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 293 + }, + "outputId": "aceb2fa0-4236-46ab-a950-d6eb656c9cbb" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 5 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAEDCAYAAADX1GjKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAVBUlEQVR4nO3df6zd9X3f8ecrdggkKdiEO4vZZkaLlcphhcAduEtVrXg1Nu1qSw0I1s1XyMKbIFs7TducaZI1CFMiVWNlIkhW8bCjLg5ljeylpq7lUFXtZOLLjwKGIN9AiG0BvsXGJGFAnb73x/m4Pbk51/cYrs81+PmQjs7n+/58vt/zOdK1X/d8v59zv6kqJElntw/N9AQkSTPPMJAkGQaSJMNAkoRhIEnCMJAkAbNnegLv1kUXXVSLFi2a6WlI0vvGY4899pdVNdSr730bBosWLWJ0dHSmpyFJ7xtJXpqsz9NEkiTDQJJkGEiSMAwkSRgGkiT6DIMk/zbJviTPJPlaknOTXJrk0SRjSb6e5Jw29iNte6z1L+o6zhda/fkk13XVV7TaWJL10/0mJUknN2UYJJkP/BtguKouA2YBNwFfBu6uqk8CR4G1bZe1wNFWv7uNI8mStt+ngRXAV5LMSjILuBdYCSwBbm5jJUkD0u9potnAeUlmAx8FXgauBR5q/ZuB1a29qm3T+pclSatvraq3q+pFYAy4uj3GquqFqnoH2NrGSpIGZMovnVXVoSS/DXwf+H/AHwOPAa9X1fE27CAwv7XnAwfavseTHAM+0ep7ug7dvc+BCfVres0lyTpgHcAll1wy1dRn3KL1fzjTU/hA+d6XfmWmp/CB4s/n9Hq//3z2c5poLp3f1C8F/i7wMTqneQauqjZW1XBVDQ8N9fxGtSTpXejnNNE/AV6sqvGq+ivgD4DPAnPaaSOABcCh1j4ELARo/RcAr3XXJ+wzWV2SNCD9hMH3gaVJPtrO/S8DngUeAT7XxowA21p7e9um9X+rOjda3g7c1FYbXQosBr4N7AUWt9VJ59C5yLz9vb81SVK/+rlm8GiSh4DHgePAE8BG4A+BrUm+2Gr3t13uB76aZAw4Quc/d6pqX5IH6QTJceD2qvoxQJLPAzvprFTaVFX7pu8tSpKm0tdfLa2qDcCGCeUX6KwEmjj2LeCGSY5zF3BXj/oOYEc/c5EkTT+/gSxJMgwkSYaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSfQRBkk+leTJrscbSX4ryYVJdiXZ357ntvFJck+SsSRPJbmy61gjbfz+JCNd9auSPN32uafdXlOSNCBThkFVPV9VV1TVFcBVwJvAN4D1wO6qWgzsbtsAK+nc33gxsA64DyDJhXTulnYNnTukbTgRIG3MrV37rZiWdydJ6supniZaBny3ql4CVgGbW30zsLq1VwFbqmMPMCfJxcB1wK6qOlJVR4FdwIrWd35V7amqArZ0HUuSNACnGgY3AV9r7XlV9XJrvwLMa+35wIGufQ622snqB3vUJUkD0ncYJDkH+DXg9yf2td/oaxrnNdkc1iUZTTI6Pj5+ul9Oks4ap/LJYCXweFW92rZfbad4aM+HW/0QsLBrvwWtdrL6gh71n1JVG6tquKqGh4aGTmHqkqSTOZUwuJm/PUUEsB04sSJoBNjWVV/TVhUtBY6100k7geVJ5rYLx8uBna3vjSRL2yqiNV3HkiQNwOx+BiX5GPDLwL/sKn8JeDDJWuAl4MZW3wFcD4zRWXl0C0BVHUlyJ7C3jbujqo609m3AA8B5wMPtIUkakL7CoKp+BHxiQu01OquLJo4t4PZJjrMJ2NSjPgpc1s9cJEnTz28gS5IMA0mSYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA
0kShoEkiT7DIMmcJA8l+U6S55L8fJILk+xKsr89z21jk+SeJGNJnkpyZddxRtr4/UlGuupXJXm67XNPuxeyJGlA+v1k8DvAH1XVzwKXA88B64HdVbUY2N22AVYCi9tjHXAfQJILgQ3ANcDVwIYTAdLG3Nq134r39rYkSadiyjBIcgHwi8D9AFX1TlW9DqwCNrdhm4HVrb0K2FIde4A5SS4GrgN2VdWRqjoK7AJWtL7zq2pPu3/ylq5jSZIGoJ9PBpcC48D/TPJEkt9N8jFgXlW93Ma8Asxr7fnAga79D7bayeoHe9QlSQPSTxjMBq4E7quqzwA/4m9PCQHQfqOv6Z/eT0qyLsloktHx8fHT/XKSdNboJwwOAger6tG2/RCdcHi1neKhPR9u/YeAhV37L2i1k9UX9Kj/lKraWFXDVTU8NDTUx9QlSf2YMgyq6hXgQJJPtdIy4FlgO3BiRdAIsK21twNr2qqipcCxdjppJ7A8ydx24Xg5sLP1vZFkaVtFtKbrWJKkAZjd57h/DfxeknOAF4Bb6ATJg0nWAi8BN7axO4DrgTHgzTaWqjqS5E5gbxt3R1Udae3bgAeA84CH20OSNCB9hUFVPQkM9+ha1mNsAbdPcpxNwKYe9VHgsn7mIkmafn4DWZJkGEiSDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSfQZBkm+l+TpJE8mGW21C5PsSrK/Pc9t9SS5J8lYkqeSXNl1nJE2fn+Ska76Ve34Y23fTPcblSRN7lQ+GfxSVV1RVSduf7ke2F1Vi4HdbRtgJbC4PdYB90EnPIANwDXA1cCGEwHSxtzatd+Kd/2OJEmn7L2cJloFbG7tzcDqrvqW6tgDzElyMXAdsKuqjlTVUWAXsKL1nV9Ve9r9k7d0HUuSNAD9hkEBf5zksSTrWm1eVb3c2q8A81p7PnCga9+DrXay+sEedUnSgMzuc9wvVNWhJH8H2JXkO92dVVVJavqn95NaEK0DuOSSS073y0nSWaOvTwZVdag9Hwa+Qeec/6vtFA/t+XAbfghY2LX7glY7WX1Bj3qveWysquGqGh4aGupn6pKkPkwZBkk+luRnTrSB5cAzwHbgxIqgEWBba28H1rRVRUuBY+100k5geZK57cLxcmBn63sjydK2imhN17EkSQPQz2miecA32mrP2cD/qqo/SrIXeDDJWuAl4MY2fgdwPTAGvAncAlBVR5LcCext4+6oqiOtfRvwAHAe8HB7SJIGZMowqKoXgMt71F8DlvWoF3D7JMfaBGzqUR8FLutjvpKk08BvIEuSDANJkmEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJIlTCIMks5I8keSbbfvSJI8mGUvy9STntPpH2vZY61/UdYwvtPrzSa7rqq9otbEk66fv7UmS+nEqnwx+E3iua/vLwN1V9UngKLC21dcCR1v97jaOJEuAm4BPAyuAr7SAmQXcC6wElgA3t7GSpAHpKwySLAB+Bfjdth3gWuChNmQzsLq1V7VtWv+yNn4VsLWq3q6qF4Ex4Or2GKuqF6rqHWBrGytJGpB+Pxn8d+A/AH/dtj8BvF5Vx9v2QWB+a88HDgC0/mNt/N/UJ+wzWV2SNCBThkGSXwUOV9VjA5jPVHNZl2Q0yej4+PhMT0eSPjD6+WTwWeDXknyPzimca4HfAeYkmd3GLAAOtfYhYCFA678AeK27PmGfyeo/pao2VtVwVQ0PDQ31MXVJUj+mDIOq+kJVLaiqRXQuAH+rqn4DeAT4XBs2Amxr7e1tm9b/raqqVr+prTa6FFgMfBvYCyxuq5POaa+xfVrenSSpL7OnHjKp/whsTfJF4Ang/la/H/hqkjHgCJ3/3KmqfUkeBJ4FjgO3V9WPAZJ8HtgJzAI2VdW+9zAvSdIpOqUwqKo/Af6kt
V+gsxJo4pi3gBsm2f8u4K4e9R3AjlOZiyRp+vgNZEmSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJNFHGCQ5N8m3k/xFkn1J/kurX5rk0SRjSb7e7l9Mu8fx11v90SSLuo71hVZ/Psl1XfUVrTaWZP30v01J0sn088ngbeDaqrocuAJYkWQp8GXg7qr6JHAUWNvGrwWOtvrdbRxJltC5H/KngRXAV5LMSjILuBdYCSwBbm5jJUkDMmUYVMcP2+aH26OAa4GHWn0zsLq1V7VtWv+yJGn1rVX1dlW9CIzRuYfy1cBYVb1QVe8AW9tYSdKA9HXNoP0G/yRwGNgFfBd4vaqOtyEHgfmtPR84AND6jwGf6K5P2GeyuiRpQPoKg6r6cVVdASyg85v8z57WWU0iyboko0lGx8fHZ2IKkvSBdEqriarqdeAR4OeBOUlmt64FwKHWPgQsBGj9FwCvddcn7DNZvdfrb6yq4aoaHhoaOpWpS5JOop/VRENJ5rT2ecAvA8/RCYXPtWEjwLbW3t62af3fqqpq9ZvaaqNLgcXAt4G9wOK2OukcOheZt0/Hm5Mk9Wf21EO4GNjcVv18CHiwqr6Z5Flga5IvAk8A97fx9wNfTTIGHKHznztVtS/Jg8CzwHHg9qr6MUCSzwM7gVnApqraN23vUJI0pSnDoKqeAj7To/4CnesHE+tvATdMcqy7gLt61HcAO/qYryTpNPAbyJIkw0CSZBhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgSaK/eyAvTPJIkmeT7Evym61+YZJdSfa357mtniT3JBlL8lSSK7uONdLG708y0lW/KsnTbZ97kuR0vFlJUm/9fDI4Dvy7qloCLAVuT7IEWA/srqrFwO62DbCSzs3uFwPrgPugEx7ABuAaOrfL3HAiQNqYW7v2W/He35okqV9ThkFVvVxVj7f2D4DngPnAKmBzG7YZWN3aq4At1bEHmJPkYuA6YFdVHamqo8AuYEXrO7+q9lRVAVu6jiVJGoBTumaQZBHwGeBRYF5Vvdy6XgHmtfZ84EDXbgdb7WT1gz3qkqQB6TsMknwc+N/Ab1XVG9197Tf6mua59ZrDuiSjSUbHx8dP98tJ0lmjrzBI8mE6QfB7VfUHrfxqO8VDez7c6oeAhV27L2i1k9UX9Kj/lKraWFXDVTU8NDTUz9QlSX3oZzVRgPuB56rqv3V1bQdOrAgaAbZ11de0VUVLgWPtdNJOYHmSue3C8XJgZ+t7I8nS9lpruo4lSRqA2X2M+SzwL4CnkzzZav8J+BLwYJK1wEvAja1vB3A9MAa8CdwCUFVHktwJ7G3j7qiqI619G/AAcB7wcHtIkgZkyjCoqj8DJlv3v6zH+AJun+RYm4BNPeqjwGVTzUWSdHr4DWRJkmEgSTIMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCTR3z2QNyU5nOSZrtqFSXYl2d+e57Z6ktyTZCzJU0mu7NpnpI3fn2Skq35VkqfbPve0+yBLkgaon08GDwArJtTWA7urajGwu20DrAQWt8c64D7ohAewAbgGuBrYcCJA2phbu/ab+FqSpNNsyjCoqj8FjkworwI2t/ZmYHVXfUt17AHmJLkYuA7YVVVHquoosAtY0frOr6o97d7JW7qOJUkakHd7zWBeVb3c2q8A81p7PnCga9zBVjtZ/WCPuiRpgN7zBeT2G31Nw1ymlGRdktEko+Pj44N4SUk6K7zbMHi1neKhPR9u9UPAwq5xC1rtZPUFPeo9VdXGqhququGhoaF3OXVJ0kTvNgy2AydWBI0A27rqa9qqoqXAsXY6aSewPMncduF4ObCz9b2RZGlbRbSm61iSpAGZPdWAJF8D/jFwUZKDd
FYFfQl4MMla4CXgxjZ8B3A9MAa8CdwCUFVHktwJ7G3j7qiqExelb6OzYuk84OH2kCQN0JRhUFU3T9K1rMfYAm6f5DibgE096qPAZVPNQ5J0+vgNZEmSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJHEGhUGSFUmeTzKWZP1Mz0eSziZnRBgkmQXcC6wElgA3J1kys7OSpLPHGREGwNXAWFW9UFXvAFuBVTM8J0k6a8ye6Qk084EDXdsHgWsmDkqyDljXNn+Y5PkBzO1scBHwlzM9iankyzM9A80Qfz6nz9+brONMCYO+VNVGYONMz+ODJsloVQ3P9DykXvz5HIwz5TTRIWBh1/aCVpMkDcCZEgZ7gcVJLk1yDnATsH2G5yRJZ40z4jRRVR1P8nlgJzAL2FRV+2Z4WmcTT73pTObP5wCkqmZ6DpKkGXamnCaSJM0gw0CSZBhIks6QC8gavCQfAX4dWETXz0FV3TFTc5JOSHIucBvwC0ABfwbcV1VvzejEPsAMg7PXNuAY8Bjw9gzPRZpoC/AD4H+07X8GfBW4YcZm9AHnaqKzVJJnquqymZ6H1EuSZ6tqyVQ1TR+vGZy9/m+SfzDTk5Am8XiSpSc2klwDjM7gfD7w/GRwlkryLPBJ4EU6p4kCVFX93IxOTAKSPAd8Cvh+K10CPA8cx5/T08IwOEsl6fnXC6vqpUHPRZposp/PE/w5nX6GgSTJawaSJMNAkoRhIPWUZE6S2wbwOqu937fOBIaB1NscOt+A7Us63s2/p9WAYaAZ5wVkqYckW4FVdJYzPgL8HDAX+DDwn6tqW5JFdO7B8ShwFXA9sAb458A4nft6P1ZVv53k7wP3AkPAm8CtwIXAN+l8E/wY8OtV9d0BvUXpJ/jnKKTe1gOXVdUVSWYDH62qN5JcBOxJcuJOfIuBkarak+Qf0vl7T5fTCY3H6fy5D+jcoOVfVdX+9gWqr1TVte0436yqhwb55qSJDANpagH+a5JfBP4amA/Ma30vVdWe1v4ssK39MbW3kvwfgCQfB/4R8PtJThzzI4OavNQPw0Ca2m/QOb1zVVX9VZLvAee2vh/1sf+HgNer6orTND/pPfMCstTbD4Cfae0LgMMtCH4JmOzbsX8O/NMk57ZPA78KUFVvAC8muQH+5mLz5T1eR5oxhoHUQ1W9Bvx5kmeAK4DhJE/TuUD8nUn22QtsB54CHgaepnNhGDqfLtYm+QtgH52L0wBbgX+f5Il2kVmaEa4mkqZRko9X1Q+TfBT4U2BdVT0+0/OSpuI1A2l6bWxfIjsX2GwQ6P3CTwaSJK8ZSJIMA0kShoEkCcNAkoRhIEnCMJAkAf8fRluB5QL2emMAAAAASUVORK5CYII=\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "train.groupby('target').size().plot(kind='bar')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xyHV7gCCxCpO" + }, + "source": [ + "We will find that it is a relatively well-balanced dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5RK3QPUnbUFq" + }, + "source": [ + "## 3. 
Data (Text) Preprocessing ##" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "N_p8SQxQMKrq" + }, + "outputs": [], + "source": [ + "### Contraction / shorthand expansion table.\n", + "### Keys are lowercase whole tokens: preprocess_text lowercases the tweet, splits on whitespace\n", + "### and replaces a word only when it matches a key exactly. Every key must appear once —\n", + "### in a dict literal a repeated key silently overrides the earlier entry.\n", + "\n", + "appos = {\n", + "\"aren't\" : \"are not\",\n", + "\"can't\" : \"cannot\",\n", + "\"cant\" : \"cannot\",\n", + "\"couldn't\" : \"could not\",\n", + "\"didn't\" : \"did not\",\n", + "\"doesn't\" : \"does not\",\n", + "\"don't\" : \"do not\",\n", + "\"hadn't\" : \"had not\",\n", + "\"hasn't\" : \"has not\",\n", + "\"haven't\" : \"have not\",\n", + "\"he'd\" : \"he would\",\n", + "\"he'll\" : \"he will\",\n", + "\"he's\" : \"he is\",\n", + "# \"'d\" is expanded as \"would\" for every pronoun in this table (he'd, she'd, we'd, ...)\n", + "\"i'd\" : \"I would\",\n", + "\"i'll\" : \"I will\",\n", + "\"i'm\" : \"I am\",\n", + "\"im\" : \"I am\",\n", + "\"isn't\" : \"is not\",\n", + "\"it's\" : \"it is\",\n", + "\"it'll\":\"it will\",\n", + "\"i've\" : \"I have\",\n", + "\"let's\" : \"let us\",\n", + "\"mightn't\" : \"might not\",\n", + "\"mustn't\" : \"must not\",\n", + "\"shan't\" : \"shall not\",\n", + "\"she'd\" : \"she would\",\n", + "\"she'll\" : \"she will\",\n", + "\"she's\" : \"she is\",\n", + "\"shouldn't\" : \"should not\",\n", + "\"that's\" : \"that is\",\n", + "\"there's\" : \"there is\",\n", + "\"they'd\" : \"they would\",\n", + "\"they'll\" : \"they will\",\n", + "\"they're\" : \"they are\",\n", + "\"they've\" : \"they have\",\n", + "\"we'd\" : \"we would\",\n", + "\"we're\" : \"we are\",\n", + "\"weren't\" : \"were not\",\n", + "\"we've\" : \"we have\",\n", + "\"what'll\" : \"what will\",\n", + "\"what're\" : \"what are\",\n", + "\"what's\" : \"what is\",\n", + "\"what've\" : \"what have\",\n", + "\"where's\" : \"where is\",\n", + "\"who'd\" : \"who would\",\n", + "\"who'll\" : \"who will\",\n", + "\"who're\" : \"who are\",\n", + "\"who's\" : \"who is\",\n", + "\"who've\" : \"who have\",\n", + "\"won't\" : \"will not\",\n", + "\"wouldn't\" : \"would not\",\n", + "\"you'd\" : \"you would\",\n", + 
"\"you'll\" : \"you will\",\n", + "\"you're\" : \"you are\",\n", + "\"you've\" : \"you have\",\n", + "\"'re\": \" are\",\n", + "\"wasn't\": \"was not\",\n", + "\"we'll\" : \"we will\",\n", + "\"gg\" : \"going\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7IbqS4m-4-EX", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "outputId": "38904185-6b96-49f0-a68c-05012f58de30" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " target ids user \\\n", + "0 p 1978186076 ceruleanbreeze \n", + "1 p 1994697891 enthusiasticjen \n", + "2 p 2191885992 LifeRemixed \n", + "3 p 1753662211 lovemandy \n", + "4 p 2177442789 _LOVELYmanu \n", + "\n", + " text \\\n", + "0 @nocturnalie Anyway, and now Abby and I share ... \n", + "1 @JoeGigantino Few times I'm trying to leave co... \n", + "2 @AngieGriffin Good Morning Angie I'll be in t... \n", + "3 had a good day driving up mountains, visiting ... \n", + "4 downloading some songs i love lady GaGa. \n", + "\n", + " ugc \n", + "0 anyway and now abby and i share all our crops ... \n", + "1 few times I am trying to leave comments in you... \n", + "2 good morning angie I will be in the atl july 8... \n", + "3 had a good day driving up mountains visiting k... \n", + "4 downloading some songs i love lady gaga " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetidsusertextugc
0p1978186076ceruleanbreeze@nocturnalie Anyway, and now Abby and I share ...anyway and now abby and i share all our crops ...
1p1994697891enthusiasticjen@JoeGigantino Few times I'm trying to leave co...few times I am trying to leave comments in you...
2p2191885992LifeRemixed@AngieGriffin Good Morning Angie I'll be in t...good morning angie I will be in the atl july 8...
3p1753662211lovemandyhad a good day driving up mountains, visiting ...had a good day driving up mountains visiting k...
4p2177442789_LOVELYmanudownloading some songs i love lady GaGa.downloading some songs i love lady gaga
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "import re\n", + "\n", + "def preprocess_text(sentence):\n", + "    \"\"\"Normalise one tweet row into lowercase, space-separated alphanumeric tokens.\n", + "\n", + "    sentence: a DataFrame row (or any mapping) whose 'text' entry holds the raw tweet.\n", + "    Returns the cleaned string: URLs and @mentions removed, contractions expanded via\n", + "    the `appos` table, HTML entities dropped, and every remaining run of characters that\n", + "    is not a Latin/Cyrillic letter or a digit collapsed into a single space.\n", + "    \"\"\"\n", + "    # Strip links and @mentions first, while the original casing is still intact\n", + "    text = re.sub(r'((www\\.[^\\s]+)|(https?://[^\\s]+))', '', sentence['text'])\n", + "    text = re.sub(r'@[^\\s]+', '', text)\n", + "    # Expand contractions token by token; only exact lowercase whole-token matches are replaced\n", + "    words = text.lower().split()\n", + "    reformed = \" \".join(appos.get(word, word) for word in words)\n", + "    # Drop HTML entities such as &amp; or &quot;\n", + "    text = re.sub(r'&[^\\s]+;', '', reformed)\n", + "    # Keep letters and ALL digits: a 1-9 range would also delete every '0' ('2009' -> '2 9')\n", + "    text = re.sub(r'[^a-zA-Zа-яА-Я0-9]+', ' ', text)\n", + "    text = re.sub(r' +', ' ', text)\n", + "    return text.strip()\n", + "\n", + "# Work on a copy so that adding the 'ugc' column does not silently mutate `train`\n", + "preprocess = train.copy()\n", + "preprocess['ugc'] = preprocess.apply(preprocess_text, axis=1)\n", + "\n", + "preprocess.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CUsyZDEqK_4r" + }, + "source": [ + "## 4. Sentiment Analysis using Deep Learning-based Method: RNN ##" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ReMymperLMAh" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report\n", + "from keras.preprocessing.text import Tokenizer\n", + "from keras.preprocessing import sequence\n", + "\n", + "# Hyperparameters\n", + "max_features = 4000  # passed to Tokenizer(num_words=...) and used as the Embedding input_dim\n", + "#num_words = 20\n", + "embedding_size = 256\n", + "lstm_dim = 256\n", + "batch_size = 64\n", + "num_epochs = 10\n", + "seed = 42  # fixes the train/test split below so reruns are comparable\n", + "\n", + "# Create tokenizer by converting text into sequence of integers\n", + "tokenizer = Tokenizer(num_words=max_features, split=' ')\n", + "tokenizer.fit_on_texts(preprocess['ugc'].values)\n", + "\n", + "X = tokenizer.texts_to_sequences(preprocess['ugc'].values)\n", + "X = sequence.pad_sequences(X)\n", + "#X = sequence.pad_sequences(X, maxlen=num_words)\n", + "# One-hot labels; get_dummies orders the columns alphabetically by label value\n", + "y = pd.get_dummies(preprocess['target']).values\n", + "\n", + "# Size of the fitted word index (+1 for the reserved 0 index). Informational only:\n", + "# the Embedding layer is sized by max_features, not by this value.\n", + "vocab_size = len(tokenizer.word_index) + 1" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fqZyya_Y6xhl", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "44c5cbd1-51bb-45c9-f107-45b513c341a1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.\n", + "WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + " Layer (type) Output Shape Param # \n", + "=================================================================\n", + " embedding (Embedding) (None, 38, 256) 1024000 \n", + " \n", + " spatial_dropout1d (SpatialD (None, 38, 256) 0 \n", + " ropout1D) \n", + " \n", + " lstm (LSTM) (None, 38, 256) 525312 \n", + " \n", + " lstm_1 (LSTM) (None, 128) 197120 \n", + " \n", + " dense (Dense) (None, 2) 258 \n", + " \n", + "=================================================================\n", + "Total params: 1,746,690\n", + "Trainable params: 1,746,690\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "# Define model\n", + "from keras import Sequential\n", + "from keras.layers import Embedding, LSTM, Dropout, Dense, SpatialDropout1D\n", + "\n", + "model = Sequential()\n", + "model.add(Embedding(input_dim=max_features,\n", + " output_dim=embedding_size,\n", + " input_length=X.shape[1]))\n", + "model.add(SpatialDropout1D(0.4))\n", + "model.add(LSTM(units=lstm_dim,\n", + " dropout=0.2,\n", + " activation='tanh',\n", + " recurrent_dropout=0.2,\n", + " recurrent_activation='sigmoid',\n", + " return_sequences=True))\n", + 
"model.add(LSTM(units=128,\n", + " dropout=0.2,\n", + " activation='tanh',\n", + " recurrent_dropout=0.2,\n", + " recurrent_activation='sigmoid'))\n", + "# NOTE(review): two sigmoid units + binary_crossentropy score the two one-hot columns as\n", + "# independent labels; softmax + categorical_crossentropy (or one sigmoid unit) is the usual\n", + "# pairing. Left unchanged so the recorded run stays comparable.\n", + "model.add(Dense(2, activation='sigmoid')) \n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HRNFpdoUN3Cw" + }, + "outputs": [], + "source": [ + "# Compile model\n", + "model.compile(loss='binary_crossentropy',\n", + " optimizer='adam',\n", + " metrics=['accuracy'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ogRmCK1UZhCC" + }, + "outputs": [], + "source": [ + "# Seeded 80/20 split so the held-out rows are the same on every run\n", + "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AJ083iTdOKNo", + "outputId": "eaf7b1cc-040a-4e3d-84eb-f8ad1e77ddf6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1/10\n", + "2000/2000 [==============================] - 699s 346ms/step - loss: 0.4805 - accuracy: 0.7679 - val_loss: 0.4439 - val_accuracy: 0.7926\n", + "Epoch 2/10\n", + "2000/2000 [==============================] - 702s 351ms/step - loss: 0.4287 - accuracy: 0.8005 - val_loss: 0.4355 - val_accuracy: 0.7964\n", + "Epoch 3/10\n", + "2000/2000 [==============================] - 690s 345ms/step - loss: 0.4076 - accuracy: 0.8133 - val_loss: 0.4422 - val_accuracy: 0.7967\n", + "Epoch 4/10\n", + "2000/2000 [==============================] - 684s 342ms/step - loss: 0.3895 - accuracy: 0.8233 - val_loss: 0.4425 - val_accuracy: 0.7979\n", + "Epoch 5/10\n", + "2000/2000 [==============================] - 677s 339ms/step - loss: 0.3737 - accuracy: 0.8324 - val_loss: 0.4458 - val_accuracy: 0.7996\n", + "Epoch 6/10\n", + "2000/2000 [==============================] - 674s 337ms/step - loss: 0.3560 - accuracy: 0.8405 - val_loss: 0.4561 - val_accuracy: 0.7929\n", + "Epoch 
7/10\n", + "2000/2000 [==============================] - 674s 337ms/step - loss: 0.3394 - accuracy: 0.8485 - val_loss: 0.4660 - val_accuracy: 0.7936\n", + "Epoch 8/10\n", + "2000/2000 [==============================] - 676s 338ms/step - loss: 0.3208 - accuracy: 0.8588 - val_loss: 0.4931 - val_accuracy: 0.7899\n", + "Epoch 9/10\n", + "2000/2000 [==============================] - 678s 339ms/step - loss: 0.3037 - accuracy: 0.8671 - val_loss: 0.5225 - val_accuracy: 0.7865\n", + "Epoch 10/10\n", + "2000/2000 [==============================] - 674s 337ms/step - loss: 0.2876 - accuracy: 0.8745 - val_loss: 0.5419 - val_accuracy: 0.7852\n" + ] + } + ], + "source": [ + "# Fit model\n", + "# NOTE(review): Xtest/ytest are passed as validation_data, so the val_* figures Keras prints\n", + "# each epoch are measured on the held-out split itself.\n", + "history = model.fit(Xtrain, ytrain, batch_size=batch_size, epochs=num_epochs, validation_data=(Xtest, ytest))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Dft2-xIgnjCI", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d0b90103-126f-473b-af21-0a3bfc00d373" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 78.52%\n" + ] + } + ], + "source": [ + "# Final evaluation of the model\n", + "# NOTE(review): this scores the same Xtest/ytest split that fit() received as validation_data,\n", + "# so the figure repeats the last epoch's val_accuracy rather than giving an independent estimate.\n", + "# scores[1] is printed as the accuracy; presumably scores[0] is the loss — confirm against the Keras evaluate() docs.\n", + "scores = model.evaluate(Xtest, ytest, verbose=0)\n", + "print(\"Accuracy: %.2f%%\" % (scores[1]*100))" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "gpuClass": "standard" + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file