diff --git a/SentimentAnalysis_RNN.ipynb b/SentimentAnalysis_RNN.ipynb
new file mode 100644
index 0000000..93e7f5a
--- /dev/null
+++ b/SentimentAnalysis_RNN.ipynb
@@ -0,0 +1,878 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "efyTtfc_oJkh"
+ },
+ "source": [
+ "## 1. Adding imports & installing necessary packages ##"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "GaVIBWlyoKz3",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "1fbc85d8-fc75-4df9-dd99-b9d2968fcf46"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/gdrive\n"
+ ]
+ }
+ ],
+ "source": [
+ "### Mount Google Drive as local storage when running on Google Colab;\n",
+ "### on a local machine fall back to the current working directory.\n",
+ "\n",
+ "import os\n",
+ "\n",
+ "try:\n",
+ "    from google.colab import drive\n",
+ "except ImportError:\n",
+ "    # Not on Colab: the CSV files are then expected under ./data/\n",
+ "    repo_path = './'\n",
+ "else:\n",
+ "    drive.mount('/content/gdrive')\n",
+ "    repo_path = '/content/gdrive/My Drive/colab/NLP-Bootcamp/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "sdBgdze84r8s"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import collections\n",
+ "%matplotlib inline\n",
+ "\n",
+ "# Import modules to calculate accuracy and confusion matrix\n",
+ "from sklearn.metrics import confusion_matrix, accuracy_score"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "YLttTMckfNa_"
+ },
+ "source": [
+ "## 2. Loading Data ##"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "w9EA4jMv4ywO",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "outputId": "4590a838-52eb-45f6-b056-bb8685147946"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " target ids user \\\n",
+ "0 p 1978186076 ceruleanbreeze \n",
+ "1 p 1994697891 enthusiasticjen \n",
+ "2 p 2191885992 LifeRemixed \n",
+ "3 p 1753662211 lovemandy \n",
+ "4 p 2177442789 _LOVELYmanu \n",
+ "\n",
+ " text \n",
+ "0 @nocturnalie Anyway, and now Abby and I share ... \n",
+ "1 @JoeGigantino Few times I'm trying to leave co... \n",
+ "2 @AngieGriffin Good Morning Angie I'll be in t... \n",
+ "3 had a good day driving up mountains, visiting ... \n",
+ "4 downloading some songs i love lady GaGa. "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " ids | \n",
+ " user | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " p | \n",
+ " 1978186076 | \n",
+ " ceruleanbreeze | \n",
+ " @nocturnalie Anyway, and now Abby and I share ... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " p | \n",
+ " 1994697891 | \n",
+ " enthusiasticjen | \n",
+ " @JoeGigantino Few times I'm trying to leave co... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " p | \n",
+ " 2191885992 | \n",
+ " LifeRemixed | \n",
+ " @AngieGriffin Good Morning Angie I'll be in t... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " p | \n",
+ " 1753662211 | \n",
+ " lovemandy | \n",
+ " had a good day driving up mountains, visiting ... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " p | \n",
+ " 2177442789 | \n",
+ " _LOVELYmanu | \n",
+ " downloading some songs i love lady GaGa. | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ],
+ "source": [
+ "### Paths of the train & test CSV files on Google Colab (relative to repo_path)\n",
+ "trainData, testData = (\n",
+ "    os.path.join(repo_path, relative_path)\n",
+ "    for relative_path in ('data/sentiment140_160k_tweets_train.csv',\n",
+ "                          'data/sentiment140_test.csv')\n",
+ ")\n",
+ "\n",
+ "### run below 3 lines of code for setting train & test data path on local machine\n",
+ "'''\n",
+ "DATA = './data/'\n",
+ "trainData = DATA + 'sentiment140_160k_tweets_train.csv'\n",
+ "testData = DATA + 'sentiment140_test.csv'\n",
+ "'''\n",
+ "\n",
+ "# Read both CSV files into DataFrames and preview the training data\n",
+ "train, test = pd.read_csv(trainData), pd.read_csv(testData)\n",
+ "\n",
+ "train.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "DE0NVFR9s4o4"
+ },
+ "source": [
+ "Looking at the distribution of *'positive'* & *'negative'* samples in the train dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "MF2-MSXFoJkr",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "1ba63640-3a77-48e5-f620-11b8260ada21"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Counter({'p': 80000, 'n': 79985})"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ],
+ "source": [
+ "# Number of rows per value of the 'target' column in the training data\n",
+ "collections.Counter(train['target'].tolist())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "vwyLXx_moJks",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 293
+ },
+ "outputId": "aceb2fa0-4236-46ab-a950-d6eb656c9cbb"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "execution_count": 5
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAEDCAYAAADX1GjKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAVBUlEQVR4nO3df6zd9X3f8ecrdggkKdiEO4vZZkaLlcphhcAduEtVrXg1Nu1qSw0I1s1XyMKbIFs7TducaZI1CFMiVWNlIkhW8bCjLg5ljeylpq7lUFXtZOLLjwKGIN9AiG0BvsXGJGFAnb73x/m4Pbk51/cYrs81+PmQjs7n+/58vt/zOdK1X/d8v59zv6kqJElntw/N9AQkSTPPMJAkGQaSJMNAkoRhIEnCMJAkAbNnegLv1kUXXVSLFi2a6WlI0vvGY4899pdVNdSr730bBosWLWJ0dHSmpyFJ7xtJXpqsz9NEkiTDQJJkGEiSMAwkSRgGkiT6DIMk/zbJviTPJPlaknOTXJrk0SRjSb6e5Jw29iNte6z1L+o6zhda/fkk13XVV7TaWJL10/0mJUknN2UYJJkP/BtguKouA2YBNwFfBu6uqk8CR4G1bZe1wNFWv7uNI8mStt+ngRXAV5LMSjILuBdYCSwBbm5jJUkD0u9potnAeUlmAx8FXgauBR5q/ZuB1a29qm3T+pclSatvraq3q+pFYAy4uj3GquqFqnoH2NrGSpIGZMovnVXVoSS/DXwf+H/AHwOPAa9X1fE27CAwv7XnAwfavseTHAM+0ep7ug7dvc+BCfVres0lyTpgHcAll1wy1dRn3KL1fzjTU/hA+d6XfmWmp/CB4s/n9Hq//3z2c5poLp3f1C8F/i7wMTqneQauqjZW1XBVDQ8N9fxGtSTpXejnNNE/AV6sqvGq+ivgD4DPAnPaaSOABcCh1j4ELARo/RcAr3XXJ+wzWV2SNCD9hMH3gaVJPtrO/S8DngUeAT7XxowA21p7e9um9X+rOjda3g7c1FYbXQosBr4N7AUWt9VJ59C5yLz9vb81SVK/+rlm8GiSh4DHgePAE8BG4A+BrUm+2Gr3t13uB76aZAw4Quc/d6pqX5IH6QTJceD2qvoxQJLPAzvprFTaVFX7pu8tSpKm0tdfLa2qDcCGCeUX6KwEmjj2LeCGSY5zF3BXj/oOYEc/c5EkTT+/gSxJMgwkSYaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSfQRBkk+leTJrscbSX4ryYVJdiXZ357ntvFJck+SsSRPJbmy61gjbfz+JCNd9auSPN32uafdXlOSNCBThkFVPV9VV1TVFcBVwJvAN4D1wO6qWgzsbtsAK+nc33gxsA64DyDJhXTulnYNnTukbTgRIG3MrV37rZiWdydJ6supniZaBny3ql4CVgGbW30zsLq1VwFbqmMPMCfJxcB1wK6qOlJVR4FdwIrWd35V7amqArZ0HUuSNACnGgY3AV9r7XlV9XJrvwLMa+35wIGufQ622snqB3vUJUkD0ncYJDkH+DXg9yf2td/oaxrnNdkc1iUZTTI6Pj5+ul9Oks4ap/LJYCXweFW92rZfbad4aM+HW/0QsLBrvwWtdrL6gh71n1JVG6tquKqGh4aGTmHqkqSTOZUwuJm/PUUEsB04sSJoBNjWVV/TVhUtBY6100k7geVJ5rYLx8uBna3vjSRL2yqiNV3HkiQNwOx+BiX5GPDLwL/sKn8JeDDJWuAl4MZW3wFcD4zRWXl0C0BVHUlyJ7C3jbujqo609m3AA8B5wMPtIUkakL7CoKp+BHxiQu01OquLJo4t4PZJjrMJ2NSjPgpc1s9cJEnTz28gS5IMA0mSYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEk
iT7DIMmcJA8l+U6S55L8fJILk+xKsr89z21jk+SeJGNJnkpyZddxRtr4/UlGuupXJXm67XNPuxeyJGlA+v1k8DvAH1XVzwKXA88B64HdVbUY2N22AVYCi9tjHXAfQJILgQ3ANcDVwIYTAdLG3Nq134r39rYkSadiyjBIcgHwi8D9AFX1TlW9DqwCNrdhm4HVrb0K2FIde4A5SS4GrgN2VdWRqjoK7AJWtL7zq2pPu3/ylq5jSZIGoJ9PBpcC48D/TPJEkt9N8jFgXlW93Ma8Asxr7fnAga79D7bayeoHe9QlSQPSTxjMBq4E7quqzwA/4m9PCQHQfqOv6Z/eT0qyLsloktHx8fHT/XKSdNboJwwOAger6tG2/RCdcHi1neKhPR9u/YeAhV37L2i1k9UX9Kj/lKraWFXDVTU8NDTUx9QlSf2YMgyq6hXgQJJPtdIy4FlgO3BiRdAIsK21twNr2qqipcCxdjppJ7A8ydx24Xg5sLP1vZFkaVtFtKbrWJKkAZjd57h/DfxeknOAF4Bb6ATJg0nWAi8BN7axO4DrgTHgzTaWqjqS5E5gbxt3R1Udae3bgAeA84CH20OSNCB9hUFVPQkM9+ha1mNsAbdPcpxNwKYe9VHgsn7mIkmafn4DWZJkGEiSDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSfQZBkm+l+TpJE8mGW21C5PsSrK/Pc9t9SS5J8lYkqeSXNl1nJE2fn+Ska76Ve34Y23fTPcblSRN7lQ+GfxSVV1RVSduf7ke2F1Vi4HdbRtgJbC4PdYB90EnPIANwDXA1cCGEwHSxtzatd+Kd/2OJEmn7L2cJloFbG7tzcDqrvqW6tgDzElyMXAdsKuqjlTVUWAXsKL1nV9Ve9r9k7d0HUuSNAD9hkEBf5zksSTrWm1eVb3c2q8A81p7PnCga9+DrXay+sEedUnSgMzuc9wvVNWhJH8H2JXkO92dVVVJavqn95NaEK0DuOSSS073y0nSWaOvTwZVdag9Hwa+Qeec/6vtFA/t+XAbfghY2LX7glY7WX1Bj3qveWysquGqGh4aGupn6pKkPkwZBkk+luRnTrSB5cAzwHbgxIqgEWBba28H1rRVRUuBY+100k5geZK57cLxcmBn63sjydK2imhN17EkSQPQz2miecA32mrP2cD/qqo/SrIXeDDJWuAl4MY2fgdwPTAGvAncAlBVR5LcCext4+6oqiOtfRvwAHAe8HB7SJIGZMowqKoXgMt71F8DlvWoF3D7JMfaBGzqUR8FLutjvpKk08BvIEuSDANJkmEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJIlTCIMks5I8keSbbfvSJI8mGUvy9STntPpH2vZY61/UdYwvtPrzSa7rqq9otbEk66fv7UmS+nEqnwx+E3iua/vLwN1V9UngKLC21dcCR1v97jaOJEuAm4BPAyuAr7SAmQXcC6wElgA3t7GSpAHpKwySLAB+Bfjdth3gWuChNmQzsLq1V7VtWv+yNn4VsLWq3q6qF4Ex4Or2GKuqF6rqHWBrGytJGpB+Pxn8d+A/AH/dtj8BvF5Vx9v2QWB+a88HDgC0/mNt/N/UJ+wzWV2SNCBThkGSXwUOV9VjA5jPVHNZl2Q0yej4+PhMT0eSPjD6+WTwWeDXknyPzimca4HfAeYkmd3GLAAOtfYhYCFA678AeK27PmGfyeo/pao2VtVwVQ0PDQ31MXVJUj+mDIOq+kJVLaiqRXQuAH+rqn4DeAT4XBs2Amxr7e1tm9b/raqqVr+prTa6FFgMfBvYCyxuq5POaa+xfVrenSSpL7OnHjKp/whsTfJF4Ang/la/H/hqkjHgCJ3/3KmqfUkeBJ4FjgO3V9WPAZJ8HtgJzAI2VdW+9zAvSdIpOqUwqKo/Af6ktV+gsxJo
4pi3gBsm2f8u4K4e9R3AjlOZiyRp+vgNZEmSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJNFHGCQ5N8m3k/xFkn1J/kurX5rk0SRjSb7e7l9Mu8fx11v90SSLuo71hVZ/Psl1XfUVrTaWZP30v01J0sn088ngbeDaqrocuAJYkWQp8GXg7qr6JHAUWNvGrwWOtvrdbRxJltC5H/KngRXAV5LMSjILuBdYCSwBbm5jJUkDMmUYVMcP2+aH26OAa4GHWn0zsLq1V7VtWv+yJGn1rVX1dlW9CIzRuYfy1cBYVb1QVe8AW9tYSdKA9HXNoP0G/yRwGNgFfBd4vaqOtyEHgfmtPR84AND6jwGf6K5P2GeyuiRpQPoKg6r6cVVdASyg85v8z57WWU0iyboko0lGx8fHZ2IKkvSBdEqriarqdeAR4OeBOUlmt64FwKHWPgQsBGj9FwCvddcn7DNZvdfrb6yq4aoaHhoaOpWpS5JOop/VRENJ5rT2ecAvA8/RCYXPtWEjwLbW3t62af3fqqpq9ZvaaqNLgcXAt4G9wOK2OukcOheZt0/Hm5Mk9Wf21EO4GNjcVv18CHiwqr6Z5Flga5IvAk8A97fx9wNfTTIGHKHznztVtS/Jg8CzwHHg9qr6MUCSzwM7gVnApqraN23vUJI0pSnDoKqeAj7To/4CnesHE+tvATdMcqy7gLt61HcAO/qYryTpNPAbyJIkw0CSZBhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgSaK/eyAvTPJIkmeT7Evym61+YZJdSfa357mtniT3JBlL8lSSK7uONdLG708y0lW/KsnTbZ97kuR0vFlJUm/9fDI4Dvy7qloCLAVuT7IEWA/srqrFwO62DbCSzs3uFwPrgPugEx7ABuAaOrfL3HAiQNqYW7v2W/He35okqV9ThkFVvVxVj7f2D4DngPnAKmBzG7YZWN3aq4At1bEHmJPkYuA6YFdVHamqo8AuYEXrO7+q9lRVAVu6jiVJGoBTumaQZBHwGeBRYF5Vvdy6XgHmtfZ84EDXbgdb7WT1gz3qkqQB6TsMknwc+N/Ab1XVG9197Tf6mua59ZrDuiSjSUbHx8dP98tJ0lmjrzBI8mE6QfB7VfUHrfxqO8VDez7c6oeAhV27L2i1k9UX9Kj/lKraWFXDVTU8NDTUz9QlSX3oZzVRgPuB56rqv3V1bQdOrAgaAbZ11de0VUVLgWPtdNJOYHmSue3C8XJgZ+t7I8nS9lpruo4lSRqA2X2M+SzwL4CnkzzZav8J+BLwYJK1wEvAja1vB3A9MAa8CdwCUFVHktwJ7G3j7qiqI619G/AAcB7wcHtIkgZkyjCoqj8DJlv3v6zH+AJun+RYm4BNPeqjwGVTzUWSdHr4DWRJkmEgSTIMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCTR3z2QNyU5nOSZrtqFSXYl2d+e57Z6ktyTZCzJU0mu7NpnpI3fn2Skq35VkqfbPve0+yBLkgaon08GDwArJtTWA7urajGwu20DrAQWt8c64D7ohAewAbgGuBrYcCJA2phbu/ab+FqSpNNsyjCoqj8FjkworwI2t/ZmYHVXfUt17AHmJLkYuA7YVVVHquoosAtY0frOr6o97d7JW7qOJUkakHd7zWBeVb3c2q8A81p7PnCga9zBVjtZ/WCPuiRpgN7zBeT2G31Nw1ymlGRdktEko+Pj44N4SUk6K7zbMHi1neKhPR9u9UPAwq5xC1rtZPUFPeo9VdXGqhququGhoaF3OXVJ0kTvNgy2AydWBI0A27rqa9qqoqXAsXY6aSewPMncduF4ObCz9b2RZGlbRbSm61iSpAGZPdWAJF8D/jFwUZKDdFYFfQl4
MMla4CXgxjZ8B3A9MAa8CdwCUFVHktwJ7G3j7qiqExelb6OzYuk84OH2kCQN0JRhUFU3T9K1rMfYAm6f5DibgE096qPAZVPNQ5J0+vgNZEmSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJHEGhUGSFUmeTzKWZP1Mz0eSziZnRBgkmQXcC6wElgA3J1kys7OSpLPHGREGwNXAWFW9UFXvAFuBVTM8J0k6a8ye6Qk084EDXdsHgWsmDkqyDljXNn+Y5PkBzO1scBHwlzM9iankyzM9A80Qfz6nz9+brONMCYO+VNVGYONMz+ODJsloVQ3P9DykXvz5HIwz5TTRIWBh1/aCVpMkDcCZEgZ7gcVJLk1yDnATsH2G5yRJZ40z4jRRVR1P8nlgJzAL2FRV+2Z4WmcTT73pTObP5wCkqmZ6DpKkGXamnCaSJM0gw0CSZBhIks6QC8gavCQfAX4dWETXz0FV3TFTc5JOSHIucBvwC0ABfwbcV1VvzejEPsAMg7PXNuAY8Bjw9gzPRZpoC/AD4H+07X8GfBW4YcZm9AHnaqKzVJJnquqymZ6H1EuSZ6tqyVQ1TR+vGZy9/m+SfzDTk5Am8XiSpSc2klwDjM7gfD7w/GRwlkryLPBJ4EU6p4kCVFX93IxOTAKSPAd8Cvh+K10CPA8cx5/T08IwOEsl6fnXC6vqpUHPRZposp/PE/w5nX6GgSTJawaSJMNAkoRhIPWUZE6S2wbwOqu937fOBIaB1NscOt+A7Us63s2/p9WAYaAZ5wVkqYckW4FVdJYzPgL8HDAX+DDwn6tqW5JFdO7B8ShwFXA9sAb458A4nft6P1ZVv53k7wP3AkPAm8CtwIXAN+l8E/wY8OtV9d0BvUXpJ/jnKKTe1gOXVdUVSWYDH62qN5JcBOxJcuJOfIuBkarak+Qf0vl7T5fTCY3H6fy5D+jcoOVfVdX+9gWqr1TVte0436yqhwb55qSJDANpagH+a5JfBP4amA/Ma30vVdWe1v4ssK39MbW3kvwfgCQfB/4R8PtJThzzI4OavNQPw0Ca2m/QOb1zVVX9VZLvAee2vh/1sf+HgNer6orTND/pPfMCstTbD4Cfae0LgMMtCH4JmOzbsX8O/NMk57ZPA78KUFVvAC8muQH+5mLz5T1eR5oxhoHUQ1W9Bvx5kmeAK4DhJE/TuUD8nUn22QtsB54CHgaepnNhGDqfLtYm+QtgH52L0wBbgX+f5Il2kVmaEa4mkqZRko9X1Q+TfBT4U2BdVT0+0/OSpuI1A2l6bWxfIjsX2GwQ6P3CTwaSJK8ZSJIMA0kShoEkCcNAkoRhIEnCMJAkAf8fRluB5QL2emMAAAAASUVORK5CYII=\n"
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ],
+ "source": [
+ "# Bar chart of the row count per 'target' value; the trailing ';' hides the repr of the last call\n",
+ "ax = train.groupby('target').size().plot(kind='bar')\n",
+ "ax.set(title='Tweets per sentiment class (train)', xlabel='target', ylabel='number of tweets');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "xyHV7gCCxCpO"
+ },
+ "source": [
+ "The counts show a well-balanced dataset: 80,000 positive vs. 79,985 negative tweets."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "5RK3QPUnbUFq"
+ },
+ "source": [
+ "## 3. Data (Text) Preprocessing ##"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "N_p8SQxQMKrq"
+ },
+ "outputs": [],
+ "source": [
+ "### mapping a dictionary of apostrophe words\n",
+ "\n",
+ "# Lower-case contraction / slang token -> expanded form.\n",
+ "# Every key appears exactly once: a repeated key in a dict literal silently\n",
+ "# keeps only the last value.\n",
+ "appos = {\n",
+ "\"aren't\" : \"are not\",\n",
+ "\"can't\" : \"cannot\",\n",
+ "\"cant\" : \"cannot\",\n",
+ "\"couldn't\" : \"could not\",\n",
+ "\"didn't\" : \"did not\",\n",
+ "\"doesn't\" : \"does not\",\n",
+ "\"don't\" : \"do not\",\n",
+ "\"hadn't\" : \"had not\",\n",
+ "\"hasn't\" : \"has not\",\n",
+ "\"haven't\" : \"have not\",\n",
+ "\"he'd\" : \"he would\",\n",
+ "\"he'll\" : \"he will\",\n",
+ "\"he's\" : \"he is\",\n",
+ "\"i'd\" : \"I would\",  # expanded with would, like the other 'd entries in this table\n",
+ "\"i'll\" : \"I will\",\n",
+ "\"i'm\" : \"I am\",\n",
+ "\"im\" : \"I am\",\n",
+ "\"isn't\" : \"is not\",\n",
+ "\"it's\" : \"it is\",\n",
+ "\"it'll\":\"it will\",\n",
+ "\"i've\" : \"I have\",\n",
+ "\"let's\" : \"let us\",\n",
+ "\"mightn't\" : \"might not\",\n",
+ "\"mustn't\" : \"must not\",\n",
+ "\"shan't\" : \"shall not\",\n",
+ "\"she'd\" : \"she would\",\n",
+ "\"she'll\" : \"she will\",\n",
+ "\"she's\" : \"she is\",\n",
+ "\"shouldn't\" : \"should not\",\n",
+ "\"that's\" : \"that is\",\n",
+ "\"there's\" : \"there is\",\n",
+ "\"they'd\" : \"they would\",\n",
+ "\"they'll\" : \"they will\",\n",
+ "\"they're\" : \"they are\",\n",
+ "\"they've\" : \"they have\",\n",
+ "\"we'd\" : \"we would\",\n",
+ "\"we're\" : \"we are\",\n",
+ "\"weren't\" : \"were not\",\n",
+ "\"we've\" : \"we have\",\n",
+ "\"what'll\" : \"what will\",\n",
+ "\"what're\" : \"what are\",\n",
+ "\"what's\" : \"what is\",\n",
+ "\"what've\" : \"what have\",\n",
+ "\"where's\" : \"where is\",\n",
+ "\"who'd\" : \"who would\",\n",
+ "\"who'll\" : \"who will\",\n",
+ "\"who're\" : \"who are\",\n",
+ "\"who's\" : \"who is\",\n",
+ "\"who've\" : \"who have\",\n",
+ "\"won't\" : \"will not\",\n",
+ "\"wouldn't\" : \"would not\",\n",
+ "\"you'd\" : \"you would\",\n",
+ "\"you'll\" : \"you will\",\n",
+ "\"you're\" : \"you are\",\n",
+ "\"you've\" : \"you have\",\n",
+ "\"'re\": \" are\",\n",
+ "\"wasn't\": \"was not\",\n",
+ "\"we'll\" : \"we will\",  # the pronoun was missing from the expansion\n",
+ "\"gg\" : \"going\"\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "7IbqS4m-4-EX",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "outputId": "38904185-6b96-49f0-a68c-05012f58de30"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " target ids user \\\n",
+ "0 p 1978186076 ceruleanbreeze \n",
+ "1 p 1994697891 enthusiasticjen \n",
+ "2 p 2191885992 LifeRemixed \n",
+ "3 p 1753662211 lovemandy \n",
+ "4 p 2177442789 _LOVELYmanu \n",
+ "\n",
+ " text \\\n",
+ "0 @nocturnalie Anyway, and now Abby and I share ... \n",
+ "1 @JoeGigantino Few times I'm trying to leave co... \n",
+ "2 @AngieGriffin Good Morning Angie I'll be in t... \n",
+ "3 had a good day driving up mountains, visiting ... \n",
+ "4 downloading some songs i love lady GaGa. \n",
+ "\n",
+ " ugc \n",
+ "0 anyway and now abby and i share all our crops ... \n",
+ "1 few times I am trying to leave comments in you... \n",
+ "2 good morning angie I will be in the atl july 8... \n",
+ "3 had a good day driving up mountains visiting k... \n",
+ "4 downloading some songs i love lady gaga "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " ids | \n",
+ " user | \n",
+ " text | \n",
+ " ugc | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " p | \n",
+ " 1978186076 | \n",
+ " ceruleanbreeze | \n",
+ " @nocturnalie Anyway, and now Abby and I share ... | \n",
+ " anyway and now abby and i share all our crops ... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " p | \n",
+ " 1994697891 | \n",
+ " enthusiasticjen | \n",
+ " @JoeGigantino Few times I'm trying to leave co... | \n",
+ " few times I am trying to leave comments in you... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " p | \n",
+ " 2191885992 | \n",
+ " LifeRemixed | \n",
+ " @AngieGriffin Good Morning Angie I'll be in t... | \n",
+ " good morning angie I will be in the atl july 8... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " p | \n",
+ " 1753662211 | \n",
+ " lovemandy | \n",
+ " had a good day driving up mountains, visiting ... | \n",
+ " had a good day driving up mountains visiting k... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " p | \n",
+ " 2177442789 | \n",
+ " _LOVELYmanu | \n",
+ " downloading some songs i love lady GaGa. | \n",
+ " downloading some songs i love lady gaga | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ],
+ "source": [
+ "import re\n",
+ "\n",
+ "def preprocess_text(sentence):\n",
+ "    \"\"\"Normalise the text of one tweet.\n",
+ "\n",
+ "    sentence: a row (anything indexable by 'text') holding the raw tweet.\n",
+ "\n",
+ "    Removes URLs and @mentions, lower-cases the text, expands the tokens\n",
+ "    listed in `appos`, removes HTML entities such as '&amp;', replaces every\n",
+ "    run of characters other than Latin/Cyrillic letters and digits with a\n",
+ "    single space, and returns the result without surrounding whitespace.\n",
+ "    \"\"\"\n",
+ "    text = re.sub(r'((www\\.[^\\s]+)|(https?://[^\\s]+))', '', sentence['text'])\n",
+ "    text = re.sub(r'@[^\\s]+', '', text)\n",
+ "    words = text.lower().split()\n",
+ "    # Tokens that are not keys of appos are kept unchanged\n",
+ "    text = \" \".join(appos.get(word, word) for word in words)\n",
+ "    text = re.sub(r'&[^\\s]+;', '', text)\n",
+ "    # 0-9 rather than 1-9, so the digit zero is kept (e.g. 2009 stays 2009)\n",
+ "    text = re.sub(r'[^a-zA-Zа-яА-Я0-9]+', ' ', text)\n",
+ "    text = re.sub(r' +', ' ', text)\n",
+ "    return text.strip()\n",
+ "\n",
+ "# Work on a copy so that adding the 'ugc' column does not modify the raw train frame\n",
+ "preprocess = train.copy()\n",
+ "preprocess['ugc'] = preprocess.apply(preprocess_text, axis=1)\n",
+ "\n",
+ "preprocess.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "CUsyZDEqK_4r"
+ },
+ "source": [
+ "## 4. Sentiment Analysis using Deep Learning-based Method: RNN ##"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ReMymperLMAh"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import classification_report\n",
+ "from keras.preprocessing.text import Tokenizer\n",
+ "from keras.preprocessing import sequence\n",
+ "\n",
+ "# Hyper-parameters used by the tokenizer, the model definition and model.fit\n",
+ "max_features, embedding_size, lstm_dim = 4000, 256, 256\n",
+ "batch_size, num_epochs = 64, 10\n",
+ "\n",
+ "# Fit a tokenizer on the cleaned tweets; num_words limits the vocabulary used for encoding\n",
+ "ugc_texts = preprocess['ugc'].values\n",
+ "tokenizer = Tokenizer(num_words=max_features, split=' ')\n",
+ "tokenizer.fit_on_texts(ugc_texts)\n",
+ "\n",
+ "# Integer-encode every tweet, then pad the sequences (no maxlen is given)\n",
+ "X = sequence.pad_sequences(tokenizer.texts_to_sequences(ugc_texts))\n",
+ "# One indicator column per distinct value of 'target'\n",
+ "y = pd.get_dummies(preprocess['target']).values\n",
+ "\n",
+ "# Size of the fitted word index, plus 1 because index 0 is reserved\n",
+ "vocab_size = 1 + len(tokenizer.word_index)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "fqZyya_Y6xhl",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "44c5cbd1-51bb-45c9-f107-45b513c341a1"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.\n",
+ "WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Model: \"sequential\"\n",
+ "_________________________________________________________________\n",
+ " Layer (type) Output Shape Param # \n",
+ "=================================================================\n",
+ " embedding (Embedding) (None, 38, 256) 1024000 \n",
+ " \n",
+ " spatial_dropout1d (SpatialD (None, 38, 256) 0 \n",
+ " ropout1D) \n",
+ " \n",
+ " lstm (LSTM) (None, 38, 256) 525312 \n",
+ " \n",
+ " lstm_1 (LSTM) (None, 128) 197120 \n",
+ " \n",
+ " dense (Dense) (None, 2) 258 \n",
+ " \n",
+ "=================================================================\n",
+ "Total params: 1,746,690\n",
+ "Trainable params: 1,746,690\n",
+ "Non-trainable params: 0\n",
+ "_________________________________________________________________\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Define model\n",
+ "from keras import Sequential\n",
+ "from keras.layers import Embedding, LSTM, Dropout, Dense, SpatialDropout1D\n",
+ "\n",
+ "# Embedding -> spatial dropout -> two stacked LSTM layers -> 2-way classifier\n",
+ "model = Sequential()\n",
+ "model.add(Embedding(input_dim=max_features,\n",
+ "                    output_dim=embedding_size,\n",
+ "                    input_length=X.shape[1]))\n",
+ "model.add(SpatialDropout1D(0.4))\n",
+ "# NOTE(review): the stored run logs that both LSTM layers will not use cuDNN kernels;\n",
+ "# presumably because recurrent_dropout is non-zero - confirm against the Keras LSTM docs\n",
+ "model.add(LSTM(units=lstm_dim,\n",
+ "               dropout=0.2,\n",
+ "               activation='tanh',\n",
+ "               recurrent_dropout=0.2,\n",
+ "               recurrent_activation='sigmoid',\n",
+ "               return_sequences=True))\n",
+ "model.add(LSTM(units=128,\n",
+ "               dropout=0.2,\n",
+ "               activation='tanh',\n",
+ "               recurrent_dropout=0.2,\n",
+ "               recurrent_activation='sigmoid'))\n",
+ "# The labels are one-hot rows over two mutually exclusive classes, so the output\n",
+ "# layer is a softmax: the two scores form one probability distribution\n",
+ "model.add(Dense(2, activation='softmax'))\n",
+ "\n",
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "HRNFpdoUN3Cw"
+ },
+ "outputs": [],
+ "source": [
+ "# Compile model\n",
+ "# metrics is a flat list: the model has a single output, so no per-output nesting is needed\n",
+ "model.compile(loss='binary_crossentropy',\n",
+ "              optimizer='adam',\n",
+ "              metrics=['accuracy'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ogRmCK1UZhCC"
+ },
+ "outputs": [],
+ "source": [
+ "# Shuffled 80/20 split; the fixed random_state makes the split identical on every run\n",
+ "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "AJ083iTdOKNo",
+ "outputId": "eaf7b1cc-040a-4e3d-84eb-f8ad1e77ddf6"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Epoch 1/10\n",
+ "2000/2000 [==============================] - 699s 346ms/step - loss: 0.4805 - accuracy: 0.7679 - val_loss: 0.4439 - val_accuracy: 0.7926\n",
+ "Epoch 2/10\n",
+ "2000/2000 [==============================] - 702s 351ms/step - loss: 0.4287 - accuracy: 0.8005 - val_loss: 0.4355 - val_accuracy: 0.7964\n",
+ "Epoch 3/10\n",
+ "2000/2000 [==============================] - 690s 345ms/step - loss: 0.4076 - accuracy: 0.8133 - val_loss: 0.4422 - val_accuracy: 0.7967\n",
+ "Epoch 4/10\n",
+ "2000/2000 [==============================] - 684s 342ms/step - loss: 0.3895 - accuracy: 0.8233 - val_loss: 0.4425 - val_accuracy: 0.7979\n",
+ "Epoch 5/10\n",
+ "2000/2000 [==============================] - 677s 339ms/step - loss: 0.3737 - accuracy: 0.8324 - val_loss: 0.4458 - val_accuracy: 0.7996\n",
+ "Epoch 6/10\n",
+ "2000/2000 [==============================] - 674s 337ms/step - loss: 0.3560 - accuracy: 0.8405 - val_loss: 0.4561 - val_accuracy: 0.7929\n",
+ "Epoch 7/10\n",
+ "2000/2000 [==============================] - 674s 337ms/step - loss: 0.3394 - accuracy: 0.8485 - val_loss: 0.4660 - val_accuracy: 0.7936\n",
+ "Epoch 8/10\n",
+ "2000/2000 [==============================] - 676s 338ms/step - loss: 0.3208 - accuracy: 0.8588 - val_loss: 0.4931 - val_accuracy: 0.7899\n",
+ "Epoch 9/10\n",
+ "2000/2000 [==============================] - 678s 339ms/step - loss: 0.3037 - accuracy: 0.8671 - val_loss: 0.5225 - val_accuracy: 0.7865\n",
+ "Epoch 10/10\n",
+ "2000/2000 [==============================] - 674s 337ms/step - loss: 0.2876 - accuracy: 0.8745 - val_loss: 0.5419 - val_accuracy: 0.7852\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Fit model. Training stops once val_loss has not improved for 2 epochs and the\n",
+ "# best weights are restored (in the stored run val_loss was lowest at epoch 2\n",
+ "# and rose through epoch 10 while the training loss kept falling).\n",
+ "from keras.callbacks import EarlyStopping\n",
+ "\n",
+ "early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)\n",
+ "history = model.fit(Xtrain, ytrain, batch_size=batch_size, epochs=num_epochs,\n",
+ "                    validation_data=(Xtest, ytest), callbacks=[early_stopping])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Dft2-xIgnjCI",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "d0b90103-126f-473b-af21-0a3bfc00d373"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Accuracy: 78.52%\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Evaluate the fitted model on the Xtest/ytest split; entry 1 of the result is printed as the accuracy\n",
+ "scores = model.evaluate(Xtest, ytest, verbose=0)\n",
+ "accuracy_percent = 100 * scores[1]\n",
+ "print(\"Accuracy: %.2f%%\" % accuracy_percent)"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "collapsed_sections": [],
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ },
+ "gpuClass": "standard"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file