From feea228b15c2fc031752e7e1f2ea2526407a294c Mon Sep 17 00:00:00 2001
From: Amdework Asefa <amdeamd7@gmaik.com>
Date: Wed, 10 Aug 2022 09:31:27 +0300
Subject: [PATCH 1/2] Amdework Commited bugfix

---
 clean_tweets_dataframe.py     |  88 +++++++++++++
 extract_dataframe.py          | 234 ++++++++++++++++++++++++++++++++++
 fix_clean_tweets_dataframe.py |  58 ---------
 fix_extract_dataframe.py      | 137 --------------------
 4 files changed, 322 insertions(+), 195 deletions(-)
 create mode 100644 clean_tweets_dataframe.py
 create mode 100644 extract_dataframe.py
 delete mode 100644 fix_clean_tweets_dataframe.py
 delete mode 100644 fix_extract_dataframe.py

diff --git a/clean_tweets_dataframe.py b/clean_tweets_dataframe.py
new file mode 100644
index 0000000..aaa6c8b
--- /dev/null
+++ b/clean_tweets_dataframe.py
@@ -0,0 +1,88 @@
+import pandas as pd
+import re
+
+class Clean_Tweets:
+    """
+    The PEP8 Standard AMAZING!!!
+    """
+
+    def __init__(self):
+        print('Automation in Action...!!!')
+
+    def add_clean_text(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        convert original_text values to clean_text values
+        """
+
+        df['clean_text'] = df['original_text'].apply(clean_text)
+        return df
+
+    def drop_nullValue_rows(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        drop rows that contain null values and reset the index
+        """
+
+        df.dropna(inplace=True)
+        df.reset_index(drop=True, inplace=True)
+
+        return df
+
+    def drop_unwanted_column(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        remove rows whose values repeat the column names. This error
+        originated from the data collection stage.
+        """
+        columns = ['created_at', 'source', 'original_text', 'clean_text', 'sentiment', 'polarity', 'subjectivity', 'lang', 'favorite_count', 'retweet_count',
+                   'original_author', 'screen_count', 'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']
+        unwanted_rows = []
+        for columnName in columns:
+            unwanted_rows += df[df[columnName] == columnName].index
+
+        df.drop(unwanted_rows, inplace=True)
+        df.reset_index(drop=True, inplace=True)
+        return df
+
+    def drop_duplicate(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        drop duplicate rows
+        """
+        df.drop_duplicates(inplace=True)
+        df.reset_index(drop=True, inplace=True)
+        return df
+
+    def convert_to_datetime(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        convert column to datetime
+        """
+        df['created_at'] = pd.to_datetime(df['created_at'])
+        return df
+
+    def convert_to_numbers(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        convert columns like polarity, subjectivity, retweet_count
+        favorite_count etc to numbers
+        """
+        df[['polarity', 'subjectivity', 'favorite_count', 'retweet_count', 'screen_count', 'followers_count', 'friends_count']] = df[[
+            'polarity', 'subjectivity', 'favorite_count', 'retweet_count', 'screen_count', 'followers_count', 'friends_count']].apply(pd.to_numeric)
+        return df
+
+    def remove_non_english_tweets(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        remove non english tweets from lang
+        """
+
+        index_names = df[df['lang'] != "en"].index
+
+        df.drop(index_names, inplace=True)
+        df.reset_index(drop=True, inplace=True)
+
+        return df
+
+
+def clean_text(original_text: str) -> str:
+    cleaned_text = re.sub('\n', '', original_text)
+    cleaned_text = re.findall(r'[a-zA-Z]+', cleaned_text)
+    cleaned_text = " ".join(cleaned_text)
+    cleaned_text = re.sub(r'http.*', "", cleaned_text)
+
+    return cleaned_text
\ No newline at end of file
diff --git a/extract_dataframe.py b/extract_dataframe.py
new file mode 100644
index 0000000..825b8ba
--- /dev/null
+++ b/extract_dataframe.py
@@ -0,0 +1,234 @@
+import json
+import pandas as pd
+from textblob import TextBlob
+
+
+def read_json(json_file: str) -> list:
+    """
+    json file reader to open and read json files into a list
+    Args:
+    -----
+    json_file: str - path of a json file
+    Returns
+    -------
+    number of json lines read and the list of parsed json objects
+    """
+
+    tweets_data = []
+    for tweets in open(json_file, 'r'):
+        tweets_data.append(json.loads(tweets))
+
+    return len(tweets_data), tweets_data
+
+
+class TweetDfExtractor:
+    """
+    this class parses a list of tweets json into a pandas dataframe
+    Return
+    ------
+    dataframe
+    """
+
+    def __init__(self, tweets_list):
+
+        self.tweets_list = tweets_list
+
+    # an example function
+    def find_statuses_count(self) -> list:
+        statuses_count = [x["user"]["statuses_count"]
+                          for x in self.tweets_list]
+
+        return statuses_count
+
+    def find_full_text(self) -> list:
+        full_text = []
+        for tweet in self.tweets_list:
+            try:
+                full_text.append(
+                    tweet["retweeted_status"]['extended_tweet']['full_text'])
+            except KeyError:
+                full_text.append("")
+
+        return full_text
+
+    def find_original_text(self) -> list:
+        original_text = [x['text'] for x in self.tweets_list]
+
+        return original_text
+
+    def find_sentiment(self, polarity, subjectivity) -> list:
+        sentiment = []
+        for i in range(len(polarity)):
+            if polarity[i] > 0:
+                sentiment.append(1)
+            elif polarity[i] < 0:
+                sentiment.append(0)
+            else:
+                sentiment.append(-1)
+
+        return sentiment
+
+    def find_sentiments(self, text) -> list:
+        polarityList = []
+        subjectivityList = []
+        for eachText in text:
+            polarity, subjectivity = TextBlob(eachText).sentiment
+            polarityList.append(polarity)
+            subjectivityList.append(subjectivity)
+
+        return polarityList, subjectivityList
+
+    def find_lang(self) -> list:
+        lang = [x['lang'] for x in self.tweets_list]
+
+        return lang
+
+    def find_created_time(self) -> list:
+        created_at = [x['created_at'] for x in self.tweets_list]
+
+        return created_at
+
+    def find_source(self) -> list:
+        source = [x['source'] for x in self.tweets_list]
+
+        return source
+
+    def find_screen_name(self) -> list:
+        screen_name = [x['user']['screen_name'] for x in self.tweets_list]
+
+        return screen_name
+
+    def find_screen_count(self) -> list:
+        screen_count = [x['user']['listed_count']
+                        for x in self.tweets_list]
+
+        return screen_count
+
+    def find_followers_count(self) -> list:
+        followers_count = [x['user']['followers_count']
+                           for x in self.tweets_list]
+
+        return followers_count
+
+    def find_friends_count(self) -> list:
+        friends_count = [x['user']['friends_count'] for x in self.tweets_list]
+
+        return friends_count
+
+    def is_sensitive(self) -> list:
+        is_sensitive = []
+        for tweet in self.tweets_list:
+            try:
+                value = tweet["retweeted_status"]['possibly_sensitive']
+                if(not value):
+                    is_sensitive.append(None)
+                else:
+                    is_sensitive.append(value)
+            except KeyError:
+                is_sensitive.append(None)
+
+        return is_sensitive
+
+    def find_favourite_count(self) -> list:
+        favourite_count = []
+        for tweet in self.tweets_list:
+            try:
+                favourite_count.append(
+                    tweet["retweeted_status"]['favorite_count'])
+            except KeyError:
+                favourite_count.append(0)
+
+        return favourite_count
+
+    def find_retweet_count(self) -> list:
+        retweet_count = []
+        for tweet in self.tweets_list:
+            try:
+                retweet_count.append(
+                    tweet["retweeted_status"]['retweet_count'])
+            except KeyError:
+                retweet_count.append(0)
+
+        return retweet_count
+
+    def find_hashtags(self) -> list:
+        hashtags = []
+        for tweet in self.tweets_list:
+            try:
+                hashtags.append(tweet['entities']['hashtags'][0]['text'])
+            except KeyError:
+                hashtags.append(None)
+            except IndexError:
+                hashtags.append(None)
+
+        return hashtags
+
+    def find_mentions(self) -> list:
+        mentions = []
+        main_mentions = [x['entities']['user_mentions']
+                         for x in self.tweets_list]
+        for mention in main_mentions:
+            for each in mention:
+                mentions.append(each['screen_name'])
+
+        return mentions
+
+    def find_place(self) -> list:
+        place = [x['place'] for x in self.tweets_list]
+
+        return place
+
+    def find_coordinates(self) -> list:
+        coordinates = [x['coordinates'] for x in self.tweets_list]
+
+        return coordinates
+
+    def find_location(self) -> list:
+        location = [x['user']['location'] for x in self.tweets_list]
+
+        return location
+
+    def get_tweet_df(self, save=False) -> pd.DataFrame:
+        """build a dataframe of the required columns from the tweets; save it as csv when save is True"""
+
+        columns = ['created_at', 'source', 'original_text', 'sentiment', 'polarity', 'subjectivity', 'lang', 'favorite_count', 'retweet_count',
+                   'original_author', 'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags', 'user_mentions', 'place']
+
+        created_at = self.find_created_time()
+        source = self.find_source()
+        original_text = self.find_original_text()
+        clean_text = self.find_full_text()
+        polarity, subjectivity = self.find_sentiments(clean_text)
+        sentiment = self.find_sentiment(polarity, subjectivity)
+        lang = self.find_lang()
+        favorite_count = self.find_favourite_count()
+        retweet_count = self.find_retweet_count()
+        original_author = self.find_screen_name()
+        screen_count = self.find_screen_count()
+        followers_count = self.find_followers_count()
+        friends_count = self.find_friends_count()
+        possibly_sensitive = self.is_sensitive()
+        hashtags = self.find_hashtags()
+        user_mentions = self.find_mentions()
+        place = self.find_location()
+        place_coord_boundaries = self.find_coordinates()
+        data = zip(created_at, source, original_text, sentiment, polarity, subjectivity, lang, favorite_count, retweet_count,
+                   original_author, followers_count, friends_count, possibly_sensitive, hashtags, user_mentions, place)
+        df = pd.DataFrame(data=data, columns=columns)
+
+        if save:
+            df.to_csv('./data/processed_tweet_data.csv', index=False)
+            print('File Successfully Saved.!!!')
+
+        return df
+
+
+if __name__ == "__main__":
+    # required column to be generated you should be creative and add more features
+    columns = ['created_at', 'source', 'original_text', 'clean_text', 'sentiment', 'polarity', 'subjectivity', 'lang', 'favorite_count', 'retweet_count',
+               'original_author', 'screen_count', 'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']
+    _, tweet_list = read_json("./data/covid19.json")
+    tweet = TweetDfExtractor(tweet_list)
+    tweet_df = tweet.get_tweet_df(save=True)
+
+    # use all defined functions to generate a dataframe with the specified columns above
\ No newline at end of file
diff --git a/fix_clean_tweets_dataframe.py b/fix_clean_tweets_dataframe.py
deleted file mode 100644
index 7b45a35..0000000
--- a/fix_clean_tweets_dataframe.py
+++ /dev/null
@@ -1,58 +0,0 @@
-class Clean_Tweets:
-    """
-    The PEP8 Standard AMAZING!!!
-    """
-    def __init__(self, df:pd.DataFrame):
-        self.df = df
-        print('Automation in Action...!!!')
-        
-    def drop_unwanted_column(self, df:pd.DataFrame)->pd.DataFrame:
-        """
-        remove rows that has column names. This error originated from
-        the data collection stage.  
-        """
-        unwanted_rows = df[df['retweet_count'] == 'retweet_count' ].index
-        df.drop(unwanted_rows , inplace=True)
-        df = df[df['polarity'] != 'polarity']
-        
-        return df
-    def drop_duplicate(self, df:pd.DataFrame)->pd.DataFrame:
-        """
-        drop duplicate rows
-        """
-        
-        ---
-        
-        return df
-    def convert_to_datetime(self, df:pd.DataFrame)->pd.DataFrame:
-        """
-        convert column to datetime
-        """
-        ----
-        
-        ----
-        
-        df = df[df['created_at'] >= '2020-12-31' ]
-        
-        return df
-    
-    def convert_to_numbers(self, df:pd.DataFrame)->pd.DataFrame:
-        """
-        convert columns like polarity, subjectivity, retweet_count
-        favorite_count etc to numbers
-        """
-        df['polarity'] = pd.----
-        
-        ----
-        ----
-        
-        return df
-    
-    def remove_non_english_tweets(self, df:pd.DataFrame)->pd.DataFrame:
-        """
-        remove non english tweets from lang
-        """
-        
-        df = ----
-        
-        return df
\ No newline at end of file
diff --git a/fix_extract_dataframe.py b/fix_extract_dataframe.py
deleted file mode 100644
index 3bd792d..0000000
--- a/fix_extract_dataframe.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import json
-import pandas as pd
-from textblob import TextBlob
-
-
-def read_json(json_file: str)->list:
-    """
-    json file reader to open and read json files into a list
-    Args:
-    -----
-    json_file: str - path of a json file
-    
-    Returns
-    -------
-    length of the json file and a list of json
-    """
-    
-    tweets_data = []
-    for tweets in open(json_file,'r'):
-        tweets_data.append(json.loads(tweets))
-    
-    
-    return len(tweets_data), tweets_data
-
-class TweetDfExtractor:
-    """
-    this function will parse tweets json into a pandas dataframe
-    
-    Return
-    ------
-    dataframe
-    """
-    def __init__(self, tweets_list):
-        
-        self.tweets_list = tweets_list
-
-    # an example function
-    def find_statuses_count(self)->list:
-        statuses_count 
-        
-    def find_full_text(self)->list:
-        text = 
-       
-    
-    def find_sentiments(self, text)->list:
-        
-        return polarity, self.subjectivity
-
-    def find_created_time(self)->list:
-       
-        return created_at
-
-    def find_source(self)->list:
-        source = 
-
-        return source
-
-    def find_screen_name(self)->list:
-        screen_name = 
-
-    def find_followers_count(self)->list:
-        followers_count = 
-
-    def find_friends_count(self)->list:
-        friends_count = 
-
-    def is_sensitive(self)->list:
-        try:
-            is_sensitive = [x['possibly_sensitive'] for x in self.tweets_list]
-        except KeyError:
-            is_sensitive = None
-
-        return is_sensitive
-
-    def find_favourite_count(self)->list:
-        
-    
-    def find_retweet_count(self)->list:
-        retweet_count = 
-
-    def find_hashtags(self)->list:
-        hashtags =
-
-    def find_mentions(self)->list:
-        mentions = 
-
-
-    def find_location(self)->list:
-        try:
-            location = self.tweets_list['user']['location']
-        except TypeError:
-            location = ''
-        
-        return location
-
-    
-        
-        
-    def get_tweet_df(self, save=False)->pd.DataFrame:
-        """required column to be generated you should be creative and add more features"""
-        
-        columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
-            'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place']
-        
-        created_at = self.find_created_time()
-        source = self.find_source()
-        text = self.find_full_text()
-        polarity, subjectivity = self.find_sentiments(text)
-        lang = self.find_lang()
-        fav_count = self.find_favourite_count()
-        retweet_count = self.find_retweet_count()
-        screen_name = self.find_screen_name()
-        follower_count = self.find_followers_count()
-        friends_count = self.find_friends_count()
-        sensitivity = self.is_sensitive()
-        hashtags = self.find_hashtags()
-        mentions = self.find_mentions()
-        location = self.find_location()
-        data = zip(created_at, source, text, polarity, subjectivity, lang, fav_count, retweet_count, screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location)
-        df = pd.DataFrame(data=data, columns=columns)
-
-        if save:
-            df.to_csv('processed_tweet_data.csv', index=False)
-            print('File Successfully Saved.!!!')
-        
-        return df
-
-                
-if __name__ == "__main__":
-    # required column to be generated you should be creative and add more features
-    columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
-    'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']
-    _, tweet_list = read_json("../covid19.json")
-    tweet = TweetDfExtractor(tweet_list)
-    tweet_df = tweet.get_tweet_df() 
-
-    # use all defined functions to generate a dataframe with the specified columns above
\ No newline at end of file

From 66d90fbe5cfd08c0f48c745fbb20b53f4df7574f Mon Sep 17 00:00:00 2001
From: Amdework Asefa <56429095+Amdework21@users.noreply.github.com>
Date: Wed, 10 Aug 2022 13:05:58 +0300
Subject: [PATCH 2/2] Created using Colaboratory

---
 Done_Challenge__Day2.ipynb | 628 +++++++++++++++++++++++++++++++++++++
 1 file changed, 628 insertions(+)
 create mode 100644 Done_Challenge__Day2.ipynb

diff --git a/Done_Challenge__Day2.ipynb b/Done_Challenge__Day2.ipynb
new file mode 100644
index 0000000..5c0e597
--- /dev/null
+++ b/Done_Challenge__Day2.ipynb
@@ -0,0 +1,628 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Done_Challenge_ Day2.ipynb",
+      "provenance": [],
+      "collapsed_sections": [],
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/Amdework21/Twitter-Data-Analysis21/blob/main/Done_Challenge__Day2.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Done Project: Data Mining Project for X company"
+      ],
+      "metadata": {
+        "id": "zroHHWfG7V2M"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "zDwep1K8Erxl"
+      },
+      "source": [
+        "**Project:** Data Mining Project for X company"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "JzIu-UWIDXHw"
+      },
+      "source": [
+        ""
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "d7-ii3uyI8KY"
+      },
+      "source": [
+        "The CRISP-DM Framework\n",
+        "\n",
+        "\n",
+        "The CRISP-DM methodology provides a structured approach to planning a data mining project. It is a robust and well-proven methodology.\n",
+        "* Business understanding (BU): Determine Business Objectives, Assess Situation, Determine Data Mining Goals, Produce Project Plan\n",
+        "\n",
+        "* Data understanding (DU): Collect Initial Data, Describe Data, Explore Data, Verify Data Quality\n",
+        "\n",
+        "* Data preparation (DP): Select Data, Clean Data, Construct Data, Integrate Data\n",
+        "\n",
+        "* Modeling (M): Select modeling technique, Generate Test Design, Build Model, Assess Model\n",
+        "*  Evaluation (E): Evaluate Results, Review Process, Determine Next Steps\n",
+        "*  Deployment (D): Plan Deployment, Plan Monitoring and Maintenance, Produce Final Report, Review Project\n",
+        "\n",
+        "\n",
+        "References:\n",
+        "\n",
+        "[What is the CRISP-DM methodology?](https://www.sv-europe.com/crisp-dm-methodology/)\n",
+        "\n",
+        "[Introduction to CRISP DM Framework for Data Science and Machine Learning](https://www.linkedin.com/pulse/chapter-1-introduction-crisp-dm-framework-data-science-anshul-roy/)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "5lo7Ml7tMQOf"
+      },
+      "source": [
+        "**Data Set**\n",
+        "### The data is for company X which is trying to control attrition. \n",
+        "### There are two sets of data: \"Existing employees\" and \"Employees who have left\". The following attributes are available for every employee.\n",
+        "\n",
+        "\n",
+        "*   Satisfaction Level\n",
+        "\n",
+        "*   Last evaluation\n",
+        "\n",
+        "*   Number of projects\n",
+        "\n",
+        "*   Average monthly hours\n",
+        "\n",
+        "*   Time spent at the company\n",
+        "*   Whether they have had a work accident\n",
+        "\n",
+        "\n",
+        "*  Whether they have had a promotion in the last 5 years\n",
+        "\n",
+        "\n",
+        "*   Departments (column sales)\n",
+        "\n",
+        "\n",
+        "*   Salary\n",
+        "\n",
+        "\n",
+        "*  Whether the employee has left\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "sjSj2A2sSph_"
+      },
+      "source": [
+        "**Your Role**\n",
+        " \n",
+        "\n",
+        "*   As a data science team member, X company has asked you to answer these two questions.\n",
+        "*  What type of employees is leaving? \n",
+        "\n",
+        "*   Determine which employees are prone to leave next.\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ajdEVA7LiBUp"
+      },
+      "source": [
+        "Business Understanding\n",
+        "\n",
+        "---\n",
+        "\n",
+        "This step mostly focuses on understanding the Business in all the different aspects. It follows the below different steps.\n",
+        "\n",
+        "\n",
+        "\n",
+        "\n",
+        "* Identify the goal and frame the business problem.\n",
+        "* Prepare Analytical Goal i.e. what type of performance metric and loss function to use\n",
+        "* Gather information on resource, constraints, assumptions, risks etc\n",
+        "* Gather information on resource, constraints, assumptions, risks etc\n",
+        "*   Prepare Work Flow Chart"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "J4MwiCYzj2_u"
+      },
+      "source": [
+        "### Write the main objectives of this project in your words?\n",
+        "minimum of 100 characters"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "STyLda45j1Mf"
+      },
+      "source": [
+        "main_objectives ='''This project aims to allow me to understand about Data mining methodologies such as CRISP in general, \n",
+        "particularly business and data understanding. we have two classes, namely: \"Existing employees\" and \"Employees who have left\"\n",
+        "It could be identified based on the value of each given questions as the model will train from it. \n",
+        "'''"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "CuOlxLxKMOLI"
+      },
+      "source": [
+        "assert len(main_objectives) > 100 \n",
+        "### BEGIN HIDDEN TESTS\n",
+        "assert len(main_objectives) > 80 \n",
+        "### END HIDDEN TESTS"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "NyXeNxlCkbaw"
+      },
+      "source": [
+        "### Outline the different data analysis steps you will follow to carry out the project"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "rC-tl8sUksQq"
+      },
+      "source": [
+        "dm_outline = '''According to Will Hillier, we have 7 data analysis steps [1] (https://careerfoundry.com/en/blog/data-analytics/the-data-analysis-process-step-by-step/#step-four-analyzing-the-data) \n",
+        "1. Defining the question: we have already given two questions 'What type of employees is leaving?' and 'which employees are prone to leave next?'\n",
+        "2. Collecting the data: we need a massive amount of data in order to train the model well. ML requires large amount of data.\n",
+        "3. Cleaning the data: uncleaned data leads to wrong prediction, hence cleaning data is mandatory.\n",
+        "4. Analyzing the data: This one is the main step. after we cleaned the data, analyzing or using it for training is the next level with such as predictive analysis\n",
+        "5. Sharing your results: We have found something (insights) in analysis level, the next is sharing it to the x organization\n",
+        "6. Embracing failure: failure is the sign of working something harder to work, hence accepting failure and hone your ability to spot and rectify errors is the main thing. \n",
+        "7. Summary: the final step, is to summarize what we have done.\n",
+        "'''"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "-K1mWuDoksTk"
+      },
+      "source": [
+        "assert len(dm_outline) > 100 \n",
+        "### BEGIN HIDDEN TESTS\n",
+        "assert len(dm_outline) > 70 \n",
+        "### END HIDDEN TESTS"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "pmUDFG1wkzUy"
+      },
+      "source": [
+        "I will use the Accuracy metric to measure the performance of this data analysis model\n",
+        "# accuracy  =  **$\\frac{\\text{correct predictions}}{\\text{all predictions}}$**"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "KCNulojKk_BP"
+      },
+      "source": [
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "vLS2YHoRk_EK"
+      },
+      "source": [
+        "Why do you choose these metrics? minimum of 100 characters"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "LSynT14KlPSJ"
+      },
+      "source": [
+        "why_metrics = '''we are developing a model to predict whether the employees of the x organization will leave or not based on the data collected data from it. \n",
+        "Hence, we want to build a more accurate model that can be able to outcomes result in better decisions. There might be a cost of errors, but optimizing model accuracy mitigates that cost. \n",
+        "There are many optimization algorithms to handle such losses of a model. The benefits of improving model accuracy help avoid considerable time, money, and undue stress.\n",
+        "'''"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "yr-Mk0E8lPVJ"
+      },
+      "source": [
+        "assert len(why_metrics) > 100 \n",
+        "### BEGIN HIDDEN TESTS\n",
+        "assert len(why_metrics) > 80 \n",
+        "### END HIDDEN TESTS"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "aAo19Ip6lUtm"
+      },
+      "source": [
+        "### How would you know if your data analysis work is a success or not?\n",
+        "minimum of 100 characters"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "HESsiXW5llX-"
+      },
+      "source": [
+        "how_success = '''After we have analyzed the data (or experiment the model) we'll demonstrate the result to the organization. \n",
+        "What is next is taking their response and feedback by applying a usability testing and quality testing measurements. \n",
+        "'''"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "FdUoiMIOlmXq",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "cf546265-bd20-46dd-e5fe-273e53eef495"
+      },
+      "source": [
+        "assert len(how_success) > 100 \n",
+        "### BEGIN HIDDEN TESTS\n",
+        "print(len(how_success))\n",
+        "assert len(how_success) > 80 \n",
+        "### END HIDDEN TESTS"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "227\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "DQE6dqo6l1TZ"
+      },
+      "source": [
+        "## What kind of challenges do you expect in your analysis?\n",
+        "List at least 3 challenges"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "WrAhBQhQl8Lh"
+      },
+      "source": [
+        "challenge_text = '''The most challenge will be related to the data collection process. however, we could also face other challenges. \n",
+        "Here are some challenges what I expect during data analysis.\n",
+        "1. Collecting meaningful data: Identifying and collecting which data is vital for the organization/business is one \n",
+        "2. Selecting the right tool: Since the nature of data may vary as per the area we are going to work, selecting the right tool for the collected data may also a challenge.\n",
+        "3. Consolidate data from multiple sources: data can be collected form different sources; hence structure of these data will be different, putting these data together and using it is another challenge \n",
+        "'''\n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "EedHa-Pll8X7",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "1b8df4ac-173a-42aa-d0f7-df15a5844714"
+      },
+      "source": [
+        "assert len(challenge_text) > 100 \n",
+        "### BEGIN HIDDEN TESTS\n",
+        "print(len(challenge_text))\n",
+        "assert len(challenge_text) > 80 \n",
+        "### END HIDDEN TESTS"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "663\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ZcJ8M6uWDeSE"
+      },
+      "source": [
+        "<h2>Using the processed Twitter data from yesterday's challenge</h2>\n",
+        "\n",
+        "\n",
+        "- Form a new data frame (named `cleanTweet`), containing columns $\\textbf{clean-text}$ and $\\textbf{polarity}$.\n",
+        "\n",
+        "- Write a function `text_category` that takes a value `p` and returns, depending on the value of p, a string `'positive'`, `'negative'` or `'neutral'`.\n",
+        "\n",
+        "- Apply this function (`text_category`) to the $\\textbf{polarity}$ column of the `cleanTweet` data frame formed above to create a new column called $\\textbf{score}$ in `cleanTweet`.\n",
+        "\n",
+        "- Visualize the $\\textbf{score}$ column using a pie chart and a bar chart.\n",
+        "\n",
+        "<h5>Now I want to build a classification model on the clean tweet following the steps below:</h5>\n",
+        "\n",
+        "* Remove rows from `cleanTweet` where $\\textbf{polarity}$ $= 0$ (i.e. where $\\textbf{score}$ is `'neutral'`) and reset the frame index.\n",
+        "* Construct a column $\\textbf{scoremap}$ by applying the mapping {'positive':1, 'negative':0} to the $\\textbf{score}$ column.\n",
+        "* Create feature and target variables `(X,y)` from $\\textbf{clean-text}$ and $\\textbf{scoremap}$ columns respectively.\n",
+        "* Use `train_test_split` function to construct `(X_train, y_train)` and `(X_test, y_test)` from `(X,y)`\n",
+        "\n",
+        "* Build an `SGDClassifier` model from the vectorized training text data. Use `CountVectorizer()` with a $\\textit{trigram}$ parameter.\n",
+        "\n",
+        "* Evaluate your model on the test data.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#install tweepy if not installed\n",
+        "#!pip uninstall tweepy\n",
+        "#!pip install git+https://github.com/tweepy/tweepy.git"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "H9QudDFcB1S2",
+        "outputId": "39983ce2-28e1-4d39-c569-1b197c976c5e"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Found existing installation: tweepy 4.10.0\n",
+            "Uninstalling tweepy-4.10.0:\n",
+            "  Would remove:\n",
+            "    /usr/local/lib/python3.7/dist-packages/tweepy-4.10.0.dist-info/*\n",
+            "    /usr/local/lib/python3.7/dist-packages/tweepy/*\n",
+            "Proceed (y/n)? y\n",
+            "  Successfully uninstalled tweepy-4.10.0\n",
+            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+            "Collecting git+https://github.com/tweepy/tweepy.git\n",
+            "  Cloning https://github.com/tweepy/tweepy.git to /tmp/pip-req-build-ilv676b3\n",
+            "  Running command git clone -q https://github.com/tweepy/tweepy.git /tmp/pip-req-build-ilv676b3\n",
+            "Requirement already satisfied: oauthlib<4,>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from tweepy==4.10.0) (3.2.0)\n",
+            "Requirement already satisfied: requests<3,>=2.27.0 in /usr/local/lib/python3.7/dist-packages (from tweepy==4.10.0) (2.28.1)\n",
+            "Requirement already satisfied: requests-oauthlib<2,>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from tweepy==4.10.0) (1.3.1)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (2022.6.15)\n",
+            "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (1.24.3)\n",
+            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (2.10)\n",
+            "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (2.1.0)\n",
+            "Building wheels for collected packages: tweepy\n",
+            "  Building wheel for tweepy (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "  Created wheel for tweepy: filename=tweepy-4.10.0-py3-none-any.whl size=94559 sha256=960c825fe40e9aa906c08568d99e2a97c0c100c3f794f8b777a495e3cbe3c83f\n",
+            "  Stored in directory: /tmp/pip-ephem-wheel-cache-8f1vqvt6/wheels/b4/a5/5a/5074abdb9f4bd5bd0e22631a63fc41ae2fa71ad83780ea18d1\n",
+            "Successfully built tweepy\n",
+            "Installing collected packages: tweepy\n",
+            "Successfully installed tweepy-4.10.0\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "85WxmGNGDcBY"
+      },
+      "source": [
+        "%load_ext autoreload\n",
+        "%autoreload 2\n",
+        "%reload_ext autoreload"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Imports\n",
+        "import tweepy\n",
+        "import json\n",
+        "import time"
+      ],
+      "metadata": {
+        "id": "d-5-tkS4_oLE"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Config file variables\n",
+        "consumer_key = '<finance>'\n",
+        "consumer_secret = '<balance>'\n",
+        "access_token = '<result>'\n",
+        "access_token_secret = '<answer>'\n"
+      ],
+      "metadata": {
+        "id": "iSz4D89j_UE3"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Authenticate into Tweepy\n",
+        "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
+        "auth.set_access_token(access_token, access_token_secret)\n",
+        "api = tweepy.API(auth, wait_on_rate_limit=True)\n"
+      ],
+      "metadata": {
+        "id": "yuKFY_G__UHz"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "keywords = \"#blockchain OR #cryptocurrency OR #financialmarket OR #bitcoin OR #ethereum\"\n",
+        "# keywords_with_geocode = \"#blockchain OR #cryptocurrency OR #financialmarket OR #bitcoin OR #ethereum geocode:6.611,20.934,240km\"\n",
+        "limit = 50"
+      ],
+      "metadata": {
+        "id": "40YuqSGS_UKb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# save tweets to json file\n",
+        "c = tweepy.Cursor(\n",
+        "    api.search_tweets,\n",
+        "    q=keywords,\n",
+        "    tweet_mode=\"extended\",\n",
+        "    include_entities=True,\n",
+        ").items(limit)"
+      ],
+      "metadata": {
+        "id": "9qFjY8hN_UMz"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "while True:\n",
+        "    try:\n",
+        "        tweet = c.next()\n",
+        "        with open(\"../data/\" + \"web3.json\", \"a\", encoding=\"utf-8\") as f:\n",
+        "            # for tweet in tweets:\n",
+        "            data = tweet._json\n",
+        "            f.write(json.dumps(data))\n",
+        "            f.write(\"\\n\")\n",
+        "    except tweepy.TooManyRequests:\n",
+        "        print(\"Limit Reached. Sleeping for 15 minutes\")\n",
+        "        time.sleep(60 * 15)\n",
+        "        continue\n",
+        "    except StopIteration:\n",
+        "        break"
+      ],
+      "metadata": {
+        "id": "Iul5TTYw_UPn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "working on"
+      ],
+      "metadata": {
+        "id": "upr7mlvDFG58"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        ""
+      ],
+      "metadata": {
+        "id": "mJ64ezpVlxAT"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file