From feea228b15c2fc031752e7e1f2ea2526407a294c Mon Sep 17 00:00:00 2001 From: Amdework Asefa Date: Wed, 10 Aug 2022 09:31:27 +0300 Subject: [PATCH 1/2] Amdework Commited bugfix --- clean_tweets_dataframe.py | 88 +++++++++++++ extract_dataframe.py | 234 ++++++++++++++++++++++++++++++++++ fix_clean_tweets_dataframe.py | 58 --------- fix_extract_dataframe.py | 137 -------------------- 4 files changed, 322 insertions(+), 195 deletions(-) create mode 100644 clean_tweets_dataframe.py create mode 100644 extract_dataframe.py delete mode 100644 fix_clean_tweets_dataframe.py delete mode 100644 fix_extract_dataframe.py diff --git a/clean_tweets_dataframe.py b/clean_tweets_dataframe.py new file mode 100644 index 0000000..aaa6c8b --- /dev/null +++ b/clean_tweets_dataframe.py @@ -0,0 +1,88 @@ +import pandas as pd +import re + +class Clean_Tweets: + """ + The PEP8 Standard AMAZING!!! + """ + + def __init__(self): + print('Automation in Action...!!!') + + def add_clean_text(self, df: pd.DataFrame) -> pd.DataFrame: + """ + convert original_text values to clean_text values + """ + + df['clean_text'] = df['original_text'].apply(clean_text) + return df + + def drop_nullValue_rows(self, df: pd.DataFrame) -> pd.DataFrame: + """ + convert original_text values to clean_text values + """ + + df.dropna(inplace=True) + df.reset_index(drop=True, inplace=True) + + return df + + def drop_unwanted_column(self, df: pd.DataFrame) -> pd.DataFrame: + """ + remove rows that has column names. This error originated from + the data collection stage. + """ + columns = ['created_at', 'source', 'original_text', 'clean_text', 'sentiment', 'polarity', 'subjectivity', 'lang', 'favorite_count', 'retweet_count', + 'original_author', 'screen_count', 'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries'] + unwanted_rows = [] + for columnName in columns: + unwanted_rows += df[df[columnName] == columnName].index + + df.drop(unwanted_rows, inplace=True) + df.reset_index(drop=True, inplace=True) + return df + + def drop_duplicate(self, df: pd.DataFrame) -> pd.DataFrame: + """ + drop duplicate rows + """ + df.drop_duplicates(inplace=True) + df.reset_index(drop=True, inplace=True) + return df + + def convert_to_datetime(self, df: pd.DataFrame) -> pd.DataFrame: + """ + convert column to datetime + """ + df['created_at'] = pd.to_datetime(df['created_at']) + return df + + def convert_to_numbers(self, df: pd.DataFrame) -> pd.DataFrame: + """ + convert columns like polarity, subjectivity, retweet_count + favorite_count etc to numbers + """ + df[['polarity', 'subjectivity', 'favorite_count', 'retweet_count', 'screen_count', 'followers_count', 'friends_count']] = df[[ + 'polarity', 'subjectivity', 'favorite_count', 'retweet_count', 'screen_count', 'followers_count', 'friends_count']].apply(pd.to_numeric) + return df + + def remove_non_english_tweets(self, df: pd.DataFrame) -> pd.DataFrame: + """ + remove non english tweets from lang + """ + + index_names = df[df['lang'] != "en"].index + + df.drop(index_names, inplace=True) + df.reset_index(drop=True, inplace=True) + + return df + + +def clean_text(original_text: str) -> str: + cleaned_text = re.sub('\n', '', original_text) + cleaned_text = re.findall(r'[a-zA-Z]+', cleaned_text) + cleaned_text = " ".join(cleaned_text) + cleaned_text = re.sub(r'http.*', "", cleaned_text) + + return cleaned_text \ No newline at end of file diff --git a/extract_dataframe.py b/extract_dataframe.py new file mode 100644 index 0000000..825b8ba --- /dev/null +++ b/extract_dataframe.py @@ -0,0 +1,234 @@ +import json +import pandas as pd +from textblob import TextBlob + + +def read_json(json_file: str) -> list: + """ + json file reader to open and read json files into a list + Args: + ----- + json_file: str - path of a json file + Returns + ------- + length of the json file and a list of json + """ + + tweets_data = [] + for tweets in open(json_file, 'r'): + tweets_data.append(json.loads(tweets)) + + return len(tweets_data), tweets_data + + +class TweetDfExtractor: + """ + this function will parse tweets json into a pandas dataframe + Return + ------ + dataframe + """ + + def __init__(self, tweets_list): + + self.tweets_list = tweets_list + + # an example function + def find_statuses_count(self) -> list: + statuses_count = [x["user"]["statuses_count"] + for x in self.tweets_list] + + return statuses_count + + def find_full_text(self) -> list: + full_text = [] + for tweet in self.tweets_list: + try: + full_text.append( + tweet["retweeted_status"]['extended_tweet']['full_text']) + except KeyError: + full_text.append("") + + return full_text + + def find_original_text(self) -> list: + original_text = [x['text'] for x in self.tweets_list] + + return original_text + + def find_sentiment(self, polarity, subjectivity) -> list: + sentiment = [] + for i in range(len(polarity)): + if polarity[i] > 0: + sentiment.append(1) + elif polarity[i] < 0: + sentiment.append(0) + else: + sentiment.append(-1) + + return sentiment + + def find_sentiments(self, text) -> list: + polarityList = [] + subjectivityList = [] + for eachText in text: + polarity, subjectivity = TextBlob(eachText).sentiment + polarityList.append(polarity) + subjectivityList.append(subjectivity) + + return polarityList, subjectivityList + + def find_lang(self) -> list: + lang = [x['lang'] for x in self.tweets_list] + + return lang + + def find_created_time(self) -> list: + created_at = [x['created_at'] for x in self.tweets_list] + + return created_at + + def find_source(self) -> list: + source = [x['source'] for x in self.tweets_list] + + return source + + def find_screen_name(self) -> list: + screen_name = [x['user']['screen_name'] for x in self.tweets_list] + + return screen_name + + def find_screen_count(self) -> list: + screen_count = [x['user']['listed_count'] + for x in self.tweets_list] + + return screen_count + + def find_followers_count(self) -> list: + followers_count = [x['user']['followers_count'] + for x in self.tweets_list] + + return followers_count + + def find_friends_count(self) -> list: + friends_count = [x['user']['friends_count'] for x in self.tweets_list] + + return friends_count + + def is_sensitive(self) -> list: + is_sensitive = [] + for tweet in self.tweets_list: + try: + value = tweet["retweeted_status"]['possibly_sensitive'] + if(not value): + is_sensitive.append(None) + else: + is_sensitive.append(value) + except KeyError: + is_sensitive.append(None) + + return is_sensitive + + def find_favourite_count(self) -> list: + favourite_count = [] + for tweet in self.tweets_list: + try: + favourite_count.append( + tweet["retweeted_status"]['favorite_count']) + except KeyError: + favourite_count.append(0) + + return favourite_count + + def find_retweet_count(self) -> list: + retweet_count = [] + for tweet in self.tweets_list: + try: + retweet_count.append( + tweet["retweeted_status"]['retweet_count']) + except KeyError: + retweet_count.append(0) + + return retweet_count + + def find_hashtags(self) -> list: + hashtags = [] + for tweet in self.tweets_list: + try: + hashtags.append(tweet['entities']['hashtags'][0]['text']) + except KeyError: + hashtags.append(None) + except IndexError: + hashtags.append(None) + + return hashtags + + def find_mentions(self) -> list: + mentions = [] + main_mentions = [x['entities']['user_mentions'] + for x in self.tweets_list] + for mention in main_mentions: + for each in mention: + mentions.append(each['screen_name']) + + return mentions + + def find_place(self) -> list: + place = [x['place'] for x in self.tweets_list] + + return place + + def find_coordinates(self) -> list: + coordinates = [x['coordinates'] for x in self.tweets_list] + + return coordinates + + def find_location(self) -> list: + location = [x['user']['location'] for x in self.tweets_list] + + return location + + def get_tweet_df(self, save=False) -> pd.DataFrame: + """required column to be generated you should be creative and add more features""" + + columns = ['created_at', 'source', 'original_text', 'sentiment', 'polarity', 'subjectivity', 'lang', 'favorite_count', 'retweet_count', + 'original_author', 'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags', 'user_mentions', 'place'] + + created_at = self.find_created_time() + source = self.find_source() + original_text = self.find_original_text() + clean_text = self.find_full_text() + polarity, subjectivity = self.find_sentiments(clean_text) + sentiment = self.find_sentiment(polarity, subjectivity) + lang = self.find_lang() + favorite_count = self.find_favourite_count() + retweet_count = self.find_retweet_count() + original_author = self.find_screen_name() + screen_count = self.find_screen_count() + followers_count = self.find_followers_count() + friends_count = self.find_friends_count() + possibly_sensitive = self.is_sensitive() + hashtags = self.find_hashtags() + user_mentions = self.find_mentions() + place = self.find_location() + place_coord_boundaries = self.find_coordinates() + data = zip(created_at, source, original_text, sentiment, polarity, subjectivity, lang, favorite_count, retweet_count, + original_author, followers_count, friends_count, possibly_sensitive, hashtags, user_mentions, place) + df = pd.DataFrame(data=data, columns=columns) + + if save: + df.to_csv('./data/processed_tweet_data.csv', index=False) + print('File Successfully Saved.!!!') + + return df + + +if __name__ == "__main__": + # required column to be generated you should be creative and add more features + columns = ['created_at', 'source', 'original_text', 'clean_text', 'sentiment', 'polarity', 'subjectivity', 'lang', 'favorite_count', 'retweet_count', + 'original_author', 'screen_count', 'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries'] + _, tweet_list = read_json("./data/covid19.json") + tweet = TweetDfExtractor(tweet_list) + tweet_df = tweet.get_tweet_df(save=True) + + # use all defined functions to generate a dataframe with the specified columns above \ No newline at end of file diff --git a/fix_clean_tweets_dataframe.py b/fix_clean_tweets_dataframe.py deleted file mode 100644 index 7b45a35..0000000 --- a/fix_clean_tweets_dataframe.py +++ /dev/null @@ -1,58 +0,0 @@ -class Clean_Tweets: - """ - The PEP8 Standard AMAZING!!! - """ - def __init__(self, df:pd.DataFrame): - self.df = df - print('Automation in Action...!!!') - - def drop_unwanted_column(self, df:pd.DataFrame)->pd.DataFrame: - """ - remove rows that has column names. This error originated from - the data collection stage. - """ - unwanted_rows = df[df['retweet_count'] == 'retweet_count' ].index - df.drop(unwanted_rows , inplace=True) - df = df[df['polarity'] != 'polarity'] - - return df - def drop_duplicate(self, df:pd.DataFrame)->pd.DataFrame: - """ - drop duplicate rows - """ - - --- - - return df - def convert_to_datetime(self, df:pd.DataFrame)->pd.DataFrame: - """ - convert column to datetime - """ - ---- - - ---- - - df = df[df['created_at'] >= '2020-12-31' ] - - return df - - def convert_to_numbers(self, df:pd.DataFrame)->pd.DataFrame: - """ - convert columns like polarity, subjectivity, retweet_count - favorite_count etc to numbers - """ - df['polarity'] = pd.---- - - ---- - ---- - - return df - - def remove_non_english_tweets(self, df:pd.DataFrame)->pd.DataFrame: - """ - remove non english tweets from lang - """ - - df = ---- - - return df \ No newline at end of file diff --git a/fix_extract_dataframe.py b/fix_extract_dataframe.py deleted file mode 100644 index 3bd792d..0000000 --- a/fix_extract_dataframe.py +++ /dev/null @@ -1,137 +0,0 @@ -import json -import pandas as pd -from textblob import TextBlob - - -def read_json(json_file: str)->list: - """ - json file reader to open and read json files into a list - Args: - ----- - json_file: str - path of a json file - - Returns - ------- - length of the json file and a list of json - """ - - tweets_data = [] - for tweets in open(json_file,'r'): - tweets_data.append(json.loads(tweets)) - - - return len(tweets_data), tweets_data - -class TweetDfExtractor: - """ - this function will parse tweets json into a pandas dataframe - - Return - ------ - dataframe - """ - def __init__(self, tweets_list): - - self.tweets_list = tweets_list - - # an example function - def find_statuses_count(self)->list: - statuses_count - - def find_full_text(self)->list: - text = - - - def find_sentiments(self, text)->list: - - return polarity, self.subjectivity - - def find_created_time(self)->list: - - return created_at - - def find_source(self)->list: - source = - - return source - - def find_screen_name(self)->list: - screen_name = - - def find_followers_count(self)->list: - followers_count = - - def find_friends_count(self)->list: - friends_count = - - def is_sensitive(self)->list: - try: - is_sensitive = [x['possibly_sensitive'] for x in self.tweets_list] - except KeyError: - is_sensitive = None - - return is_sensitive - - def find_favourite_count(self)->list: - - - def find_retweet_count(self)->list: - retweet_count = - - def find_hashtags(self)->list: - hashtags = - - def find_mentions(self)->list: - mentions = - - - def find_location(self)->list: - try: - location = self.tweets_list['user']['location'] - except TypeError: - location = '' - - return location - - - - - def get_tweet_df(self, save=False)->pd.DataFrame: - """required column to be generated you should be creative and add more features""" - - columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', - 'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place'] - - created_at = self.find_created_time() - source = self.find_source() - text = self.find_full_text() - polarity, subjectivity = self.find_sentiments(text) - lang = self.find_lang() - fav_count = self.find_favourite_count() - retweet_count = self.find_retweet_count() - screen_name = self.find_screen_name() - follower_count = self.find_followers_count() - friends_count = self.find_friends_count() - sensitivity = self.is_sensitive() - hashtags = self.find_hashtags() - mentions = self.find_mentions() - location = self.find_location() - data = zip(created_at, source, text, polarity, subjectivity, lang, fav_count, retweet_count, screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location) - df = pd.DataFrame(data=data, columns=columns) - - if save: - df.to_csv('processed_tweet_data.csv', index=False) - print('File Successfully Saved.!!!') - - return df - - -if __name__ == "__main__": - # required column to be generated you should be creative and add more features - columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', - 'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries'] - _, tweet_list = read_json("../covid19.json") - tweet = TweetDfExtractor(tweet_list) - tweet_df = tweet.get_tweet_df() - - # use all defined functions to generate a dataframe with the specified columns above \ No newline at end of file From 66d90fbe5cfd08c0f48c745fbb20b53f4df7574f Mon Sep 17 00:00:00 2001 From: Amdework Asefa <56429095+Amdework21@users.noreply.github.com> Date: Wed, 10 Aug 2022 13:05:58 +0300 Subject: [PATCH 2/2] Created using Colaboratory --- Done_Challenge__Day2.ipynb | 628 +++++++++++++++++++++++++++++++++++++ 1 file changed, 628 insertions(+) create mode 100644 Done_Challenge__Day2.ipynb diff --git a/Done_Challenge__Day2.ipynb b/Done_Challenge__Day2.ipynb new file mode 100644 index 0000000..5c0e597 --- /dev/null +++ b/Done_Challenge__Day2.ipynb @@ -0,0 +1,628 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Done_Challenge_ Day2.ipynb", + "provenance": [], + "collapsed_sections": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#Done Project: Data Minining Project for X company" + ], + "metadata": { + "id": "zroHHWfG7V2M" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zDwep1K8Erxl" + }, + "source": [ + "**Project:** Data Minining Project for X company" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "JzIu-UWIDXHw" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d7-ii3uyI8KY" + }, + "source": [ + "The CRISP-DM Framework\n", + "\n", + "\n", + "The CRISP-DM methodology provides a structured approach to planning a data mining project. It is a robust and well-proven methodology.\n", + "* Business understanding (BU): Determine Business Objectives, Assess Situation, Determine Data Mining Goals, Produce Project Plan\n", + "\n", + "* Data understanding (DU): Collect Initial Data, Describe Data, Explore Data, Verify Data Quality\n", + "\n", + "* Data preparation (DP): Select Data, Clean Data, Construct Data, Integrate Data\n", + "\n", + "* Modeling (M): Select modeling technique, Generate Test Design, Build Model, Assess Model\n", + "* Evaluation (E): Evaluate Results, Review Process, Determine Next Steps\n", + "* Deployment (D): Plan Deployment, Plan Monitoring and Maintenance, Produce Final Report, Review Project\n", + "\n", + "\n", + "References:\n", + "\n", + "[What is the CRISP-DM methodology?](https://www.sv-europe.com/crisp-dm-methodology/)\n", + "\n", + "[Introduction to CRISP DM Framework for Data Science and Machine Learning](https://www.linkedin.com/pulse/chapter-1-introduction-crisp-dm-framework-data-science-anshul-roy/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5lo7Ml7tMQOf" + }, + "source": [ + "**Data Set**\n", + "### The data is for company X which is trying to control attrition. \n", + "### There are two sets of data: \"Existing employees\" and \"Employees who have left\". The following attributes are available for every employee.\n", + "\n", + "\n", + "* Satisfaction Level\n", + "\n", + "* Last evaluation\n", + "\n", + "* Number of projects\n", + "\n", + "* Average monthly hours\n", + "\n", + "* Time spent at the company\n", + "* Whether they have had a work accident\n", + "\n", + "\n", + "* Whether they have had a promotion in the last 5 years\n", + "\n", + "\n", + "* Departments (column sales)\n", + "\n", + "\n", + "* Salary\n", + "\n", + "\n", + "* Whether the employee has left\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sjSj2A2sSph_" + }, + "source": [ + "**Your Role**\n", + " \n", + "\n", + "* As data science team member X company asked you to answer this two questions.\n", + "* What type of employees is leaving? \n", + "\n", + "* Determine which employees are prone to leave next.\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ajdEVA7LiBUp" + }, + "source": [ + "Business Understanding\n", + "\n", + "---\n", + "\n", + "This step mostly focuses on understanding the Business in all the different aspects. It follows the below different steps.\n", + "\n", + "\n", + "\n", + "\n", + "* Identify the goal and frame the business problem.\n", + "* Prepare Analytical Goal i.e. what type of performance metric and loss function to use\n", + "* Gather information on resource, constraints, assumptions, risks etc\n", + "* Gather information on resource, constraints, assumptions, risks etc\n", + "* Prepare Work Flow Chart" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J4MwiCYzj2_u" + }, + "source": [ + "### Write the main objectives of this project in your words?\n", + "minimum of 100 characters" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "STyLda45j1Mf" + }, + "source": [ + "main_objectives ='''This project aims to allow me to understand about Data mining methodologies such as CRISP in general, \n", + "particularly business and data understanding. we have two classes, namely: \"Existing employees\" and \"Employees who have left\"\n", + "It could be identified based on the value of each given questions as the model will train from it. \n", + "'''" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "CuOlxLxKMOLI" + }, + "source": [ + "assert len(main_objectives) > 100 \n", + "### BEGIN HIDDEN TESTS\n", + "assert len(main_objectives) > 80 \n", + "### END HIDDEN TESTS" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NyXeNxlCkbaw" + }, + "source": [ + "### Outline the different data analysis steps you will follow to carry out the project" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "rC-tl8sUksQq" + }, + "source": [ + "dm_outline = '''According to Will Hillier, we have 7 data analysis steps [1] (https://careerfoundry.com/en/blog/data-analytics/the-data-analysis-process-step-by-step/#step-four-analyzing-the-data) \n", + "1. Defining the question: we have already given two questions 'What type of employees is leaving?' and 'which employees are prone to leave next?'\n", + "2. Collecting the data: we need a massive amount of data in order to train the model well. ML requires large amount of data.\n", + "3. Cleaning the data: uncleaned data leads to wrong prediction, hence cleaning data is mandatory.\n", + "4. Analyzing the data: This one is the main step. after we cleaned the data, analyzing or using it for training is the next level with such as predictive analysis\n", + "5. Sharing your results: We have found something (insights) in analysis level, the next is sharing it to the x organization\n", + "6. Embracing failure: failure is the sign of working something harder to work, hence accepting failure and hone your ability to spot and rectify errors is the main thing. \n", + "7. Summary: the final step, is to summarize what we have done.\n", + "'''" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-K1mWuDoksTk" + }, + "source": [ + "assert len(dm_outline) > 100 \n", + "### BEGIN HIDDEN TESTS\n", + "assert len(dm_outline) > 70 \n", + "### END HIDDEN TESTS" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pmUDFG1wkzUy" + }, + "source": [ + "I will use the Accuracy metric to measure the performance of this data analysis model\n", + "# accuracy = **$\\frac{correct-predictions}{all-predictions}$**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KCNulojKk_BP" + }, + "source": [ + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vLS2YHoRk_EK" + }, + "source": [ + "Why do you choose these metrics? minimum of 100 characters" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "LSynT14KlPSJ" + }, + "source": [ + "why_metrics = '''we are developing a model to predict whether the employees of the x organization will leave or not based on the data collected data from it. \n", + "Hence, we want to build a more accurate model that can be able to outcomes result in better decisions. There might be a cost of errors, but optimizing model accuracy mitigates that cost. \n", + "There are many optimization algorithms to handle such losses of a model. The benefits of improving model accuracy help avoid considerable time, money, and undue stress.\n", + "'''" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "yr-Mk0E8lPVJ" + }, + "source": [ + "assert len(why_metrics) > 100 \n", + "### BEGIN HIDDEN TESTS\n", + "assert len(why_metrics) > 80 \n", + "### END HIDDEN TESTS" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aAo19Ip6lUtm" + }, + "source": [ + "### How would you know if your data analysis work is a success or not?\n", + "minimum of 100 characters" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "HESsiXW5llX-" + }, + "source": [ + "how_success = '''After we have analyzed the data (or experiment the model) we'll demonstrate the result to the organization. \n", + "What is next is taking their response and feedback by applying a usability testing and quality testing measurements. \n", + "'''" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "FdUoiMIOlmXq", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "cf546265-bd20-46dd-e5fe-273e53eef495" + }, + "source": [ + "assert len(how_success) > 100 \n", + "### BEGIN HIDDEN TESTS\n", + "print(len(how_success))\n", + "assert len(how_success) > 80 \n", + "### END HIDDEN TESTS" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "227\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DQE6dqo6l1TZ" + }, + "source": [ + "## What kind of challenges do you expect in your analysis?\n", + "List at least 3 challenges" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WrAhBQhQl8Lh" + }, + "source": [ + "challenge_text = '''The most challenge will be related to the data collection process. however, we could also face other challenges. \n", + "Here are some challenges what I expect during data analysis.\n", + "1. Collecting meaningful data: Identifying and collecting which data is vital for the organization/business is one \n", + "2. Selecting the right tool: Since the nature of data may vary as per the area we are going to work, selecting the right tool for the collected data may also a challenge.\n", + "3. Consolidate data from multiple sources: data can be collected form different sources; hence structure of these data will be different, putting these data together and using it is another challenge \n", + "'''\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "EedHa-Pll8X7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1b8df4ac-173a-42aa-d0f7-df15a5844714" + }, + "source": [ + "assert len(challenge_text) > 100 \n", + "### BEGIN HIDDEN TESTS\n", + "print(len(challenge_text))\n", + "assert len(how_success) > 80 \n", + "### END HIDDEN TESTS" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "663\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZcJ8M6uWDeSE" + }, + "source": [ + "

Using the processed twitter data from yesterday's challenge

.\n", + "\n", + "\n", + "- Form a new data frame (named `cleanTweet`), containing columns $\\textbf{clean-text}$ and $\\textbf{polarity}$.\n", + "\n", + "- Write a function `text_category` that takes a value `p` and returns, depending on the value of p, a string `'positive'`, `'negative'` or `'neutral'`.\n", + "\n", + "- Apply this function (`text_category`) on the $\\textbf{polarity}$ column of `cleanTweet` in 1 above to form a new column called $\\textbf{score}$ in `cleanTweet`.\n", + "\n", + "- Visualize The $\\textbf{score}$ column using piechart and barchart\n", + "\n", + "
Now I want to build a classification model on the clean tweet following the steps below:
\n", + "\n", + "* Remove rows from `cleanTweet` where $\\textbf{polarity}$ $= 0$ (i.e where $\\textbf{score}$ = Neutral) and reset the frame index.\n", + "* Construct a column $\\textbf{scoremap}$ Use the mapping {'positive':1, 'negative':0} on the $\\textbf{score}$ column\n", + "* Create feature and target variables `(X,y)` from $\\textbf{clean-text}$ and $\\textbf{scoremap}$ columns respectively.\n", + "* Use `train_test_split` function to construct `(X_train, y_train)` and `(X_test, y_test)` from `(X,y)`\n", + "\n", + "* Build an `SGDClassifier` model from the vectorize train text data. Use `CountVectorizer()` with a $\\textit{trigram}$ parameter.\n", + "\n", + "* Evaluate your model on the test data.\n" + ] + }, + { + "cell_type": "code", + "source": [ + "#install tweepy if not installed\n", + "#!pip uninstall tweepy\n", + "#!pip install git+https://github.com/tweepy/tweepy.git" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "H9QudDFcB1S2", + "outputId": "39983ce2-28e1-4d39-c569-1b197c976c5e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Found existing installation: tweepy 4.10.0\n", + "Uninstalling tweepy-4.10.0:\n", + " Would remove:\n", + " /usr/local/lib/python3.7/dist-packages/tweepy-4.10.0.dist-info/*\n", + " /usr/local/lib/python3.7/dist-packages/tweepy/*\n", + "Proceed (y/n)? y\n", + " Successfully uninstalled tweepy-4.10.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting git+https://github.com/tweepy/tweepy.git\n", + " Cloning https://github.com/tweepy/tweepy.git to /tmp/pip-req-build-ilv676b3\n", + " Running command git clone -q https://github.com/tweepy/tweepy.git /tmp/pip-req-build-ilv676b3\n", + "Requirement already satisfied: oauthlib<4,>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from tweepy==4.10.0) (3.2.0)\n", + "Requirement already satisfied: requests<3,>=2.27.0 in /usr/local/lib/python3.7/dist-packages (from tweepy==4.10.0) (2.28.1)\n", + "Requirement already satisfied: requests-oauthlib<2,>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from tweepy==4.10.0) (1.3.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (2022.6.15)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (1.24.3)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (2.10)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.27.0->tweepy==4.10.0) (2.1.0)\n", + "Building wheels for collected packages: tweepy\n", + " Building wheel for tweepy (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for tweepy: filename=tweepy-4.10.0-py3-none-any.whl size=94559 sha256=960c825fe40e9aa906c08568d99e2a97c0c100c3f794f8b777a495e3cbe3c83f\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-8f1vqvt6/wheels/b4/a5/5a/5074abdb9f4bd5bd0e22631a63fc41ae2fa71ad83780ea18d1\n", + "Successfully built tweepy\n", + "Installing collected packages: tweepy\n", + "Successfully installed tweepy-4.10.0\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "85WxmGNGDcBY" + }, + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%reload_ext autoreload" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Imports\n", + "import tweepy\n", + "import json\n", + "import time" + ], + "metadata": { + "id": "d-5-tkS4_oLE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Config file variables\n", + "consumer_key = ''\n", + "consumer_secret = ''\n", + "access_token = ''\n", + "access_token_secret = ''\n" + ], + "metadata": { + "id": "iSz4D89j_UE3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Authenticate into Tweepy\n", + "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", + "auth.set_access_token(access_token, access_token_secret)\n", + "api = tweepy.API(auth, wait_on_rate_limit=True)\n" + ], + "metadata": { + "id": "yuKFY_G__UHz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "keywords = \"#blockchain OR #cryptocurreny OR #financialmarket OR #bitcoin OR #ethereum\"\n", + "# keywords_with_geocode = \"#blockchain OR #cryptocurreny OR #financialmarket OR #bitcoin OR #ethereum geocode:6.611,20.934,240km\"\n", + "limit = 50" + ], + "metadata": { + "id": "40YuqSGS_UKb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# save tweets to json file\n", + "c = tweepy.Cursor(\n", + " api.search_tweets,\n", + " q=keywords,\n", + " tweet_mode=\"extended\",\n", + " include_entities=True,\n", + ").items(limit)" + ], + "metadata": { + "id": "9qFjY8hN_UMz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "while True:\n", + " try:\n", + " tweet = c.next()\n", + " with open(\"../data/\" + \"web3.json\", \"a\", encoding=\"utf-8\") as f:\n", + " # for tweet in tweets:\n", + " data = tweet._json\n", + " f.write(json.dumps(data))\n", + " f.write(\"\\n\")\n", + " except tweepy.TooManyRequests:\n", + " print(\"Limit Reached. Sleeping for 15 minutes\")\n", + " time.sleep(60 * 15)\n", + " continue\n", + " except StopIteration:\n", + " break" + ], + "metadata": { + "id": "Iul5TTYw_UPn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "working on" + ], + "metadata": { + "id": "upr7mlvDFG58" + } + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "mJ64ezpVlxAT" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file