code prep

SaeedShurrab · May 3, 2021 · 1991f8e · 1991f8e
1 parent 45646d6
commit 1991f8e
Show file tree

Hide file tree

Showing 7 changed files with 614 additions and 26 deletions.
diff --git a/notebooks/project_code.ipynb b/notebooks/project_code.ipynb
diff --git a/plots/apr_sent.jpg b/plots/apr_sent.jpg
diff --git a/plots/feb_sent.jpg b/plots/feb_sent.jpg
diff --git a/plots/label_dist.jpg b/plots/label_dist.jpg
diff --git a/plots/mar_sent.jpg b/plots/mar_sent.jpg
diff --git a/preprocessing.py b/preprocessing.py
@@ -20,10 +20,6 @@
 
 data = data[['text','datetime','hashtags']]
 
-import numpy as np
-idx = np.arange(data.shape[0])
-np.random.shuffle(idx)
-data = data.iloc[0:13000]
 
 # Apply Cleaning
 data['clean_text']=data['text'].progress_apply(lambda x: process_all_text(x))
@@ -84,10 +80,10 @@
 mar_tweets = data[data.month == 3]
 apr_tweets = data[data.month == 4]
 
-data.to_csv(os.path.join(intermediate_dir,'cleaned_data.csv'))
+data.to_pickle(os.path.join(intermediate_dir,'cleaned_data.pkl'))
 
-feb_tweets.to_csv(os.path.join(preprocessed_dir,'feb_tweets.csv'))
-mar_tweets.to_csv(os.path.join(preprocessed_dir,'mar_tweets.csv'))
-apr_tweets.to_csv(os.path.join(preprocessed_dir,'apr_tweets.csv'))
+feb_tweets.to_pickle(os.path.join(preprocessed_dir,'feb_tweets.pkl'))
+mar_tweets.to_pickle(os.path.join(preprocessed_dir,'mar_tweets.pkl'))
+apr_tweets.to_pickle(os.path.join(preprocessed_dir,'apr_tweets.pkl'))
 
 print('preprocessing completed \n')
diff --git a/sentiment.py b/sentiment.py
@@ -1,14 +1,16 @@
 import os
 import numpy as np
 import pandas as pd
-from src.utils import plot_sentiment
+import matplotlib.pyplot as plt
+from src.utils import plot_sentiment, font
+import warnings
 
 preprocessed_dir = os.path.join(os.curdir,'data','preprocessed')
 plots_dir = os.path.join(os.curdir,'plots')
 
-feb_tweets = pd.read_csv(os.path.join(preprocessed_dir,'feb_tweets.csv'),index_col=0)
-mar_tweets = pd.read_csv(os.path.join(preprocessed_dir,'mar_tweets.csv'),index_col=0)
-#apr_tweets = pd.read_csv(os.path.join(preprocessed_dir,'apr_tweets.csv'),index_col=0)
+feb_tweets = pd.read_pickle(os.path.join(preprocessed_dir,'feb_tweets.pkl'))
+mar_tweets = pd.read_pickle(os.path.join(preprocessed_dir,'mar_tweets.pkl'))
+apr_tweets = pd.read_pickle(os.path.join(preprocessed_dir,'apr_tweets.pkl'))
 
 # COVID-19 Confirmed Cases 1/Feb. - 29/Apr.
 
@@ -55,13 +57,13 @@
 
 feb_tweets['day'] = pd.DatetimeIndex(feb_tweets['datetime']).day
 mar_tweets['day'] = pd.DatetimeIndex(mar_tweets['datetime']).day
-#apr_tweets['day'] = pd.DatetimeIndex(apr_tweets['datetime']).day
+apr_tweets['day'] = pd.DatetimeIndex(apr_tweets['datetime']).day
 
 
 # Sentiment frequency per month extraction
 feb_freq = list(feb_tweets.groupby(['label','day'])['day'].count().to_dict().values())
 mar_freq = list(mar_tweets.groupby(['label','day'])['day'].count().to_dict().values())
-#apr_freq = list(apr_tweets.groupby(['label','day'])['day'].count().to_dict().values())
+apr_freq = list(apr_tweets.groupby(['label','day'])['day'].count().to_dict().values())
 
 feb_nuet = feb_freq[0:29]
 feb_neg = feb_freq[29:58]
@@ -71,9 +73,11 @@
 mar_neg = mar_freq[31:62]
 mar_pos = mar_freq[62:]
 
-#apr_nuet = apr_freq[0:29]
-#apr_neg = apr_freq[29:58]
-#apr_pos = apr_freq[58:]
+apr_nuet = apr_freq[0:29]
+apr_neg = apr_freq[29:58]
+apr_pos = apr_freq[58:]
+
+
 
 
 # Tweets sentiment associated with number of COVID-19 cases in February. 
@@ -100,12 +104,12 @@
 
 
 # Tweets sentiment associated with number of COVID-19 cases in April. 
-#plot_sentiment(apr_days,
-#               apr_cases, 
-#               apr_death,
-#               apr_rec,
-#               apr_pos,
-#               apr_neg,
-#               apr_nuet,
-#               os.path.join(plots_dir,'apr_sent.jpg'))
+plot_sentiment(apr_days,
+               apr_cases, 
+               apr_death,
+               apr_rec,
+               apr_pos,
+               apr_neg,
+               apr_nuet,
+               os.path.join(plots_dir,'apr_sent.jpg'))