Skip to content

Commit

Permalink
code prep
Browse files Browse the repository at this point in the history
  • Loading branch information
SaeedShurrab committed May 3, 2021
1 parent 45646d6 commit 1991f8e
Show file tree
Hide file tree
Showing 7 changed files with 614 additions and 26 deletions.
590 changes: 589 additions & 1 deletion notebooks/project_code.ipynb

Large diffs are not rendered by default.

Binary file added plots/apr_sent.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added plots/feb_sent.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/label_dist.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added plots/mar_sent.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
12 changes: 4 additions & 8 deletions preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@

# Restrict the frame to the text, datetime and hashtags columns.
data = data[['text','datetime','hashtags']]

import numpy as np
# NOTE(review): idx is shuffled here but is not used in the visible lines, so
# the slice below keeps the first 13000 rows in their existing order rather
# than a random sample -- confirm whether data.iloc[idx[:13000]] was intended.
idx = np.arange(data.shape[0])
np.random.shuffle(idx)
data = data.iloc[0:13000]

# Apply Cleaning
# process_all_text is called on every raw tweet and the result is stored in a
# new 'clean_text' column. progress_apply is presumably the tqdm-registered
# variant of Series.apply -- TODO confirm where it is registered.
data['clean_text']=data['text'].progress_apply(lambda x: process_all_text(x))
Expand Down Expand Up @@ -84,10 +80,10 @@
# Select the rows whose 'month' value is 3 (March) and 4 (April).
mar_tweets = data[data.month == 3]
apr_tweets = data[data.month == 4]

# Write the full cleaned frame to the intermediate directory.
data.to_csv(os.path.join(intermediate_dir,'cleaned_data.csv'))
data.to_pickle(os.path.join(intermediate_dir,'cleaned_data.pkl'))

# Write each month's subset to the preprocessed directory.
feb_tweets.to_csv(os.path.join(preprocessed_dir,'feb_tweets.csv'))
mar_tweets.to_csv(os.path.join(preprocessed_dir,'mar_tweets.csv'))
apr_tweets.to_csv(os.path.join(preprocessed_dir,'apr_tweets.csv'))
feb_tweets.to_pickle(os.path.join(preprocessed_dir,'feb_tweets.pkl'))
mar_tweets.to_pickle(os.path.join(preprocessed_dir,'mar_tweets.pkl'))
apr_tweets.to_pickle(os.path.join(preprocessed_dir,'apr_tweets.pkl'))

print('preprocessing completed \n')
38 changes: 21 additions & 17 deletions sentiment.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import os
import numpy as np
import pandas as pd
from src.utils import plot_sentiment
import matplotlib.pyplot as plt
from src.utils import plot_sentiment, font
import warnings

# Input and output directories, built relative to os.curdir.
preprocessed_dir = os.path.join(os.curdir,'data','preprocessed')
plots_dir = os.path.join(os.curdir,'plots')

# Load the per-month tweet frames from the preprocessed directory.
# NOTE(review): pd.read_pickle must only be used on trusted files, because
# unpickling can execute arbitrary code.
feb_tweets = pd.read_csv(os.path.join(preprocessed_dir,'feb_tweets.csv'),index_col=0)
mar_tweets = pd.read_csv(os.path.join(preprocessed_dir,'mar_tweets.csv'),index_col=0)
#apr_tweets = pd.read_csv(os.path.join(preprocessed_dir,'apr_tweets.csv'),index_col=0)
feb_tweets = pd.read_pickle(os.path.join(preprocessed_dir,'feb_tweets.pkl'))
mar_tweets = pd.read_pickle(os.path.join(preprocessed_dir,'mar_tweets.pkl'))
apr_tweets = pd.read_pickle(os.path.join(preprocessed_dir,'apr_tweets.pkl'))

# COVID-19 Confirmed Cases 1/Feb. - 29/Apr.

Expand Down Expand Up @@ -55,13 +57,13 @@

# Add a 'day' column holding the day-of-month of each tweet's 'datetime'
# value, so tweets can be counted per day below.
feb_tweets['day'] = pd.DatetimeIndex(feb_tweets['datetime']).day
mar_tweets['day'] = pd.DatetimeIndex(mar_tweets['datetime']).day
#apr_tweets['day'] = pd.DatetimeIndex(apr_tweets['datetime']).day
apr_tweets['day'] = pd.DatetimeIndex(apr_tweets['datetime']).day


# Sentiment frequency per month extraction
# Each *_freq list holds the tweet count of every (label, day) group, flattened
# in the order groupby yields them (sorted by label, then by day).
feb_freq = list(feb_tweets.groupby(['label','day'])['day'].count().to_dict().values())
mar_freq = list(mar_tweets.groupby(['label','day'])['day'].count().to_dict().values())
#apr_freq = list(apr_tweets.groupby(['label','day'])['day'].count().to_dict().values())
apr_freq = list(apr_tweets.groupby(['label','day'])['day'].count().to_dict().values())

# Split the flat list into one fixed-width run of daily counts per label.
# NOTE(review): the 29-wide slices assume every label has a count for each of
# the 29 days; a missing (label, day) group would shift all later entries --
# TODO confirm against the data.
# NOTE(review): the slice order implies the labels sort as neutral, negative,
# positive -- verify against the label encoding.
feb_nuet = feb_freq[0:29]
feb_neg = feb_freq[29:58]
Expand All @@ -71,9 +73,11 @@
# March: 31 daily counts per label (second and third runs of mar_freq).
mar_neg = mar_freq[31:62]
mar_pos = mar_freq[62:]

#apr_nuet = apr_freq[0:29]
#apr_neg = apr_freq[29:58]
#apr_pos = apr_freq[58:]
# NOTE(review): the April slices are 29 wide although April has 30 days. This
# is only correct if the April data contains exactly 29 distinct days per
# label -- confirm, otherwise the neg/pos runs are misaligned.
apr_nuet = apr_freq[0:29]
apr_neg = apr_freq[29:58]
apr_pos = apr_freq[58:]




# Tweets sentiment associated with number of COVID-19 cases in February.
Expand All @@ -100,12 +104,12 @@


# Tweets sentiment associated with number of COVID-19 cases in April.
#plot_sentiment(apr_days,
# apr_cases,
# apr_death,
# apr_rec,
# apr_pos,
# apr_neg,
# apr_nuet,
# os.path.join(plots_dir,'apr_sent.jpg'))
plot_sentiment(apr_days,
apr_cases,
apr_death,
apr_rec,
apr_pos,
apr_neg,
apr_nuet,
os.path.join(plots_dir,'apr_sent.jpg'))

0 comments on commit 1991f8e

Please sign in to comment.