Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
enron_mail_20110402.tgz
enron_mail_20110402/
enron_mail_20150507.tgz
enron_mail_20150507.tar.gz
enron_mail_20150507.tar
maildir/
text_learning/your_word_data.pkl
text_learning/your_email_authors.pkl
my_classifier.pkl
my_dataset.pkl
my_feature_list.pkl
.idea

Binary file added Project report.docx
Binary file not shown.
34 changes: 32 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,34 @@
ud120-projects
<h1> ud120-projects </h1>
==============

Starter project code for students taking Udacity ud120
My repo for Udacity ud120 course

<h2> Content </h2>
* Session exercises / mini projects
* Enron project


<h2> IDE </h2>
PyCharm Community Edition by JetBrains

<h2> Commands used </h2>
**install sklearn**

pip install scikit-learn

**install natural language toolkit**

pip install nltk

**install matplotlib**

pip install matplotlib

<h2> Environment from requirements.txt</h2>

nltk==3.2.1<br>
numpy==1.13.3<br>
scikit-learn==0.18<br>
scipy==0.19.1<br>


2 changes: 1 addition & 1 deletion choose_your_own/class_vis.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,5 @@ def output_image(name, format, bytes):
data['name'] = name
data['format'] = format
data['bytes'] = base64.encodestring(bytes)
print image_start+json.dumps(data)+image_end
print( image_start+json.dumps(data)+image_end)

Binary file added choose_your_own/test.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
11 changes: 10 additions & 1 deletion choose_your_own/your_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,16 @@

### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

# Fit two candidate classifiers on the training data and report the accuracy
# of each on the held-out test set.
# NOTE(review): the starter comment above says the visualization code looks
# for an object named `clf`; neither model is bound to that name here -- bind
# the preferred one (e.g. `clf = clf_rf`) if the decision-boundary plot is
# wanted.

# k-nearest neighbours
clf_knn = KNeighborsClassifier(n_neighbors=4)
clf_knn.fit(features_train, labels_train)
pred_knn = clf_knn.predict(features_test)
print( "Accuracy for KNeighborsClassifier:", accuracy_score(labels_test, pred_knn))

# Random forest. The predictions get their own name so that the fitted
# classifier object is not overwritten and stays usable afterwards.
clf_rf = RandomForestClassifier(n_estimators=15, min_samples_split=6)
clf_rf.fit(features_train, labels_train)
pred_rf = clf_rf.predict(features_test)
print( "Accuracy RandomForestClassifier:", accuracy_score(labels_test, pred_rf))



Expand Down
43 changes: 42 additions & 1 deletion datasets_questions/explore_enron_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,48 @@
"""

import pickle
import numpy as np

enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))


# Size of the dataset: number of people, then number of features stored for
# the first person.
print(len(enron_data))
print(len(list(enron_data.values())[0]))

# Number of persons of interest (POIs) present in the dataset.
count = 0
for person_name in enron_data:
    if enron_data[person_name]["poi"] == 1:
        count += 1
print(count)

# Count the lines of the names file that carry a "(y)" or "(n)" marker.
total_poi = 0
with open('../final_project/poi_names.txt', 'r') as names_file:
    for line in names_file:
        # Each marker needs its own membership test: `'(y)' or '(n)' in line`
        # is always true because a non-empty string literal is truthy by
        # itself, which made every line of the file count.
        if '(y)' in line or '(n)' in line:
            total_poi += 1
# The `with` block has already closed the file; no explicit close() needed.
print(total_poi)

print("Net Stock value of James Prentice: ", enron_data['PRENTICE JAMES']['total_stock_value'])
print("Wesley Colwell to POI emails: ", enron_data['COLWELL WESLEY']['from_this_person_to_poi'])
print("Stock options of Jeffrey Skilling: ", enron_data['SKILLING JEFFREY K']['exercised_stock_options'])

# Largest total_payments value among the three named executives.
most_value_taken = max(
    enron_data[person_name]['total_payments']
    for person_name in ("LAY KENNETH L", "SKILLING JEFFREY K", "FASTOW ANDREW S")
)
print(most_value_taken)

salaries_not_nan = 0
known_emails = 0
# NOTE: despite their names, the next two counters hold the number of people
# (and of POIs) whose total_payments IS 'NaN', matching the labels printed
# at the end.
total_payments_not_nan = 0
total_payments_not_nan_poi = 0
for person_name in enron_data:
    person = enron_data[person_name]
    # Missing numeric values are stored as the string 'NaN'; float() turns
    # that into a real NaN that np.isnan can detect.
    if not np.isnan(float(person['salary'])):
        salaries_not_nan += 1
    if person['email_address'] != 'NaN':
        known_emails += 1
    if np.isnan(float(person['total_payments'])):
        total_payments_not_nan += 1
        # Test the POI flag directly: np.isnan() of a boolean comparison is
        # always False, so the POI counter could never be incremented.
        if person["poi"] == 1:
            total_payments_not_nan_poi += 1

print('Salaries available:: ', salaries_not_nan)
print('Available emails: ', known_emails)
print('Number Percentage people NaN -> their total payments: ',total_payments_not_nan, total_payments_not_nan*100/len(enron_data))
print('Number and Percentage Pois NaN -> their total payments: ',total_payments_not_nan_poi, total_payments_not_nan_poi*100/count)
16 changes: 16 additions & 0 deletions decision_tree/dt_author_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,23 @@

#########################################################
### your code goes here ###
# Imports needed for the decision-tree exercise
from sklearn import tree
from sklearn.metrics import accuracy_score

# Create the classifier; min_samples_split=40 means a node is only split
# when it holds at least 40 samples
clf = tree.DecisionTreeClassifier(min_samples_split=40)

# Fit the classifier on the training features and labels
clf.fit(features_train, labels_train)

# Predict labels for the test features
pred = clf.predict(features_test)

# Report the accuracy on the test set
print( "Accuracy:", accuracy_score(labels_test, pred))

# Length of the first training row, i.e. the number of features per sample
print( "No of features in date:", len(features_train[0]))

#########################################################

Expand Down
60 changes: 60 additions & 0 deletions evaluation/evaluate_poi_identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@

import pickle
import sys
import numpy as np
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.cross_validation import train_test_split

# Pickle data is binary, so the file is opened in "rb" mode (text mode "r"
# fails on Python 3); the context manager closes the handle after loading.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

Expand All @@ -27,5 +31,61 @@


### your code goes here
# Hold out 30% of the data for testing; the fixed random_state keeps the
# split reproducible between runs.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.30, random_state=42)

# Create the decision tree classifier, train it and predict the test set.
clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print( "accuracy:", accuracy_score(labels_test, pred))

### evaluation
# Distinct predicted labels together with how often each one was predicted.
values, counts = np.unique(pred, return_counts=True)
test_size = len(features_test)

# zip() is a lazy iterator on Python 3; materialize it so the (label, count)
# pairs are printed rather than the iterator object.
print("Predicted POIs:", list(zip(values, counts)))
print( "Total number in test set:", test_size)

# Accuracy of a trivial identifier that predicts 0 for everyone: the share of
# TRUE 0 labels in the test set. This must come from labels_test, not from
# the counts of the predictions; float() keeps it a true division on Python 2.
non_poi_in_test = sum(1 for label in labels_test if label == 0)
print( "Accuracy - all poi=0:", float(non_poi_in_test) / test_size)

# True positives: samples that are POIs and were also predicted as POIs.
true_positives = sum(
    1 for actual, predicted in zip(labels_test, pred)
    if actual == 1 and predicted == 1
)

print( "TP - true positives:", true_positives)
print( "Precision score:", precision_score(labels_test, pred))
print( "Recall score:", recall_score(labels_test, pred))

prediction_labels = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]


def calc_precision_and_recall(actual, predicted):
    """Print the precision and recall of a set of binary predictions.

    Args:
        actual: iterable of true labels (1 = positive, 0 = negative).
        predicted: iterable of predicted labels, paired positionally with
            ``actual``.

    Returns:
        None. The two metrics are written to standard output.

    A ratio whose denominator is zero (no predicted positives for precision,
    no actual positives for recall) is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    print( "Doing precision and recall...")
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    for a, p in zip(actual, predicted):
        if a == 1 and p == 1:
            true_positives += 1
        elif a == 1 and p == 0:
            false_negatives += 1
        elif a == 0 and p == 1:
            false_positives += 1
        # Every other pair is a true negative, which neither metric uses.

    def _ratio(hits, misses):
        # float() keeps this a true division on Python 2 as well.
        total = hits + misses
        return float(hits) / total if total else 0.0

    print( "Precision:", _ratio(true_positives, false_positives))
    print( "Recall:", _ratio(true_positives, false_negatives))


calc_precision_and_recall(true_labels, prediction_labels)

17 changes: 16 additions & 1 deletion feature_selection/find_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

# get words
words = vectorizer.get_feature_names()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
Expand All @@ -38,6 +40,19 @@


### your code goes here

from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the training data and score it on the test set.
clf = DecisionTreeClassifier(min_samples_split=40)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print( "Accuracy:", accuracy_score(labels_test, pred))

# Report every feature whose importance is above 0.2, together with the
# vocabulary word stored at the same index.
print( "Important features:")
for index, feature in enumerate(clf.feature_importances_):
    if not feature > 0.2:
        continue
    print( "Feature number", index)
    print( "Importance", feature)
    print( "Word", words[index])


Loading