udacity · ritumalhotra · Jul 31, 2020 · Jul 31, 2020 · Jul 31, 2020 · Jul 31, 2020
@@ -2,9 +2,13 @@
 enron_mail_20110402.tgz
 enron_mail_20110402/
 enron_mail_20150507.tgz
+enron_mail_20150507.tar.gz
+enron_mail_20150507.tar
 maildir/
 text_learning/your_word_data.pkl
 text_learning/your_email_authors.pkl
 my_classifier.pkl
 my_dataset.pkl
 my_feature_list.pkl
+.idea
+
@@ -1,4 +1,34 @@
-ud120-projects
+<h1> ud120-projects </h1> 
 ==============
 
-Starter project code for students taking Udacity ud120
+My repo for Udacity ud120 course
+
+<h2> Content </h2> 
+* Session exercises / mini projects
+* Enron project
+
+
+<h2> IDE </h2> 
+PyCharm Community Edition by JetBrains
+
+<h2> Commands used </h2> 
+**install sklearn**
+
+pip install scikit-learn
+
+**install natural language toolkit** 
+
+pip install nltk
+
+**install matplotlib**
+
+pip install matplotlib
+
+<h2> Environment from requirements.txt</h2> 
+
+nltk==3.2.1<br>
+numpy==1.13.3<br>
+scikit-learn==0.18<br>
+scipy==0.19.1<br>
+
+
@@ -46,5 +46,5 @@ def output_image(name, format, bytes):
     data['name'] = name
     data['format'] = format
     data['bytes'] = base64.encodestring(bytes)
-    print image_start+json.dumps(data)+image_end
+    print( image_start+json.dumps(data)+image_end)
 
@@ -30,7 +30,16 @@
 
 ### your code here!  name your classifier object clf if you want the 
 ### visualization code (prettyPicture) to show you the decision boundary
-
+# KNN
+clf_knn = KNeighborsClassifier(n_neighbors=4)
+clf_knn.fit(features_train, labels_train)
+pred_knn = clf_knn.predict(features_test)
+print( "Accuracy for KNeighborsClassifier:", accuracy_score(labels_test, pred_knn))
+
+clf_rf = RandomForestClassifier(n_estimators=15, min_samples_split=6)
+clf_rf.fit(features_train, labels_train)
+clf_rf = clf_rf.predict(features_test)
+print( "Accuracy RandomForestClassifier:", accuracy_score(labels_test, clf_rf))
 
 
 

@@ -16,7 +16,48 @@
 """
 
 import pickle
+import numpy as np
 
 enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))
 
-
+print(len(enron_data))
+print(len(list(enron_data.values())[0]))
+
+count = 0
+for person_name in enron_data.keys():
+	if(enron_data[person_name]["poi"]==1):
+		count = count+1
+print(count)
+
+total_poi = 0
+with open('../final_project/poi_names.txt', 'r') as file:
+	for line in file:
+		if('\(y\)' or '\(n\)' in line):
+			total_poi= total_poi+1
+print(total_poi)
+file.close()
+print("Net Stock value of James Prentice: ", enron_data['PRENTICE JAMES']['total_stock_value'])
+print("Wesley Colwell to POI emails: ", enron_data['COLWELL WESLEY']['from_this_person_to_poi'])
+print("Stock options of Jeffrey Skilling: ", enron_data['SKILLING JEFFREY K']['exercised_stock_options'])
+
+most_value_taken = max([(enron_data[person_name]['total_payments']) for person_name in ("LAY KENNETH L", "SKILLING JEFFREY K", "FASTOW ANDREW S")])
+print(most_value_taken)
+
+salaries_not_nan = 0
+known_emails = 0
+total_payments_not_nan = 0
+total_payments_not_nan_poi = 0
+for person_name in enron_data:
+	if not np.isnan(float(enron_data[person_name]['salary'])):
+		salaries_not_nan += 1
+	if(enron_data[person_name]['email_address'] != 'NaN'):
+		known_emails+=1
+	if np.isnan(float(enron_data[person_name]['total_payments'])):
+		total_payments_not_nan +=1
+		if np.isnan(enron_data[person_name]["poi"]==1 ):
+			total_payments_not_nan_poi += 1
+
+print('Salaries available:: ', salaries_not_nan)
+print('Available emails: ', known_emails)
+print('Number Percentage people NaN -> their total payments: ',total_payments_not_nan, total_payments_not_nan*100/len(enron_data))
+print('Number and Percentage Pois NaN ->  their total payments: ',total_payments_not_nan_poi, total_payments_not_nan_poi*100/count)
@@ -24,7 +24,23 @@
 
 #########################################################
 ### your code goes here ###
+# imports: decision-tree model and the accuracy metric
+from sklearn import tree
+from sklearn.metrics import accuracy_score
+#
+# create classifier; a node is only split when it holds at least 40 samples
+clf = tree.DecisionTreeClassifier(min_samples_split=40)
 
+# fit the classifier on training features and labels
+clf.fit(features_train, labels_train)
+
+# predict labels for the test features
+pred = clf.predict(features_test)
+
+# print the accuracy of the predictions against the test labels
+print( "Accuracy:", accuracy_score(labels_test, pred))
+# length of the first training row, printed as the feature count
+print( "No of features in date:", len(features_train[0]))
 
 #########################################################
 

@@ -13,8 +13,12 @@
 
 import pickle
 import sys
+import numpy as np
 sys.path.append("../tools/")
 from feature_format import featureFormat, targetFeatureSplit
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score, precision_score, recall_score
+from sklearn.cross_validation import train_test_split
 
 data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") )
 
@@ -27,5 +31,61 @@
 
 
 ### your code goes here 
+features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.30,
+                                                                            random_state=42)  # 30% held out for testing, fixed seed
+# create DT Classifier with default hyper-parameters
+clf = DecisionTreeClassifier()
 
+# fit/train it on the training split
+clf.fit(features_train, labels_train)
+
+# predict labels for the test split
+pred = clf.predict(features_test)
+
+# print the accuracy of the predictions against the test labels
+print( "accuracy:", accuracy_score(labels_test, pred))
+
+### evaluation
+values, counts = np.unique(pred, return_counts=True)
+test_size = len(features_test)
+
+# print
+print("Predicted POIs:", zip(values, counts))
+print( "Total number in test set:", test_size)
+print( "Accuracy - all poi=0:", counts[0] / test_size)
+
+true_positives = 0
+for actual, predicted in zip(labels_test, pred):
+    if actual == 1 and predicted == 1:
+        true_positives += 1
+
+# print
+print( "TP - true positives:", true_positives)
+print( "Precision score:", precision_score(labels_test, pred))
+print( "Recall score:", recall_score(labels_test, pred))
+
+prediction_labels = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1]
+true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]
+
+
+def calc_precision_and_recall(actual, predicted):
+    print( "Doing precision and recall...")
+    true_positives = 0
+    false_positives = 0
+    false_negatives = 0
+    true_negatives = 0
+    for a, p in zip(actual, predicted):
+        if a == 1 and p == 1:
+            true_positives += 1
+        elif a == 1 and p == 0:
+            false_negatives += 1
+        elif a == 0 and p == 1:
+            false_positives += 1
+        else:
+            true_negatives += 1
+    print( "Precision:", true_positives / (true_positives + false_positives))
+    print( "Recall:", true_positives / (true_positives + false_negatives))
+
+
+calc_precision_and_recall(true_labels, prediction_labels)
 
@@ -28,6 +28,8 @@
 features_train = vectorizer.fit_transform(features_train)
 features_test  = vectorizer.transform(features_test).toarray()
 
+# get words: the vectorizer's feature names, looked up by feature index further down (words[index])
+words = vectorizer.get_feature_names()
 
 ### a classic way to overfit is to use a small number
 ### of data points and a large number of features;
@@ -38,6 +40,19 @@
 
 
 ### your code goes here
-
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score
+
+clf = DecisionTreeClassifier(min_samples_split=40)
+clf.fit(features_train, labels_train)
+pred = clf.predict(features_test)
+print( "Accuracy:", accuracy_score(labels_test, pred))
+
+print( "Important features:")
+for index, feature in enumerate(clf.feature_importances_):
+    if feature>0.2:
+        print( "Feature number", index)
+        print( "Importance", feature)
+        print( "Word", words[index])