Add docs for utils, fix error analysis

sgarda · sgarda · commit 3c0eed8eabf4 · 2018-02-16T11:34:56.000+01:00
diff --git a/classify_tweets.py b/classify_tweets.py
@@ -125,15 +125,15 @@ def parse_arguments():
   df = load_data(dep_file = args.tweets_file, annotations = args.annotations)
   
   # replace column of tokens with preprocessed ones 
-  df['toks'] = df['toks_pos'].apply(preprocessing,rm_url = args.rm_url, red_len = args.red_len,lower = args.lower,
+  df['proc_toks'] = df['toks_pos'].apply(preprocessing,rm_url = args.rm_url, red_len = args.red_len,lower = args.lower,
     rm_sw = args.rm_sw, rm_tags_mentions = args.rm_tagsmen, stem = args.stem) 
   # still dataframe with all columns
   
   print("Shuffling data")
   np.random.seed(42)
   df = df.reindex(np.random.permutation(df.index))
   
-  tweets = list(df['toks'])
+  tweets = list(df['proc_toks'])
   labels = list(df['label'])
   pos = list(df['pos'])
   deps = list(df['dep'])
diff --git a/utils.py b/utils.py
@@ -25,6 +25,12 @@ def plot_confusion_matrix(cm, classes,
     """
     This function prints and plots the confusion matrix.
     Normalization can be applied by setting `normalize=True`.
+    
+    :params:
+      cm (np.ndarray) : confusion matrix
+      normalize (bool) : if True, divide each row of `cm` by its row sum (the number of instances in that row's class)
+      title (string) : plot title
+      cmap (matplotlib.colors.LinearSegmentedColormap) : color map for image
     """
     if normalize:
         cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
@@ -79,24 +85,37 @@ def base_clf_hp(clf,text):
     
 def by_class_error_analysis(df,y_true,y_pred,limit,error,out_path):
   """
-  False Positive, False Negative estimation in a one-vs-rest way.
+  Write randomly selected false positives or false negatives to a file. For multiclass problems, FP and FN are estimated one-vs-all.
+  
+  :params:
+    df (pandas.DataFrame) : data set having `toks` column
+    y_true (array) : original labels
+    y_pred (array) : predicted labels
+    limit (int) : # of FP/FN to be printed
+    error (str) : type of misclassification. Choices : `FP` (false positive), `FN` (false negative)
+    out_path (str) : folder where errors will be saved
   """
   
+  errors = ['FP','FN']
+  
+  assert error in errors, "Invalid error choice! Received `{}` :  choose from `{}`! ".format(error,errors)
+  
   if error == 'FP':
     out_file = open(os.path.join(out_path, 'error.FP'),'w+') 
-  else :
+  elif error == 'FN' :
     out_file = open(os.path.join(out_path, 'error.FN'),'w+')
   
   unique_labels = np.unique(y_true)
   
   y_true = np.asarray(y_true)
+  y_pred = np.asarray(y_pred)
   
   for label in unique_labels:
     out_file.write("{}\n".format(str(label).upper()))
     
     if error == 'FP':
       error_idx = np.where((y_true!=label) & (y_pred==label))[0] #take indices
-    else:
+    elif error == 'FN':
       error_idx = np.where((y_true==label) & (y_pred!=label))[0] #take indices
             
     if len(error_idx) < 1: