|
| 1 | +import sys |
| 2 | +import pandas as pd |
| 3 | +import numpy as np |
| 4 | +from sklearn.model_selection import train_test_split |
| 5 | +from sklearn import preprocessing |
| 6 | + |
def generate_fullstats(dataset_path, filelist, targets, target_col_name='Target'):
    """
    Generate a single DataFrame of statistics from a list of csv files.

    Parameters
    ----------
    dataset_path : string
        path to the folder containing the data files; it is prepended
        directly to each filename, so include a trailing separator
    filelist : list
        filenames of all files to be processed
    targets : list
        strings naming the class/group a file is from; a file is labeled
        with the FIRST target that appears in its filename
    target_col_name : string
        name of the label column added to the output (default 'Target')

    Returns
    -------
    fstats_tot : pandas.DataFrame or None
        all rows from the matched data files, with added columns for the
        class/group label, the source filename, and a per-file video
        number; None if no file matched any target
    """
    fstats_tot = None
    video_num = 0
    for filename in filelist:
        fstats = pd.read_csv(dataset_path + filename, encoding="ISO-8859-1",
                             index_col='Unnamed: 0')

        # Label with the first matching target and stop, so a later
        # target string cannot overwrite the label.
        matched_target = None
        for target in targets:
            if target in filename:
                matched_target = target
                break
        if matched_target is None:
            # Previously unmatched files were concatenated anyway, leaving
            # NaNs in the label columns downstream; skip them explicitly.
            print('Skipping file {}: no target matched'.format(filename))
            continue

        print('Adding file {} size: {}'.format(filename, fstats.shape))
        # Scalar assignment broadcasts to every row; no need to build a
        # full-length Series by hand.
        fstats[target_col_name] = matched_target
        fstats['Filename'] = filename
        fstats['Video Number'] = video_num
        if fstats_tot is None:
            fstats_tot = fstats
        else:
            fstats_tot = pd.concat([fstats_tot, fstats], ignore_index=True)
        video_num += 1
    return fstats_tot
| 45 | + |
def balance_data(df, target, **kwargs):
    """
    Balance spatial data using undersampling. Assumes input will
    be a dataframe and data will be used for categorical classification.

    Every class is randomly undersampled down to the size of the
    smallest class.

    Parameters
    ----------
    df : pandas.DataFrame
        pandas dataframe to be balanced
    target : string
        the name of the target/tag/y-value column to balance data around

    Optional Parameters
    -------------------
    random_state : int : 1
        seed to base random sampling from

    Returns
    -------
    A fully balanced pandas dataframe
    """
    random_state = kwargs.get('random_state', 1)
    df_target = [(name, df[df[target] == name]) for name in df[target].unique()]
    # Fail fast before doing any sampling work.
    assert len(df_target) > 0, 'DataFrame cant be empty'
    print(f"Ratio before data balance " +
          f"({':'.join([str(name) for name, _ in df_target])}) = " +
          f"{':'.join([str(len(group)) for _, group in df_target])}")
    # Hoist the minimum class size out of the loop (it was recomputed per
    # class, with the comprehension variable shadowing the loop index).
    # Sampling n rows directly avoids the float rounding of frac-based
    # sampling while drawing the same rows for the same seed.
    min_count = min(len(group) for _, group in df_target)
    bal_df = [group.sample(n=min_count, random_state=random_state)
              for _, group in df_target]
    print(f"Ratio after balance " +
          f"({':'.join([str(name) for name, _ in df_target])}) = " +
          f"{':'.join([str(len(sampled)) for sampled in bal_df])}")
    return pd.concat(bal_df)
| 85 | + |
| 86 | + |
def bin_data(bal_ecm, resolution=128):
    """
    Takes in a dataframe that has an X and a Y column, and uses
    those columns to generate binx/biny/bins columns based on the
    resolution. This is necessary for eventual cross validation to
    prevent data leakage.

    Note: the input dataframe is modified in place (new columns are
    added and negative X values are clamped to 0).

    Parameters
    ----------
    bal_ecm: pandas.DataFrame
        dataframe to be processed. Dataframe may need to have balanced
        classes - use balance_data function
    resolution: int
        integer representing the size of the bins. Resolution must be a
        factor of 2048 and >= 128; default is 128

    Returns
    -------
    bal_ecm: pandas.DataFrame
        dataframe with new column indicating which bin a given row is in
    """
    assert not 2048 % resolution and resolution >= 128, \
        "resolution needs to be a factor of 2048 and >= 128"
    bins = list(range(0, 2048 + 1, resolution))
    bin_labels = [int(i / resolution) for i in bins][:-1]
    # Clamp negative X to 0 BEFORE cutting. The previous code did
    # ``bal_ecm.loc[bal_ecm['X'] < 0] = 0`` after the cut, which zeroed
    # EVERY column of those rows and corrupted their data and labels.
    bal_ecm.loc[bal_ecm['X'] < 0, 'X'] = 0
    bal_ecm['binx'] = pd.cut(bal_ecm['X'], bins, labels=bin_labels,
                             include_lowest=True)
    # NOTE(review): negative Y values are not clamped — this asymmetry
    # mirrors the original code; confirm Y is always >= 0 upstream.
    bal_ecm['biny'] = pd.cut(bal_ecm['Y'], bins, labels=bin_labels,
                             include_lowest=True)
    # Flatten the 2-D (binx, biny) grid into a single bin id.
    bal_ecm['bins'] = (len(bins) - 1) * bal_ecm['binx'].astype(np.int32) \
        + bal_ecm['biny'].astype(np.int32)
    bal_ecm = bal_ecm[np.isfinite(bal_ecm['bins'])]
    bal_ecm['bins'] = bal_ecm['bins'].astype(int)
    return bal_ecm
| 116 | + |
def split_data(df, target, train_split, test_val_split=1.0, seed=1234):
    """
    Split a binned dataframe into train/test(/validation) sets by bin,
    so rows sharing a spatial bin never land in different sets
    (prevents data leakage across the split).

    Note: the input dataframe is modified in place (an 'encoded_target'
    column is added).

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with a 'bins' column (see bin_data) and the target column
    target : string
        name of the target column to label-encode
    train_split : float
        fraction of the unique bins assigned to the training set
    test_val_split : float
        fraction of the held-out rows assigned to the test set; if 1.0
        (the default) no validation set is produced
    seed : int
        seed for both the bin selection and the test/val split

    Returns
    -------
    result : list of (X, y) tuples
        [(X_train, y_train), (X_test, y_test)] plus (X_val, y_val)
        appended when test_val_split < 1.0
    le : sklearn.preprocessing.LabelEncoder
        fitted encoder mapping target labels to integer codes
    """
    np.random.seed(seed)
    le = preprocessing.LabelEncoder()
    df['encoded_target'] = le.fit_transform(df[target])
    unique_bins = df.bins.unique()
    training_bins = np.random.choice(unique_bins,
                                     int(len(unique_bins) * train_split),
                                     replace=False)
    X_train = df[df.bins.isin(training_bins)]
    X_test_val = df[~df.bins.isin(training_bins)]
    result = []
    if test_val_split == 1.0:
        X_test = X_test_val
    else:
        X_val, X_test = train_test_split(X_test_val,
                                         test_size=test_val_split,
                                         random_state=seed)
        result = [(X_val, X_val['encoded_target'])]
    y_train = X_train['encoded_target']
    y_test = X_test['encoded_target']
    # The previous np.append call flattened the tuples into a 1-D object
    # array, destroying the (X, y) pairing; list concatenation keeps it.
    result = [(X_train, y_train), (X_test, y_test)] + result
    return result, le
0 commit comments