|
| 1 | +import sys |
| 2 | +import pandas as pd |
| 3 | +import numpy as np |
| 4 | +from sklearn.model_selection import train_test_split |
| 5 | +from sklearn import preprocessing |
| 6 | + |
def generate_fullstats(dataset_path, filelist, targets, target_col_name='Target'):
    """
    Generate a single DataFrame of statistics from a list of csv files.

    Parameters
    ----------
    dataset_path : string
        path to the folder containing the data files; it is prepended
        directly to each filename, so include a trailing separator
    filelist : list
        filenames of all files to be processed
    targets : list
        strings naming the class/group a file is from; a file is labeled
        with the FIRST target that appears in its filename
    target_col_name : string
        name of the label column added to the output (default 'Target')

    Returns
    -------
    fstats_tot : pandas.DataFrame or None
        all rows from the matched data files, with added columns for the
        class/group label, the source filename, and a per-file video
        number; None if no file matched any target
    """
    fstats_tot = None
    video_num = 0
    for filename in filelist:
        fstats = pd.read_csv(dataset_path + filename, encoding="ISO-8859-1",
                             index_col='Unnamed: 0')

        # Label with the first matching target and stop, so a later
        # target string cannot overwrite the label.
        matched_target = None
        for target in targets:
            if target in filename:
                matched_target = target
                break
        if matched_target is None:
            # Previously unmatched files were concatenated anyway, leaving
            # NaNs in the label columns downstream; skip them explicitly.
            print('Skipping file {}: no target matched'.format(filename))
            continue

        print('Adding file {} size: {}'.format(filename, fstats.shape))
        # Scalar assignment broadcasts to every row; no need to build a
        # full-length Series by hand.
        fstats[target_col_name] = matched_target
        fstats['Filename'] = filename
        fstats['Video Number'] = video_num
        if fstats_tot is None:
            fstats_tot = fstats
        else:
            fstats_tot = pd.concat([fstats_tot, fstats], ignore_index=True)
        video_num += 1
    return fstats_tot
| 45 | + |
def balance_data(df, target, **kwargs):
    """
    Balance spatial data using undersampling. Assumes input will
    be a dataframe and data will be used for categorical classification.

    Every class is randomly undersampled down to the size of the
    smallest class.

    Parameters
    ----------
    df : pandas.DataFrame
        pandas dataframe to be balanced
    target : string
        the name of the target/tag/y-value column to balance data around

    Optional Parameters
    -------------------
    random_state : int : 1
        seed to base random sampling from

    Returns
    -------
    A fully balanced pandas dataframe
    """
    random_state = kwargs.get('random_state', 1)
    df_target = [(name, df[df[target] == name]) for name in df[target].unique()]
    # Fail fast before doing any sampling work.
    assert len(df_target) > 0, 'DataFrame cant be empty'
    print(f"Ratio before data balance " +
          f"({':'.join([str(name) for name, _ in df_target])}) = " +
          f"{':'.join([str(len(group)) for _, group in df_target])}")
    # Hoist the minimum class size out of the loop (it was recomputed per
    # class, with the comprehension variable shadowing the loop index).
    # Sampling n rows directly avoids the float rounding of frac-based
    # sampling while drawing the same rows for the same seed.
    min_count = min(len(group) for _, group in df_target)
    bal_df = [group.sample(n=min_count, random_state=random_state)
              for _, group in df_target]
    print(f"Ratio after balance " +
          f"({':'.join([str(name) for name, _ in df_target])}) = " +
          f"{':'.join([str(len(sampled)) for sampled in bal_df])}")
    return pd.concat(bal_df)
| 85 | + |
| 86 | + |
def bin_data(bal_ecm, resolution=128):
    """
    Takes in a dataframe that has an X and a Y column, and uses
    those columns to generate binx/biny/bins columns based on the
    resolution. This is necessary for eventual cross validation to
    prevent data leakage.

    Note: the input dataframe is modified in place (new columns are
    added and negative X values are clamped to 0).

    Parameters
    ----------
    bal_ecm: pandas.DataFrame
        dataframe to be processed. Dataframe may need to have balanced
        classes - use balance_data function
    resolution: int
        integer representing the size of the bins. Resolution must be a
        factor of 2048 and >= 128; default is 128

    Returns
    -------
    bal_ecm: pandas.DataFrame
        dataframe with new column indicating which bin a given row is in
    """
    assert not 2048 % resolution and resolution >= 128, \
        "resolution needs to be a factor of 2048 and >= 128"
    bins = list(range(0, 2048 + 1, resolution))
    bin_labels = [int(i / resolution) for i in bins][:-1]
    # Clamp negative X to 0 BEFORE cutting. The previous code did
    # ``bal_ecm.loc[bal_ecm['X'] < 0] = 0`` after the cut, which zeroed
    # EVERY column of those rows and corrupted their data and labels.
    bal_ecm.loc[bal_ecm['X'] < 0, 'X'] = 0
    bal_ecm['binx'] = pd.cut(bal_ecm['X'], bins, labels=bin_labels,
                             include_lowest=True)
    # NOTE(review): negative Y values are not clamped — this asymmetry
    # mirrors the original code; confirm Y is always >= 0 upstream.
    bal_ecm['biny'] = pd.cut(bal_ecm['Y'], bins, labels=bin_labels,
                             include_lowest=True)
    # Flatten the 2-D (binx, biny) grid into a single bin id.
    bal_ecm['bins'] = (len(bins) - 1) * bal_ecm['binx'].astype(np.int32) \
        + bal_ecm['biny'].astype(np.int32)
    bal_ecm = bal_ecm[np.isfinite(bal_ecm['bins'])]
    bal_ecm['bins'] = bal_ecm['bins'].astype(int)
    return bal_ecm
| 116 | + |
def split_data(df, target, train_split, test_val_split=1.0, seed=1234):
    """
    Split a binned dataframe into train/test(/validation) sets by bin,
    so rows sharing a spatial bin never land in different sets
    (prevents data leakage across the split).

    Note: the input dataframe is modified in place (an 'encoded_target'
    column is added).

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with a 'bins' column (see bin_data) and the target column
    target : string
        name of the target column to label-encode
    train_split : float
        fraction of the unique bins assigned to the training set
    test_val_split : float
        fraction of the held-out rows assigned to the test set; if 1.0
        (the default) no validation set is produced
    seed : int
        seed for both the bin selection and the test/val split

    Returns
    -------
    result : list of (X, y) tuples
        [(X_train, y_train), (X_test, y_test)] plus (X_val, y_val)
        appended when test_val_split < 1.0
    le : sklearn.preprocessing.LabelEncoder
        fitted encoder mapping target labels to integer codes
    """
    np.random.seed(seed)
    le = preprocessing.LabelEncoder()
    df['encoded_target'] = le.fit_transform(df[target])
    unique_bins = df.bins.unique()
    training_bins = np.random.choice(unique_bins,
                                     int(len(unique_bins) * train_split),
                                     replace=False)
    X_train = df[df.bins.isin(training_bins)]
    X_test_val = df[~df.bins.isin(training_bins)]
    result = []
    if test_val_split == 1.0:
        X_test = X_test_val
    else:
        X_val, X_test = train_test_split(X_test_val,
                                         test_size=test_val_split,
                                         random_state=seed)
        result = [(X_val, X_val['encoded_target'])]
    y_train = X_train['encoded_target']
    y_test = X_test['encoded_target']
    # The previous np.append call flattened the tuples into a 1-D object
    # array, destroying the (X, y) pairing; list concatenation keeps it.
    result = [(X_train, y_train), (X_test, y_test)] + result
    return result, le
0 commit comments