Skip to content

Commit 108a729

Browse files
committed
merged
2 parents 5ab7c2b + 11a6e19 commit 108a729

File tree

11 files changed

+1972
-1
lines changed

11 files changed

+1972
-1
lines changed

build/lib/diff_predictor/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
import sys
2+
import pandas as pd
3+
import numpy as np
4+
from sklearn.model_selection import train_test_split
5+
from sklearn import preprocessing
6+
7+
def generate_fullstats(dataset_path, filelist, targets, target_col_name='Target'):
    """
    Generate a single dataframe of all statistics from a list of csv files.

    Parameters
    ----------
    dataset_path : string
        path prefix of the folder containing the data files. It is
        concatenated directly with each filename, so it must end with a
        path separator.
    filelist : list
        list containing filenames of all files to be processed
    targets : list
        list containing strings that state which class/group a file is from.
        A file is used only when one of these strings is in its filename.
        When several strings match, the first one in the list wins, so list
        the most specific targets first.
    target_col_name : string
        name of the column that receives the matching target string

    Returns
    -------
    fstats_tot : pandas.DataFrame or None
        dataframe containing all rows from the matching data files, with
        new columns for the class/group (``target_col_name``), the source
        ``'Filename'`` and a running ``'Video Number'``. ``None`` when no
        file matches any target.
    """
    frames = []
    for filename in filelist:
        # Take the first matching target only: labelling the same frame once
        # per match would duplicate its rows under conflicting labels.
        target = next((t for t in targets if t in filename), None)
        if target is None:
            # Files that belong to no class are skipped without being read.
            continue
        fstats = pd.read_csv(dataset_path + filename, encoding="ISO-8859-1",
                             index_col='Unnamed: 0')
        print('Adding file {} size: {}'.format(filename, fstats.shape))
        fstats[target_col_name] = target
        fstats['Filename'] = filename
        # Video numbers count only the files that were actually added.
        fstats['Video Number'] = len(frames)
        frames.append(fstats)

    if not frames:
        return None
    if len(frames) == 1:
        # A single file is returned as read, keeping its original index.
        return frames[0]
    # One concat at the end instead of re-copying the total on every file.
    return pd.concat(frames, ignore_index=True)
45+
46+
def balance_data(df, target, **kwargs):
    """
    Balance data using undersampling. Assumes input will be a dataframe
    and data will be used for categorical classification.

    Every class found in ``df[target]`` is randomly down-sampled (without
    replacement) to the size of the smallest class. Rows whose target is
    missing (NaN) belong to no class and are left out.

    Parameters
    ----------
    df : pandas.DataFrame
        pandas dataframe to be balanced
    target : string
        the name of the target/tag/y-value column to balance data around

    Optional Parameters
    -------------------
    random_state : int : 1
        seed to base random sampling from

    Returns
    -------
    A fully balanced pandas dataframe

    Raises
    ------
    AssertionError
        if ``df`` contains no class to balance (e.g. it is empty)
    """
    random_state = kwargs.get('random_state', 1)

    groups = [(name, df[df[target] == name])
              for name in df[target].dropna().unique()]
    names = ':'.join(str(name) for name, _ in groups)
    sizes = ':'.join(str(len(group)) for _, group in groups)
    print(f"Ratio before data balance ({names}) = {sizes}")

    # The smallest class size is the same for every class, so compute it once
    # and sample that many rows directly instead of going through a ratio.
    smallest = min((len(group) for _, group in groups), default=0)
    bal_df = [group.sample(n=smallest, random_state=random_state)
              for _, group in groups]
    sizes = ':'.join(str(len(group)) for group in bal_df)
    print(f"Ratio after balance ({names}) = {sizes}")

    # Raised explicitly (rather than asserted) so the check survives `-O`;
    # the exception type is kept for existing callers.
    if not bal_df:
        raise AssertionError('DataFrame cant be empty')
    return pd.concat(bal_df)
85+
86+
87+
def bin_data(bal_ecm, resolution=128):
    """
    Takes in a dataframe that has an X and a Y column, and uses those
    columns to generate binx, biny and bins columns based on the resolution.
    This is necessary for eventual cross validation to prevent data leakage.

    The 0..2048 coordinate range is cut into ``2048 / resolution`` intervals
    per axis. Negative X values are placed in the first X bin; rows whose X
    or Y cannot be binned otherwise (missing, Y < 0, or > 2048) are dropped.
    The input dataframe itself is not modified.

    Parameters
    ----------
    bal_ecm : pandas.DataFrame
        dataframe to be processed. Dataframe may need to have balanced
        classes - use balance_data function
    resolution : int
        integer representing the size of the bins. Resolution must be a
        factor of 2048 and >= 128
        default is 128

    Returns
    -------
    bal_ecm : pandas.DataFrame
        copy of the binnable rows with new columns ``binx`` and ``biny``
        (per-axis bin labels) and ``bins`` (integer id of the 2-D bin,
        ``n_bins * binx + biny``)

    Raises
    ------
    AssertionError
        if resolution is not a factor of 2048 or is smaller than 128
    """
    # Explicit raise so the check is not stripped under `-O`; the size test
    # comes first so a zero resolution never reaches the modulo.
    if resolution < 128 or 2048 % resolution:
        raise AssertionError("resolution needs to be a factor of 2048 and >= 128")

    edges = list(range(0, 2048 + 1, resolution))
    n_bins = len(edges) - 1
    bin_labels = list(range(n_bins))

    binned = bal_ecm.copy()
    # Only the value used for binning is clamped; the row's data is untouched.
    binned['binx'] = pd.cut(binned['X'].clip(lower=0), edges,
                            labels=bin_labels, include_lowest=True)
    binned['biny'] = pd.cut(binned['Y'], edges,
                            labels=bin_labels, include_lowest=True)

    # Drop unbinnable rows before the integer conversion, which cannot
    # represent a missing bin.
    in_range = binned['binx'].notna() & binned['biny'].notna()
    binned = binned[in_range].copy()
    binned['bins'] = (n_bins * binned['binx'].astype(int)
                      + binned['biny'].astype(int))
    return binned
116+
117+
def split_data(df, target, train_split, test_val_split=1.0, seed=1234):
    """
    Split a binned dataframe into train/test(/validation) sets by bin.

    Whole bins are assigned to the training set, so rows that share a bin
    never end up on both sides of the train/test boundary. The remaining
    rows form the test set, or are split further into validation and test
    sets with ``train_test_split``.

    Side effects: seeds numpy's global random generator with ``seed`` and
    adds an ``'encoded_target'`` column to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with a ``bins`` column (see bin_data) and a ``target``
        column
    target : string
        name of the column holding the class labels to encode
    train_split : float
        fraction of the unique bins assigned to the training set
    test_val_split : float
        passed as ``test_size`` when splitting the non-training rows into
        validation and test sets. With the default of 1.0 no validation
        set is produced.
    seed : int
        seed for the bin selection and for the validation/test split

    Returns
    -------
    result : numpy.ndarray
        1-D object array laid out as
        ``[X_train, y_train, X_test, y_test]``, followed by
        ``[X_val, y_val]`` when a validation set was requested. The X
        entries are dataframes (still containing all columns), the y
        entries are the matching ``'encoded_target'`` series.
    le : sklearn.preprocessing.LabelEncoder
        the encoder fitted on ``df[target]``
    """
    np.random.seed(seed)
    le = preprocessing.LabelEncoder()
    df['encoded_target'] = le.fit_transform(df[target])

    unique_bins = df.bins.unique()
    training_bins = np.random.choice(unique_bins,
                                     int(len(unique_bins) * train_split),
                                     replace=False)
    in_training = df.bins.isin(training_bins)
    X_train = df[in_training]
    X_test_val = df[~in_training]

    if test_val_split == 1.0:
        X_val = None
        X_test = X_test_val
    else:
        X_val, X_test = train_test_split(X_test_val,
                                         test_size=test_val_split,
                                         random_state=seed)

    parts = [X_train, X_train['encoded_target'],
             X_test, X_test['encoded_target']]
    if X_val is not None:
        parts.extend([X_val, X_val['encoded_target']])

    # Fill a pre-allocated object array element by element: letting numpy
    # infer an array from nested dataframes/series is ragged-array creation,
    # which newer numpy releases reject.
    # NOTE(review): the flat layout mirrors what np.append produced on older
    # numpy; confirm callers unpack it this way.
    result = np.empty(len(parts), dtype=object)
    for position, part in enumerate(parts):
        result[position] = part
    return result, le

build/lib/diff_predictor/dataio.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import os
2+
import sys
3+
from os import listdir
4+
from os.path import isfile, join
5+
import numpy as np
6+
import csv
7+
import boto3
8+
import pandas as pd
9+
import diff_classifier.aws as aws
10+
from itertools import cycle
11+
12+
13+
if 'core' not in sys.modules:
14+
import core
15+
16+
17+
def load_data(folder, filenames=None, **kwargs):
    """
    Load data either through the system or through aws S3.

    Each file is read as a csv whose first (unnamed) column is the index,
    and all files are concatenated into one dataframe. Files that cannot be
    read are reported and skipped.

    Parameters
    ----------
    folder : string :
        desired folder to import files from
    filenames : list of strings :
        desired files to import

    Optional Parameters
    -------------------
    download_list_file : string or list of strings :
        if using a textfile containing multiple filenames (one per line),
        use this to designate location of this file within the folder. When
        a list is given its first element is used. Replaces ``filenames``.
        ex: folder/download_file_names.txt
    tag : list of strings :
        if tagging a dataframe file with a variable, use this to tag each
        file (written to a 'Tag' column). Will cycle through list if list
        reaches end and there are still files in the filenames list. A tag
        is consumed for every filename, including skipped ones.
    bucket_name : string :
        if using aws S3, declare this variable as an S3 bucket to look
        through. This will trigger the function so that folder is the folder
        in the bucket and filenames are the filenames to download in the
        bucket

    Returns
    -------
    data : pandas.DataFrame
        concatenation of all files that could be read; empty if none

    Raises
    ------
    AssertionError
        if ``download_list_file`` does not exist or is not readable
    """
    filenames = [] if filenames is None else filenames

    if 'download_list_file' in kwargs:
        list_file = kwargs['download_list_file']
        # Indexing a plain string would pick its first character.
        if not isinstance(list_file, str):
            list_file = list_file[0]
        list_path = os.path.join(folder, list_file)
        if not (os.path.isfile(list_path) and os.access(list_path, os.R_OK)):
            raise AssertionError(
                f'{list_path} does not exhist or can not be read')
        try:
            with open(list_path, 'r') as f:
                filenames = f.read().splitlines()
        except IOError as err:
            print(f"Could not read {list_path}: {err}")

    tag = cycle(kwargs['tag']) if 'tag' in kwargs else None

    bucket = None
    if 'bucket_name' in kwargs:
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(kwargs['bucket_name'])

    frames = []
    for filename in filenames:
        # The default covers an empty tag list, which has nothing to cycle.
        file_tag = next(tag, None) if tag is not None else None
        file_path = os.path.join(folder, filename)
        print(file_path)
        try:
            if bucket is not None:
                # NOTE(review): a Bucket resource is passed as `bucket_name`;
                # confirm aws.download_s3 accepts that rather than the name.
                aws.download_s3(file_path, filename, bucket_name=bucket)
                # The S3 object is downloaded to `filename` locally.
                local_path = filename
            else:
                local_path = file_path
            file_data = pd.read_csv(local_path, encoding="ISO-8859-1",
                                    index_col='Unnamed: 0')
        except IOError as err:
            print(f'Skipped!: {filename}: {err}')
            continue
        # `is not None` so that falsy tags such as 0 are still applied.
        if file_tag is not None:
            file_data['Tag'] = file_tag
        frames.append(file_data)

    # Concatenate once rather than re-copying the total for every file.
    return pd.concat(frames) if frames else pd.DataFrame()
93+
94+
95+
def get_files(path, keywords=("features_ OR msd_",)):
    """
    Takes in a path and list of keywords. Returns a list of filenames
    that are within the path and match every keyword in the list.
    Set keyword to "" to get all files in the path.

    Parameters
    ----------
    path : string
        file path
    keywords : string or [string] : ["features_ OR msd_"]
        keywords to look for in the filenames. A filename must satisfy
        every entry of the list. An entry may offer alternatives separated
        by " OR " (surrounded by spaces), in which case any one of the
        alternatives is enough.

    Returns
    -------
    file_list : list
        list of matching files (not directories) in the path, in the order
        the directory listing returns them
    """
    # A bare string is one keyword, not a sequence of characters.
    if isinstance(keywords, str):
        keywords = [keywords]
    # Split on the spaced separator only, so keywords that merely contain
    # the letters "OR" are left intact.
    alternatives = [[option.strip() for option in keyword.split(' OR ')]
                    for keyword in keywords]
    return [
        filename for filename in listdir(path)
        if isfile(join(path, filename))
        and all(any(option in filename for option in options)
                for options in alternatives)
    ]
122+
123+
124+
def combine_track(trackFile, feature="type"):
    """
    Adds a feature from the produced features file to the track file.

    The features file is located with find_pair. The value of feature row
    ``i`` is written to row ``maxFrames * (i + 1) + i`` of the track data,
    for ``i`` from 0 to the largest Track_ID; every other row of the new
    column is NaN.
    NOTE(review): this row position presumably is the last row of each
    track when every track has ``maxFrames + 1`` rows - confirm against the
    track file layout.

    Parameters
    ----------
    trackFile : string
        path of the track csv; must have "Frame" and "Track_ID" columns
    feature : string : "type"
        name of the column to copy from the features file

    Returns
    -------
    trackDF : pandas.DataFrame
        the track data with the new ``feature`` column

    Raises
    ------
    FileNotFoundError
        if the track file or its features file pair does not exist
    KeyError
        if ``feature`` is not a column of the features file
    """
    trackDF = pd.read_csv(trackFile)
    featureDF = find_pair(trackFile)
    if featureDF is None:
        # find_pair reports a missing pair by returning None.
        raise FileNotFoundError(
            f"Feature file pair for {trackFile} could not be found")

    maxFrames = int(trackDF["Frame"].max())
    maxTracks = int(trackDF["Track_ID"].max())

    # Build the column on the side and assign it whole: writing through
    # trackDF[feature].iloc[...] is chained assignment and may not reach
    # the dataframe.
    values = np.full(len(trackDF), np.nan, dtype=object)
    for i in range(maxTracks + 1):
        values[maxFrames * (i + 1) + i] = featureDF[feature].iloc[i]
    # infer_objects gives numeric features a float column again.
    trackDF[feature] = pd.Series(values, index=trackDF.index).infer_objects()
    return trackDF
138+
139+
140+
def find_pair(filename):
    '''
    Tries to find the feature file pair for either an msd_ or a Traj_ file
    and returns the pd.DataFrame of that pair if found.

    The pair is looked up in the same folder as ``filename``, under the
    same file name with "msd_" and "Traj_" removed and "features_"
    prepended.

    Parameters
    ----------
    filename : string
        path of the msd_ or Traj_ file

    Returns
    -------
    pandas.DataFrame or None
        contents of the features file, or None (after printing a message)
        when that file does not exist
    '''
    # Rewrite the file name only, so that folders whose names contain
    # "msd_" or "Traj_" are left as they are.
    folder, name = os.path.split(filename)
    name = name.replace("msd_", "").replace("Traj_", "")
    featureFile = os.path.join(folder, "features_" + name)
    try:
        return pd.read_csv(featureFile)
    except FileNotFoundError:
        print("File pair could not be found")
        return None

build/lib/diff_predictor/eval.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import sys
2+
import numpy
3+
import scipy.stats
4+
from seaborn import heatmap
5+
6+
7+
def perf_meas(y_actual, y_pred, cls, verbose=True):
    '''
    Shows the performance measurements of resulting prediction.
    Performance measures include true positive, true negative,
    false positive, false negative, counted for one class against all
    other classes.

    Parameters
    ----------
    y_actual : list
        Actual values of y
    y_pred : list
        Predicted values of y
    cls : int
        class to run performance measure on
    verbose : boolean : True
        report performance as a string

    Returns
    -------
    tuple of four performance values (TP, FP, TN, FN)

    Raises
    ------
    AssertionError
        if y_actual and y_pred differ in length
    '''
    if len(y_actual) != len(y_pred):
        raise AssertionError(
            'Must be same number of actual and predicted values')

    TP = FP = TN = FN = 0
    # zip pairs values by position, which also works for pandas Series
    # whose index does not start at 0.
    for actual, pred in zip(y_actual, y_pred):
        if pred == cls:
            if actual == cls:
                TP += 1
            else:
                FP += 1
        elif actual == cls:
            FN += 1
        else:
            # Neither value is `cls`: a true negative for this class even
            # when the two other classes were confused with each other.
            TN += 1
    if verbose:
        print(f'(TP, FP, TN, FN) = {(TP, FP, TN, FN)}')
    return (TP, FP, TN, FN)
45+
46+
47+
def corrmat(df, method='pearson', show_plot=True, **kwargs):
    '''
    Computes the correlation matrix of a dataframe and optionally plots it
    as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe whose columns are correlated with each other
    method : string : 'pearson'
        correlation type; one of 'pearson', 'spearman' or 'kendall'
    show_plot : boolean : True
        if True the matrix is drawn with heatmap and the result of that
        call is returned instead of the matrix
    **kwargs
        extra options passed to heatmap; they override the defaults
        ``annot=True`` and ``fmt="f"``

    Returns
    -------
    The return value of heatmap when show_plot is True, otherwise the
    correlation matrix as a pandas.DataFrame

    Raises
    ------
    ValueError
        if method is not one of the supported correlation types
    '''
    if method not in ('pearson', 'spearman', 'kendall'):
        raise ValueError("Correlation type not available. Select "
                         "from pearson, spearman, or kendall corr.")

    plot_options = {'annot': True,
                    'fmt': "f",
                    }
    plot_options.update(kwargs)

    # Compute only the requested correlation.
    corr_mat = df.corr(method=method)
    if show_plot:
        return heatmap(corr_mat, **plot_options)
    return corr_mat

0 commit comments

Comments
 (0)