Commit

sources and datasets
DavideNardone committed Feb 13, 2018
1 parent ea07e88 commit 2562b19
Showing 12 changed files with 1,211 additions and 0 deletions.
Binary file added dataset/BIOLOGICAL/ALLAML/ALLAML.mat
Binary file added dataset/BIOLOGICAL/COLON/COLON.mat
Binary file added dataset/BIOLOGICAL/LYMPHOMA/LYMPHOMA.mat
Binary file added dataset/BIOLOGICAL/WISCONSIN/WISCONSIN.mat
429 changes: 429 additions & 0 deletions src/CSFS_SCBA.py

Large diffs are not rendered by default.

65 changes: 65 additions & 0 deletions src/Classifier.py
@@ -0,0 +1,65 @@
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
# add the classifiers you need when using this class



class Classifier:

def __init__(self, names=None, classifiers=None):

self.cv_scores = {}

        # Default classifiers and parameters
        if names is None:

self.names = [
"KNN", "Logistic Regression", "SVM",
"Decision Tree", "Random Forest", "AdaBoost"
]

self.classifiers = [

KNeighborsClassifier(n_neighbors=1),
LogisticRegression(C=1e5),
SVC(kernel="linear"),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10),
AdaBoostClassifier()
]

else:
self.names = names
self.classifiers = classifiers

for name in self.names:
self.cv_scores[name] = []



def train(self, X_train, y_train):

for name, clf in zip(self.names, self.classifiers):

# Training the algorithm using the selected predictors and target.
clf.fit(X_train, y_train)

    def classify(self, X_test, y_test):

        # Collect each classifier's predictions on the test set
        # (y_test is accepted for interface consistency but unused here)
        DTS = {}

        for name, clf in zip(self.names, self.classifiers):

            preds = clf.predict(X_test)
            DTS[name] = preds

        return DTS
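
A minimal usage sketch for the Classifier class above (the toy data and variable names are illustrative only, not part of the commit):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# toy data, purely for illustration
X, y = make_classification(n_samples=100, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf = Classifier()                            # default names and classifiers
clf.train(X_train, y_train)
predictions = clf.classify(X_test, y_test)    # dict: classifier name -> predicted labels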
83 changes: 83 additions & 0 deletions src/Dataset.py
@@ -0,0 +1,83 @@
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import numpy as np
np.set_printoptions(threshold=np.inf)


class Dataset:
def __init__(self, X, y):

self.data = X
self.target = y.flatten()

        # removing any row that contains at least one NaN value,
        # together with the corresponding target value
        mask = ~np.isnan(self.data).any(axis=1)
        self.data = self.data[mask]
        self.target = self.target[mask]

self.num_sample, self.num_features = self.data.shape[0], self.data.shape[1]

# retrieving unique label for Dataset
self.classes = np.unique(self.target)

def standardizeDataset(self):

        # standardize the data to zero mean and unit variance
        if np.sum(np.std(self.data, axis=0)).astype('int32') == self.num_features and \
                np.abs(np.sum(np.mean(self.data, axis=0))) < 1e-7:
print ('\tThe data were already standardized!')
else:
print ('Standardizing data....')
self.data = StandardScaler().fit_transform(self.data)

def normalizeDataset(self, norm):

normalizer = preprocessing.Normalizer(norm=norm)
self.data = normalizer.fit_transform(self.data)

def scalingDataset(self):

min_max_scaler = preprocessing.MinMaxScaler()
self.data = min_max_scaler.fit_transform(self.data)

def shufflingDataset(self):

idx = np.random.permutation(self.data.shape[0])
self.data = self.data[idx]
self.target = self.target[idx]


def split(self, split_ratio=0.8):

# shuffling data
indices = np.random.permutation(self.num_sample)

start = int(split_ratio * self.num_sample)
training_idx, test_idx = indices[:start], indices[start:]
X_train, X_test = self.data[training_idx, :], self.data[test_idx, :]
y_train, y_test = self.target[training_idx], self.target[test_idx]

return X_train, y_train, X_test, y_test, training_idx, test_idx

def separateSampleClass(self):

        # group the sample indices by class
        self.ind_class = []
        for i in range(len(self.classes)):
            self.ind_class.append(np.where(self.target == self.classes[i]))

def getSampleClass(self):

data = []
target = []
        # select the samples of each class on the basis of the previously retrieved indices
        for i in range(len(self.classes)):
            data.append(self.data[self.ind_class[i]])
            target.append(self.target[self.ind_class[i]])

return data, target

def getIndClass(self):

return self.ind_class
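
A short sketch of the intended Dataset workflow, assuming a feature matrix X and a label vector y are already in memory (e.g. loaded via the Loader class below); the random data here is a placeholder:

import numpy as np

X = np.random.rand(60, 200)                  # placeholder data
y = np.random.randint(0, 2, size=(60, 1))    # placeholder labels

ds = Dataset(X, y)
ds.standardizeDataset()
ds.shufflingDataset()
X_train, y_train, X_test, y_test, tr_idx, te_idx = ds.split(split_ratio=0.8)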
64 changes: 64 additions & 0 deletions src/FeatureSelector.py
@@ -0,0 +1,64 @@
import numpy as np
np.set_printoptions(threshold=np.inf)
import sys
sys.path.insert(0, './src')
import SCBA as fs
# the 'GAD' branch below calls gd.GAD(...); it assumes a GAD module is
# importable here (e.g. `import GAD as gd`), which this commit does not include



class FeatureSelector:

def __init__(self, model=None, name=None, tp=None, params=None):


self.name = name
self.model = model
self.tp = tp
self.params = params



def fit(self, X, y):

idx = []

        # add custom 'types' of feature selector here
if self.tp == 'filter':

if self.name == 'Relief':
'''
add a custom Feature Selector such as:
score = reliefF.reliefF(X, y)
idx = reliefF.feature_ranking(score)
'''

elif self.tp == 'SLB':


# SCBA method
if self.name == 'GAD':

alg = gd.GAD(X, self.params)
_, idx = alg.iterative_GAD()

if self.name == 'SCBA':
scba = fs.SCBA(data=X, alpha=self.params['alpha'], norm_type=self.params['norm_type'],
verbose=self.params['verbose'], thr=self.params['thr'], max_iter=self.params['max_iter'],
affine=self.params['affine'],
normalize=self.params['normalize'],
step=self.params['step'],
PCA=self.params['PCA'],
GPU=self.params['GPU'],
                               device=self.params['device'])

nrmInd, sInd, repInd, _ = scba.admm()
if self.params['type_indices'] == 'nrmInd':
idx = nrmInd
elif self.params['type_indices'] == 'repInd':
idx = repInd
else:
idx = sInd

return idx
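
A sketch of how the FeatureSelector might be driven through its SCBA branch; every parameter value below is a placeholder assumption, not a default taken from this commit:

# hypothetical parameter set for the SCBA branch above
params = {
    'alpha': 5, 'norm_type': 1, 'verbose': False, 'thr': 1e-7,
    'max_iter': 5000, 'affine': False, 'normalize': False,
    'step': 1, 'PCA': False, 'GPU': False, 'device': 0,
    'type_indices': 'sInd'
}

selector = FeatureSelector(name='SCBA', tp='SLB', params=params)
idx = selector.fit(X, y)       # indices of the selected features
X_selected = X[:, idx]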
51 changes: 51 additions & 0 deletions src/Loader.py
@@ -0,0 +1,51 @@
import hdf5storage #dependency
import numpy as np

np.set_printoptions(threshold=np.inf)
import scipy.io as sio

class Loader:
def __init__(self, file_path, name, variables, format, k_fold=None):


        # This class provides several methods for loading many types of dataset (MATLAB, CSV, TXT, etc.)

if format == 'matlab': # classic workspace

mc = sio.loadmat(file_path)

for variable in variables:
setattr(self, variable, mc[variable])

elif format == 'matlab_struct': # struct one level
print ('Loading data...')

mc = sio.loadmat(file_path)
mc = mc[name][0, 0]

for variable in variables:
setattr(self, variable, mc[variable])

elif format == 'custom_matlab':
print ('Loading data...')

mc = sio.loadmat(file_path)
mc = mc[name][0, 0]

for variable in variables:
setattr(self, variable, mc[variable][0, 0])

        elif format == 'matlab_v73':  # v7.3 MAT-files are HDF5-based and need hdf5storage
mc = hdf5storage.loadmat(file_path)

for variable in variables:
setattr(self, variable, mc[variable])

def getVariables(self, variables):

D = {}

for variable in variables:
D[variable] = getattr(self, variable)

return D
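
A usage sketch against one of the .mat files added in this commit; the variable names stored inside the file ('X', 'Y') are an assumption:

loader = Loader(file_path='dataset/BIOLOGICAL/COLON/COLON.mat',
                name='COLON', variables=['X', 'Y'], format='matlab')
D = loader.getVariables(['X', 'Y'])
X, y = D['X'], D['Y']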