Commit

sources and datasets
DavideNardone committed Feb 13, 2018
1 parent ea07e88 commit 2562b19
Showing 12 changed files with 1,211 additions and 0 deletions.
Binary file added dataset/BIOLOGICAL/ALLAML/ALLAML.mat
Binary file added dataset/BIOLOGICAL/COLON/COLON.mat
Binary file added dataset/BIOLOGICAL/LYMPHOMA/LYMPHOMA.mat
Binary file added dataset/BIOLOGICAL/WISCONSIN/WISCONSIN.mat
429 changes: 429 additions & 0 deletions src/CSFS_SCBA.py

Large diffs are not rendered by default.

65 changes: 65 additions & 0 deletions src/Classifier.py
@@ -0,0 +1,65 @@
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
# add the classifiers you need when using this class



class Classifier:

def __init__(self, names=None, classifiers=None):

self.cv_scores = {}

        # Default classifiers and parameters
        if names is None:

self.names = [
"KNN", "Logistic Regression", "SVM",
"Decision Tree", "Random Forest", "AdaBoost"
]

self.classifiers = [

KNeighborsClassifier(n_neighbors=1),
LogisticRegression(C=1e5),
SVC(kernel="linear"),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10),
AdaBoostClassifier()
]

else:
self.names = names
self.classifiers = classifiers

for name in self.names:
self.cv_scores[name] = []



def train(self, X_train, y_train):

for name, clf in zip(self.names, self.classifiers):

# Training the algorithm using the selected predictors and target.
clf.fit(X_train, y_train)

    def classify(self, X_test, y_test):

        # Collect each classifier's predictions on the test set
        # (y_test is accepted for interface consistency but unused here)
        DTS = {}

        for name, clf in zip(self.names, self.classifiers):

            preds = clf.predict(X_test)
            DTS[name] = preds

        return DTS
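
A minimal usage sketch for the Classifier class above (the toy data and variable names are illustrative only, not part of the commit):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# toy data, purely for illustration
X, y = make_classification(n_samples=100, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf = Classifier()                            # default names and classifiers
clf.train(X_train, y_train)
predictions = clf.classify(X_test, y_test)    # dict: classifier name -> predicted labels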
83 changes: 83 additions & 0 deletions src/Dataset.py
@@ -0,0 +1,83 @@
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import numpy as np
np.set_printoptions(threshold=np.inf)


class Dataset:
def __init__(self, X, y):

self.data = X
self.target = y.flatten()

        # removing any row that contains at least one NaN value,
        # together with the corresponding target value
        mask = ~np.isnan(self.data).any(axis=1)
        self.data = self.data[mask]
        self.target = self.target[mask]

self.num_sample, self.num_features = self.data.shape[0], self.data.shape[1]

# retrieving unique label for Dataset
self.classes = np.unique(self.target)

def standardizeDataset(self):

        # standardize the data to zero mean and unit variance
        if np.sum(np.std(self.data, axis=0)).astype('int32') == self.num_features and \
                np.abs(np.sum(np.mean(self.data, axis=0))) < 1e-7:
print ('\tThe data were already standardized!')
else:
print ('Standardizing data....')
self.data = StandardScaler().fit_transform(self.data)

def normalizeDataset(self, norm):

normalizer = preprocessing.Normalizer(norm=norm)
self.data = normalizer.fit_transform(self.data)

def scalingDataset(self):

min_max_scaler = preprocessing.MinMaxScaler()
self.data = min_max_scaler.fit_transform(self.data)

def shufflingDataset(self):

idx = np.random.permutation(self.data.shape[0])
self.data = self.data[idx]
self.target = self.target[idx]


def split(self, split_ratio=0.8):

# shuffling data
indices = np.random.permutation(self.num_sample)

start = int(split_ratio * self.num_sample)
training_idx, test_idx = indices[:start], indices[start:]
X_train, X_test = self.data[training_idx, :], self.data[test_idx, :]
y_train, y_test = self.target[training_idx], self.target[test_idx]

return X_train, y_train, X_test, y_test, training_idx, test_idx

def separateSampleClass(self):

        # group the sample indices by class
        self.ind_class = []
        for i in range(len(self.classes)):
            self.ind_class.append(np.where(self.target == self.classes[i]))

def getSampleClass(self):

data = []
target = []
        # select the samples of each class on the basis of the previously retrieved indices
        for i in range(len(self.classes)):
            data.append(self.data[self.ind_class[i]])
            target.append(self.target[self.ind_class[i]])

return data, target

def getIndClass(self):

return self.ind_class
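
A short sketch of the intended Dataset workflow, assuming a feature matrix X and a label vector y are already in memory (e.g. loaded via the Loader class below); the random data here is a placeholder:

import numpy as np

X = np.random.rand(60, 200)                  # placeholder data
y = np.random.randint(0, 2, size=(60, 1))    # placeholder labels

ds = Dataset(X, y)
ds.standardizeDataset()
ds.shufflingDataset()
X_train, y_train, X_test, y_test, tr_idx, te_idx = ds.split(split_ratio=0.8)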
64 changes: 64 additions & 0 deletions src/FeatureSelector.py
@@ -0,0 +1,64 @@
import numpy as np
np.set_printoptions(threshold=np.inf)
import sys
sys.path.insert(0, './src')
import SCBA as fs
# the 'GAD' branch below calls gd.GAD(...); it assumes a GAD module is
# importable here (e.g. `import GAD as gd`), which this commit does not include



class FeatureSelector:

def __init__(self, model=None, name=None, tp=None, params=None):


self.name = name
self.model = model
self.tp = tp
self.params = params



def fit(self, X, y):

idx = []

        # add custom 'types' of feature selector here
if self.tp == 'filter':

if self.name == 'Relief':
'''
add a custom Feature Selector such as:
score = reliefF.reliefF(X, y)
idx = reliefF.feature_ranking(score)
'''

elif self.tp == 'SLB':


# SCBA method
if self.name == 'GAD':

alg = gd.GAD(X, self.params)
_, idx = alg.iterative_GAD()

if self.name == 'SCBA':
scba = fs.SCBA(data=X, alpha=self.params['alpha'], norm_type=self.params['norm_type'],
verbose=self.params['verbose'], thr=self.params['thr'], max_iter=self.params['max_iter'],
affine=self.params['affine'],
normalize=self.params['normalize'],
step=self.params['step'],
PCA=self.params['PCA'],
GPU=self.params['GPU'],
                               device=self.params['device'])

nrmInd, sInd, repInd, _ = scba.admm()
if self.params['type_indices'] == 'nrmInd':
idx = nrmInd
elif self.params['type_indices'] == 'repInd':
idx = repInd
else:
idx = sInd

return idx
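
A sketch of how the FeatureSelector might be driven through its SCBA branch; every parameter value below is a placeholder assumption, not a default taken from this commit:

# hypothetical parameter set for the SCBA branch above
params = {
    'alpha': 5, 'norm_type': 1, 'verbose': False, 'thr': 1e-7,
    'max_iter': 5000, 'affine': False, 'normalize': False,
    'step': 1, 'PCA': False, 'GPU': False, 'device': 0,
    'type_indices': 'sInd'
}

selector = FeatureSelector(name='SCBA', tp='SLB', params=params)
idx = selector.fit(X, y)       # indices of the selected features
X_selected = X[:, idx]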
51 changes: 51 additions & 0 deletions src/Loader.py
@@ -0,0 +1,51 @@
import hdf5storage #dependency
import numpy as np

np.set_printoptions(threshold=np.inf)
import scipy.io as sio

class Loader:
def __init__(self, file_path, name, variables, format, k_fold=None):


        # This class provides several methods for loading many types of dataset (MATLAB, CSV, TXT, etc.)

if format == 'matlab': # classic workspace

mc = sio.loadmat(file_path)

for variable in variables:
setattr(self, variable, mc[variable])

elif format == 'matlab_struct': # struct one level
print ('Loading data...')

mc = sio.loadmat(file_path)
mc = mc[name][0, 0]

for variable in variables:
setattr(self, variable, mc[variable])

elif format == 'custom_matlab':
print ('Loading data...')

mc = sio.loadmat(file_path)
mc = mc[name][0, 0]

for variable in variables:
setattr(self, variable, mc[variable][0, 0])

        elif format == 'matlab_v73':  # v7.3 MAT-files are HDF5-based and need hdf5storage
mc = hdf5storage.loadmat(file_path)

for variable in variables:
setattr(self, variable, mc[variable])

def getVariables(self, variables):

D = {}

for variable in variables:
D[variable] = getattr(self, variable)

return D
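
A usage sketch against one of the .mat files added in this commit; the variable names stored inside the file ('X', 'Y') are an assumption:

loader = Loader(file_path='dataset/BIOLOGICAL/COLON/COLON.mat',
                name='COLON', variables=['X', 'Y'], format='matlab')
D = loader.getVariables(['X', 'Y'])
X, y = D['X'], D['Y']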