diff --git a/dataset/BIOLOGICAL/ALLAML/ALLAML.mat b/dataset/BIOLOGICAL/ALLAML/ALLAML.mat new file mode 100644 index 0000000..b592b0a Binary files /dev/null and b/dataset/BIOLOGICAL/ALLAML/ALLAML.mat differ diff --git a/dataset/BIOLOGICAL/COLON/COLON.mat b/dataset/BIOLOGICAL/COLON/COLON.mat new file mode 100644 index 0000000..0aec8a9 Binary files /dev/null and b/dataset/BIOLOGICAL/COLON/COLON.mat differ diff --git a/dataset/BIOLOGICAL/LUNG_DISCRETE/LUNG_DISCRETE.mat b/dataset/BIOLOGICAL/LUNG_DISCRETE/LUNG_DISCRETE.mat new file mode 100644 index 0000000..fbefb85 Binary files /dev/null and b/dataset/BIOLOGICAL/LUNG_DISCRETE/LUNG_DISCRETE.mat differ diff --git a/dataset/BIOLOGICAL/LYMPHOMA/LYMPHOMA.mat b/dataset/BIOLOGICAL/LYMPHOMA/LYMPHOMA.mat new file mode 100644 index 0000000..42cb35c Binary files /dev/null and b/dataset/BIOLOGICAL/LYMPHOMA/LYMPHOMA.mat differ diff --git a/dataset/BIOLOGICAL/WISCONSIN/WISCONSIN.mat b/dataset/BIOLOGICAL/WISCONSIN/WISCONSIN.mat new file mode 100644 index 0000000..8269d23 Binary files /dev/null and b/dataset/BIOLOGICAL/WISCONSIN/WISCONSIN.mat differ diff --git a/src/CSFS_SCBA.py b/src/CSFS_SCBA.py new file mode 100644 index 0000000..db53fae --- /dev/null +++ b/src/CSFS_SCBA.py @@ -0,0 +1,429 @@ +from __future__ import division + +from sklearn.svm import SVC +from sklearn.metrics import accuracy_score +from sklearn.model_selection import KFold +from imblearn.over_sampling import SMOTE #dependency + +import numpy as np +np.set_printoptions(threshold=np.inf) +import random as rnd +import time +import os +import errno +import pickle +import sys + +# sys.path.insert(0, './src') +import Loader as lr +import Dataset as ds +import Classifier as i_clf +import FeatureSelector as fs + + +def checkFolder(root, path_output): + + #folders to generate recursively + path = root+'/'+path_output + + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + +def OCC_DecisioneRule(clf_score, cls, clf_name, target): + + n_classes = len(cls) + + DTS = {} + for ccn in clf_name: + hits = [] + res = [] + preds = [] + + for i in xrange(0,n_classes): + e_th = np.asarray(clf_score['C'+str(cls[i])]['accuracy'][ccn]) + + e_th[np.where(e_th==-1)] = 0 + + hits.append(e_th) + + ensemble_hits = np.vstack(hits) + + for i in xrange(0, ensemble_hits.shape[1]): # number of sample + hits = ensemble_hits[:,i] + cond = np.sum(hits) + + if cond == 1: #rule 1 + pred = np.where(hits==1)[0] + pred = cls[pred][0] + preds.append(pred) + elif cond == 0: #rule 2 (tie among all OCC) + pred = cls[rnd.randint(0, len(cls) - 1)] + preds.append(pred) + elif cond > 0: + tied_cls = np.where(hits==1)[0] + pred = tied_cls[rnd.randint(0, len(tied_cls) - 1)] + preds.append(pred) + + test_score = accuracy_score(target, preds) + + dic_test_score = { + ccn: test_score + } + + DTS.update(dic_test_score) + + return DTS + +def classificationDecisionRule(clf_score, cls, clf_name, target): + + n_classes = len(cls) + DTS = {} + + for ccn in clf_name: + hits = [] + res = [] + preds = [] + + for i in xrange(0,n_classes): + + #ensemble scores on class 'C' for the testing set + e_th = clf_score['C'+str(cls[i])]['accuracy'][ccn] + res.append(e_th) + + hits.append((e_th == cls[i]).astype('int').flatten()) + + # ensemble scores and hits for the testing set + ensemble_res = np.vstack(res) + ensemble_hits = np.vstack(hits) + + # Applying decision rules + for i in xrange(0, ensemble_hits.shape[1]): # number of sample + hits = ensemble_hits[:,i] #it has 
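+            # Illustrative walk-through (added comment, values are hypothetical): with
+            # cls = [1, 2, 3] and hits = [1, 0, 0] for the i-th pattern, exactly one
+            # per-class classifier recognised its own class, so rule 1 returns class 1.
+            # With hits = [0, 0, 0] (no hit) or hits = [1, 1, 0] (a tie) the rules below
+            # fall back to a majority vote over ens_preds, and a residual tie between
+            # equally-voted classes is resolved by a random pick.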
a 1 in a position whether the classifier e_i has predicted the class w_i for the i-th pattern + ens_preds = ensemble_res[:,i] #it's simply the predictions of all the trained classifier for the i-th pattern + cond = np.sum(hits) #count the number of true positive for the i-th pattern + + if cond == 1: #rule 1 + pred = cls[np.where(hits==1)[0].squeeze()] #retrieve the cls for the 'only' true positive + preds.append(pred) + + elif cond == 0 or cond > 1: # rule 1-2 (tie) + + # we find the majority votes (frequency) among all classifier (e.g., ) [[4 2][5 1][6 2][7 2]] + unique, counts = np.unique(ens_preds, return_counts=True) + maj_rule = np.asarray((unique, counts)).T + + # we find the 'majority' index, then its class + ind_max = np.argmax(maj_rule[:, 1]) + pred = maj_rule[ind_max, 0] + max = maj_rule[ind_max, 1] + + # we look for a 'tie of the tie', then we look for the majority class among all the tied classes + tied_cls = np.where(maj_rule[:, 1] == max)[0] + if ( len(np.where(maj_rule[:, 1] == max)[0]) ) > 1: #tie of the tie + pred = maj_rule[tied_cls,0] + + # pick one tied cls randomly + pred = pred[rnd.randint(0,len(pred)-1)] + preds.append(pred) + + else: + preds.append(pred) + + #compute accuracy + test_score = accuracy_score(target, preds) + + dic_test_score = { + ccn: test_score + } + + DTS.update(dic_test_score) + + return DTS + +def main(): + ''' LOADING ANY DATASET ''' + dataset_dir = '/dataset' + dataset_type = '/BIOLOGICAL' + dataset_name = '/WISCONSIN' + + #this variable decide whether to balance or not the dataset + resample = True + p_step = 1 + + # defining directory paths for saving partial and complete result + path_data_folder = dataset_dir + dataset_type + dataset_name + path_data_file = path_data_folder + dataset_name + variables = ['X', 'Y'] + + print ('%d.Loading and pre-processing the data...\n' % p_step) + p_step += 1 + # NB: If you get an error such as: 'Please use HDF reader for matlab v7.3 files',please change the 'format variable' to 'matlab_v73' + D = lr.Loader(file_path=path_data_file, + format='matlab', + variables=variables, + name=dataset_name[1:] + ).getVariables(variables=variables) + + dataset = ds.Dataset(D['X'], D['Y']) + + n_classes = dataset.classes.shape[0] + cls = np.unique(dataset.classes) + + # check if the data are already standardized, if not standardize it + dataset.standardizeDataset() + + # re-sampling dataset + num_min_cls = 9999999 + print ('%d.Class-sample separation...\n' % p_step) + p_step += 1 + if resample == True: + + print ('\tDataset %s before resampling w/ size: %s and number of classes: %s---> %s' % ( + dataset_name[1:], dataset.data.shape, n_classes, cls)) + + # discriminating classes of the whole dataset + dataset_train = ds.Dataset(dataset.data, dataset.target) + dataset_train.separateSampleClass() + data, target = dataset_train.getSampleClass() + + for i in xrange(0, n_classes): + print ('\t\t#sample for class C%s: %s' % (i + 1, data[i].shape)) + if data[i].shape[0] < num_min_cls: + num_min_cls = data[i].shape[0] + + resample = '/BALANCED' + print ('%d.Class balancing...' 
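+        # Note (added): SMOTE(kind='regular') and fit_sample() below follow the old
+        # imbalanced-learn API; on recent releases (>= 0.4) the equivalent call is roughly
+        #   SMOTE(k_neighbors=num_min_cls - 1).fit_resample(dataset.data, dataset.target)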
% p_step) + dataset.data, dataset.target = SMOTE(kind='regular', k_neighbors=num_min_cls - 1).fit_sample(dataset.data, + dataset.target) + p_step += 1 + else: + resample = '/UNBALANCED' + + # shuffling data + print ('\tShuffling data...') + dataset.shufflingDataset() + + print ('\tDataset %s w/ size: %s and number of classes: %s---> %s' % ( + dataset_name[1:], dataset.data.shape, n_classes, cls)) + + # discriminating classes the whole dataset + dataset_train = ds.Dataset(dataset.data, dataset.target) + dataset_train.separateSampleClass() + data, target = dataset_train.getSampleClass() + + for i in xrange(0, n_classes): + print ('\t\t#sample for class C%s: %s' % (i + 1, data[i].shape)) + + # Max number of features to use + max_num_feat = 300 + step = 1 + # max_num_feat = dataset.data.shape[1] + + if max_num_feat > dataset.data.shape[1]: + max_num_feat = dataset.data.shape[1] + + alpha = 10 #regularizatio parameter (typically alpha in [2,50]) + + params = { + + 'SCBA': + # the smaller is alpha the sparser is the C matrix (fewer representatives) + { + 'alpha': alpha, + 'norm_type': 1, + 'max_iter': 3000, + 'thr': [10 ** -8], + 'type_indices': 'nrmInd', + 'normalize': False, + 'GPU': False, + 'device': 0, + 'PCA': False, + 'verbose': False, + 'step': 1, + 'affine': False, + } + # it's possible to add other FS methods by modifying the correct file + } + + fs_model = fs.FeatureSelector(name='SCBA', tp='SLB', params=params['SCBA']) + fs_name = 'SCBA' + + # CLASSIFIERS (it's possible to add other classifier methods by adding entries into this list) + clf_name = [ + "SVM" + # "Decision Tree", + # "KNN" + ] + model = [ + SVC(kernel="linear") + # DecisionTreeClassifier(max_depth=5), + # KNeighborsClassifier(n_neighbors=1) + ] + + '''Perform K-fold Cross Validation...''' + k_fold = 10 + + #defining result folders + fs_path_output = '/CSFS/FS/K_FOLD' + checkFolder(path_data_folder, fs_path_output) + + res_path_output = '/CSFS/RESULTS/K_FOLD' + checkFolder(path_data_folder, fs_path_output) + + all_scores = {} + all_scores.update({fs_name: []}) + + cc_fold = 0 + conf_dataset = {} + + X = dataset.data + y = dataset.target + kf = KFold(n_splits=k_fold) + + print ('%d.Running the Intra-Class-Specific Feature Selection and building the ensemble classifier...\n' % p_step) + p_step += 1 + for train_index, test_index in kf.split(X): + + X_train_kth, X_test_kth = X[train_index], X[test_index] + y_train, y_test = y[train_index], y[test_index] + + print ('\tDOING %s-CROSS VALIDATION W/ TRAINING SET SIZE: %s' % (cc_fold + 1, X_train_kth.shape)) + + ''' For the training data in each class we find the representative features and use them as a best subset feature + (in representing each class sample) to perform classification + ''' + + csfs_res = {} + + for i in xrange(0, n_classes): + cls_res = { + 'C' + str(cls[i]): {} + } + csfs_res.update(cls_res) + + kth_scores = {} + for i in xrange(0, len(clf_name)): + kth_scores.update({clf_name[i]: []}) + + # check whether the 'curr_res_fs_fold' directory exists, otherwise create it + curr_res_fs_fold = path_data_folder + '/' + fs_path_output + '/' + fs_name + resample + checkFolder(path_data_folder, fs_path_output + '/' + fs_name + resample) + + # discriminating classes for the k-th fold of the training set + data_train = ds.Dataset(X_train_kth, y_train) + data_train.separateSampleClass() + ktrain_data, ktrain_target = data_train.getSampleClass() + K_cls_ind_train = data_train.ind_class + + for i in xrange(0, n_classes): + # print ('Train set size C' + str(i + 1) + ':', 
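+            # The selector's fit() below is expected to return the class-specific feature
+            # indices ranked by representativeness (for SCBA, by descending row norm of the
+            # coefficient matrix C); only the first n_rep of them are used further down, e.g.
+            #   X_train_fs = X_train_kth[:, idx[0:n_rep]]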
ktrain_data[i].shape) + + print ('\tPerforming feature selection on class %d with shape %s' % (cls[i] + 1, ktrain_data[i].shape)) + + start_time = time.time() + idx = fs_model.fit(ktrain_data[i], ktrain_target[i]) + + # print idx + + print('\tTotal Time = %s seconds\n' % (time.time() - start_time)) + + csfs_res['C' + str(cls[i])]['idx'] = idx + csfs_res['C' + str(cls[i])]['params'] = params[fs_name] + + # with open(curr_res_fs_fold + '/' + str(cc_fold + 1) + '-fold' + '.pickle', 'wb') as handle: + # pickle.dump(csfs_res, handle, protocol=pickle.HIGHEST_PROTOCOL) + + ens_class = {} + # learning a classifier (ccn) for each subset of 'n_rep' feature + for j in xrange(0, max_num_feat): + n_rep = j + 1 # first n_rep indices + + for i in xrange(0, n_classes): + # get subset of feature from the i-th class + idx = csfs_res['C' + str(cls[i])]['idx'] + + # print idx[0:n_rep] + + X_train_fs = X_train_kth[:, idx[0:n_rep]] + + _clf = i_clf.Classifier(names=clf_name, classifiers=model) + _clf.train(X_train_fs, y_train) + + csfs_res['C' + str(cls[i])]['accuracy'] = _clf.classify(X_test_kth[:, idx[0:n_rep]], y_test) + + DTS = classificationDecisionRule(csfs_res, cls, clf_name, y_test) + + for i in xrange(0, len(clf_name)): + _score = DTS[clf_name[i]] + # print ('Accuracy w/ %d feature: %f' % (n_rep, _score)) + kth_scores[clf_name[i]].append(_score) + + x = np.arange(1, max_num_feat + 1) + + kth_results = { + 'clf_name': clf_name, + 'x': x, + 'scores': kth_scores, + } + + all_scores[fs_name].append(kth_results) + + # saving k-th dataset configuration + # with open(path_data_folder + fs_path_output + '/' + str(cc_fold + 1) + '-fold_conf_dataset.pickle', + # 'wb') as handle: # TODO: customize output name for recognizing FS parameters' method + # pickle.dump(conf_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL) + + cc_fold += 1 + + # print all_scores + + print('%s.Averaging results...\n' % p_step) + p_step += 1 + # Averaging results on k-fold + + # check whether the 'curr_res_fs_fold' directory exists, otherwise create it + curr_res_output_fold = path_data_folder + '/' + res_path_output + '/' + fs_name + resample + checkFolder(path_data_folder, res_path_output + '/' + fs_name + resample) + + M = {} + for i in xrange(0, len(clf_name)): + M.update({clf_name[i]: np.ones([k_fold, max_num_feat]) * 0}) + + avg_scores = {} + std_scores = {} + for i in xrange(0, len(clf_name)): + avg_scores.update({clf_name[i]: []}) + std_scores.update({clf_name[i]: []}) + + # k-fold results for each classifier + for k in xrange(0, k_fold): + for clf in clf_name: + M[clf][k, :] = all_scores[fs_name][k]['scores'][clf][:max_num_feat] + + for clf in clf_name: + avg_scores[clf] = np.mean(M[clf], axis=0) + std_scores[clf] = np.std(M[clf], axis=0) + + x = np.arange(1, max_num_feat + 1) + results = { + 'clf_name': clf_name, + 'x': x, + 'M': M, + 'scores': avg_scores, + 'std': std_scores + } + + # print avg_scores + + with open(curr_res_output_fold + '/clf_results.pickle', 'wb') as handle: + pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL) + print ('Done with %s, [%d-cross validation] ' % (dataset_name[1:], k_fold)) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/Classifier.py b/src/Classifier.py new file mode 100644 index 0000000..490f6de --- /dev/null +++ b/src/Classifier.py @@ -0,0 +1,65 @@ +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, 
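+# Example usage of this wrapper (added comment, hypothetical data):
+#   clf = Classifier(names=["SVM"], classifiers=[SVC(kernel="linear")])
+#   clf.train(X_train, y_train)
+#   preds = clf.classify(X_test, y_test)["SVM"]   # dict of classifier name -> predictions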
AdaBoostClassifier +from sklearn.linear_model import LogisticRegression +#add the need classifiers when using this class + + + +class Classifier: + + def __init__(self, names=None, classifiers=None): + + self.cv_scores = {} + + #Default classifiers and parameters + if names == None: + + self.names = [ + "KNN", "Logistic Regression", "SVM", + "Decision Tree", "Random Forest", "AdaBoost" + ] + + self.classifiers = [ + + KNeighborsClassifier(n_neighbors=1), + LogisticRegression(C=1e5), + SVC(kernel="linear"), + DecisionTreeClassifier(max_depth=5), + RandomForestClassifier(max_depth=5, n_estimators=10), + AdaBoostClassifier() + ] + + else: + self.names = names + self.classifiers = classifiers + + for name in self.names: + self.cv_scores[name] = [] + + + + def train(self, X_train, y_train): + + for name, clf in zip(self.names, self.classifiers): + + # Training the algorithm using the selected predictors and target. + clf.fit(X_train, y_train) + + def classify(self, X_test, y_test): + + # Record error for training and testing + DTS = {} + + for name, clf in zip(self.names, self.classifiers): + + preds = clf.predict(X_test) + + dic_label = { + name: preds + } + + DTS.update(dic_label) + + return DTS \ No newline at end of file diff --git a/src/Dataset.py b/src/Dataset.py new file mode 100644 index 0000000..1e3dbc3 --- /dev/null +++ b/src/Dataset.py @@ -0,0 +1,83 @@ +from sklearn import preprocessing +from sklearn.preprocessing import StandardScaler + +import hdf5storage #dependency +import numpy as np +np.set_printoptions(threshold=np.inf) + + +class Dataset: + def __init__(self, X, y): + + self.data = X + self.target = y.flatten() + + # removing any row with at least one NaN value + # TODO: remove also the corresponding target value + self.data = self.data[~np.isnan(self.data).any(axis=1)] + + self.num_sample, self.num_features = self.data.shape[0], self.data.shape[1] + + # retrieving unique label for Dataset + self.classes = np.unique(self.target) + + def standardizeDataset(self): + + # it simply standardize the data [mean 0 and std 1] + if np.sum(np.std(self.data, axis=0)).astype('int32') == self.num_features and np.sum( + np.mean(self.data, axis=0)) < 1 ** -7: + print ('\tThe data were already standardized!') + else: + print ('Standardizing data....') + self.data = StandardScaler().fit_transform(self.data) + + def normalizeDataset(self, norm): + + normalizer = preprocessing.Normalizer(norm=norm) + self.data = normalizer.fit_transform(self.data) + + def scalingDataset(self): + + min_max_scaler = preprocessing.MinMaxScaler() + self.data = min_max_scaler.fit_transform(self.data) + + def shufflingDataset(self): + + idx = np.random.permutation(self.data.shape[0]) + self.data = self.data[idx] + self.target = self.target[idx] + + + def split(self, split_ratio=0.8): + + # shuffling data + indices = np.random.permutation(self.num_sample) + + start = int(split_ratio * self.num_sample) + training_idx, test_idx = indices[:start], indices[start:] + X_train, X_test = self.data[training_idx, :], self.data[test_idx, :] + y_train, y_test = self.target[training_idx], self.target[test_idx] + + return X_train, y_train, X_test, y_test, training_idx, test_idx + + def separateSampleClass(self): + + # Discriminating the classes sample + self.ind_class = [] + for i in xrange(0, len(self.classes)): + self.ind_class.append(np.where(self.target == self.classes[i])) + + def getSampleClass(self): + + data = [] + target = [] + # Selecting the 'train sample' on the basis of the previously retrieved indices + for i in 
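+        # (added note) separateSampleClass() must have been called first: it fills
+        # self.ind_class, and the two lists returned here are aligned with self.classes,
+        # i.e. data[i] and target[i] hold the samples and labels of class self.classes[i].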
xrange(0, len(self.classes)): + data.append(self.data[self.ind_class[i]]) + target.append(self.target[self.ind_class[i]]) + + return data, target + + def getIndClass(self): + + return self.ind_class \ No newline at end of file diff --git a/src/FeatureSelector.py b/src/FeatureSelector.py new file mode 100644 index 0000000..0f8129f --- /dev/null +++ b/src/FeatureSelector.py @@ -0,0 +1,64 @@ +import numpy as np +np.set_printoptions(threshold=np.inf) +import sys +sys.path.insert(0, './src') +import SCBA as fs + + + +class FeatureSelector: + + def __init__(self, model=None, name=None, tp=None, params=None): + + + self.name = name + self.model = model + self.tp = tp + self.params = params + + + + def fit(self, X, y): + + idx = [] + + #add custom 'type' of Feature Selector + if self.tp == 'filter': + + if self.name == 'Relief': + ''' + add a custom Feature Selector such as: + + score = reliefF.reliefF(X, y) + idx = reliefF.feature_ranking(score) + + ''' + + elif self.tp == 'SLB': + + + # SCBA method + if self.name == 'GAD': + + alg = gd.GAD(X, self.params) + _, idx = alg.iterative_GAD() + + if self.name == 'SCBA': + scba = fs.SCBA(data=X, alpha=self.params['alpha'], norm_type=self.params['norm_type'], + verbose=self.params['verbose'], thr=self.params['thr'], max_iter=self.params['max_iter'], + affine=self.params['affine'], + normalize=self.params['normalize'], + step=self.params['step'], + PCA=self.params['PCA'], + GPU=self.params['GPU'], + device = self.params['device']) + + nrmInd, sInd, repInd, _ = scba.admm() + if self.params['type_indices'] == 'nrmInd': + idx = nrmInd + elif self.params['type_indices'] == 'repInd': + idx = repInd + else: + idx = sInd + + return idx \ No newline at end of file diff --git a/src/Loader.py b/src/Loader.py new file mode 100644 index 0000000..4d456a1 --- /dev/null +++ b/src/Loader.py @@ -0,0 +1,51 @@ +import hdf5storage #dependency +import numpy as np + +np.set_printoptions(threshold=np.inf) +import scipy.io as sio + +class Loader: + def __init__(self, file_path, name, variables, format, k_fold=None): + + + # This Class provides several method for loading many type of dataset (matlab, csv, txt, etc) + + if format == 'matlab': # classic workspace + + mc = sio.loadmat(file_path) + + for variable in variables: + setattr(self, variable, mc[variable]) + + elif format == 'matlab_struct': # struct one level + print ('Loading data...') + + mc = sio.loadmat(file_path) + mc = mc[name][0, 0] + + for variable in variables: + setattr(self, variable, mc[variable]) + + elif format == 'custom_matlab': + print ('Loading data...') + + mc = sio.loadmat(file_path) + mc = mc[name][0, 0] + + for variable in variables: + setattr(self, variable, mc[variable][0, 0]) + + elif format == 'matlab_v73': + mc = hdf5storage.loadmat(file_path) + + for variable in variables: + setattr(self, variable, mc[variable]) + + def getVariables(self, variables): + + D = {} + + for variable in variables: + D[variable] = getattr(self, variable) + + return D \ No newline at end of file diff --git a/src/SCBA.py b/src/SCBA.py new file mode 100644 index 0000000..3399db5 --- /dev/null +++ b/src/SCBA.py @@ -0,0 +1,519 @@ +from __future__ import division + +from sklearn.decomposition import PCA + +import numpy as np +import numpy.matlib +from matplotlib import pyplot as plt +np.set_printoptions(threshold=np.inf) +import pycuda.autoinit +import pycuda.gpuarray as gpuarray +import skcuda.linalg as linalg +import skcuda.misc as misc +import time + + +class SCBA(): + + def __init__(self, data, alpha=10, norm_type=1, + 
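+                 # (added parameter summary, inferred from how the fields are used below)
+                 # alpha: regularization strength (CSFS_SCBA uses values in [2, 50]);
+                 # norm_type: {1, 2, 'inf'} norm used by the shrinkage step;
+                 # thr: one or two stopping thresholds for the ADMM loop; max_iter: iteration cap;
+                 # affine: adds the sum-to-one constraint on the coefficient columns;
+                 # normalize: centre each row of the data before ADMM;
+                 # PCA/npc: optional projection; GPU/device: switch to the scikit-cuda code path.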
verbose=False, step=5, thr=[10**-8,-1], max_iter=5000, + affine=False, + normalize=True, + PCA=False, npc=10, GPU=False, device=0): + + self.data = data + self.alpha = alpha + self.norm_type=norm_type + self.verbose = verbose + self.step = step + self.thr = thr + self.max_iter = max_iter + self.affine = affine + self.normalize = normalize + self.device = device + self.PCA = PCA + self.npc = npc + self.GPU = GPU + + self.num_rows = data.shape[0] + self.num_columns = data.shape[1] + + if(self.GPU==True): + # self.data = self.data.astype('float32') + linalg.init() + # dev = misc.get_current_device() + # dev = misc.init_device(n=self.device) + # print misc.get_dev_attrs(dev) + + + def computeLambda(self): + print ('\t\tComputing lambda...') + + T = np.zeros(self.num_columns) + + if (self.GPU == True): + + if not self.affine: + + gpu_data = gpuarray.to_gpu(self.data) + C_gpu = linalg.dot(gpu_data, gpu_data, transa='T') + + for i in xrange(self.num_columns): + T[i] = linalg.norm(C_gpu[i,:]) + + else: + + gpu_data = gpuarray.to_gpu(self.data) + + # affine transformation + y_mean_gpu = misc.mean(gpu_data,axis=1) + + # creating affine matrix to subtract to the data (may encounter problem with strides) + aff_mat = np.zeros([self.num_rows,self.num_columns]).astype('f') + for i in xrange(0,self.num_columns): + aff_mat[:,i] = y_mean_gpu.get() + + + aff_mat_gpu = gpuarray.to_gpu(aff_mat) + gpu_data_aff = misc.subtract(aff_mat_gpu,gpu_data) + + C_gpu = linalg.dot(gpu_data, gpu_data_aff, transa='T') + + #computing euclidean norm (rows) + for i in xrange(self.num_columns): + T[i] = linalg.norm(C_gpu[i,:]) + else: + + if not self.affine: + + T = np.linalg.norm(np.dot(self.data.T, self.data), axis=1) + + else: + #affine transformation + y_mean = np.mean(self.data, axis=1) + + tmp_mat = np.outer(y_mean, np.ones(self.num_columns)) - self.data + + T = np.linalg.norm(np.dot(self.data.T, tmp_mat),axis=1) + + _lambda = np.amax(T) + + return _lambda + + def shrinkL1Lq(self, C1, _lambda): + + D,N = C1.shape + C2 = [] + if self.norm_type == 1: + + #TODO: incapsulate into one function + # soft thresholding + C2 = np.abs(C1) - _lambda + ind = C2 < 0 + C2[ind] = 0 + C2 = np.multiply(C2, np.sign(C1)) + elif self.norm_type == 2: + r = np.zeros([D,1]) + for j in xrange(0,D): + th = np.linalg.norm(C1[j,:]) - _lambda + r[j] = 0 if th < 0 else th + C2 = np.multiply(np.matlib.repmat(np.divide(r, (r + _lambda )), 1, N), C1) + elif self.norm_type == 'inf': + # TODO: write it + print '' + + return C2 + + def errorCoef(self, Z, C): + + err = np.sum(np.abs(Z-C)) / (np.shape(C)[0] * np.shape(C)[1]) + + return err + # err = sum(sum(abs(Z - C))) / (size(C, 1) * size(C, 2)); + + def almLasso_mat_fun(self): + + ''' + This function represents the Augumented Lagrangian Multipliers method for Lasso problem. 
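+        (In the code below, BHETA is the dense coefficient estimate updated by a ridge-like
+        step, THETA is its sparse copy produced by the shrinkage operator, and mu1/mu2 are
+        the two penalty parameters mentioned at the end of this docstring.)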
+ The lagrangian form of the Lasso can be expressed as following: + + MIN{ 1/2||Y-XBHETA||_2^2 + lambda||THETA||_1} s.t B-T=0 + + When applied to this problem, the ADMM updates take the form + + BHETA^t+1 = (XtX + rhoI)^-1(Xty + rho^t - mu^t) + THETA^t+1 = Shrinkage_lambda/rho(BHETA(t+1) + mu(t)/rho) + mu(t+1) = mu(t) + rho(BHETA(t+1) - BHETA(t+1)) + + The algorithm involves a 'ridge regression' update for BHETA, a soft-thresholding (shrinkage) step for THETA and + then a simple linear update for mu + + NB: Actually, this ADMM version contains several variations such as the using of two penalty parameters instead + of just one of them (mu1, mu2) + ''' + + print ('\tADMM processing...') + + alpha1 = alpha2 = 0 + if (len(self.reg_params) == 1): + alpha1 = self.reg_params[0] + alpha2 = self.reg_params[0] + elif (len(self.reg_params) == 2): + alpha1 = self.reg_params[0] + alpha2 = self.reg_params[1] + + #thresholds parameters for stopping criteria + if (len(self.thr) == 1): + thr1 = self.thr[0] + thr2 = self.thr[0] + elif (len(self.thr) == 2): + thr1 = self.thr[0] + thr2 = self.thr[1] + + # entry condition + err1 = 10 * thr1 + err2 = 10 * thr2 + + start_time = time.time() + + # setting penalty parameters for the ALM + mu1p = alpha1 * 1/self.computeLambda() + print("\t\t-Compute Lambda- Time = %s seconds" % (time.time() - start_time)) + mu2p = alpha2 * 1 + + mu1 = mu1p + mu2 = mu2p + + i = 1 + start_time = time.time() + if self.GPU == True: + + # defining penalty parameters e constraint to minimize, lambda and C matrix respectively + THETA = misc.zeros((self.num_columns,self.num_columns),dtype='float64') + lambda2 = misc.zeros((self.num_columns,self.num_columns),dtype='float64') + + gpu_data = gpuarray.to_gpu(self.data) + P_GPU = linalg.dot(gpu_data,gpu_data,transa='T') + + OP1 = P_GPU + linalg.scale(np.float32(mu1), OP1) + + OP2 = linalg.eye(self.num_columns) + linalg.scale(mu2,OP2) + + + if self.affine == True: + + print ('\t\tGPU affine...') + + OP3 = misc.ones((self.num_columns, self.num_columns), dtype='float64') + linalg.scale(mu2, OP3) + lambda3 = misc.zeros((1, self.num_columns), dtype='float64') + + # TODO: Because of some problem with linalg.inv version of scikit-cuda we fix it using np.linalg.inv of numpy + A = np.linalg.inv(misc.add(misc.add(OP1.get(), OP2.get()), OP3.get())) + + A_GPU = gpuarray.to_gpu(A) + + while ( (err1 > thr1 or err2 > thr1) and i < self.max_iter): + + _lambda2 = gpuarray.to_gpu(lambda2) + _lambda3 = gpuarray.to_gpu(lambda3) + + linalg.scale(1/mu2, _lambda2) + term_OP2 = gpuarray.to_gpu(_lambda2.get()) + + OP2 = gpuarray.to_gpu(misc.subtract(THETA, term_OP2)) + linalg.scale(mu2,OP2) + + OP4 = gpuarray.to_gpu(np.matlib.repmat(_lambda3.get(), self.num_columns, 1)) + + # updating Z + BHETA = linalg.dot(A_GPU,misc.add(misc.add(misc.add(OP1,OP2),OP3),OP4)) + + # deallocating unnecessary GPU variables + OP2.gpudata.free() + OP4.gpudata.free() + _lambda2.gpudata.free() + _lambda3.gpudata.free() + + # updating C + THETA = misc.add(BHETA,term_OP2) + THETA = self.shrinkL1Lq(THETA.get(),1/mu2) + THETA = THETA.astype('float64') + + # updating Lagrange multipliers + term_lambda2 = misc.subtract(BHETA, gpuarray.to_gpu(THETA)) + + linalg.scale(mu2,term_lambda2) + term_lambda2 = gpuarray.to_gpu(term_lambda2.get()) + lambda2 = misc.add(lambda2, term_lambda2) # on GPU + + term_lambda3 = misc.subtract(misc.ones((1, self.num_columns), dtype='float64'), misc.sum(BHETA,axis=0)) + linalg.scale(mu2,term_lambda3) + term_lambda3 = gpuarray.to_gpu(term_lambda3.get()) + lambda3 = 
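+                    # (added note) lambda3 is the multiplier for the affine constraint: its
+                    # update pushes the columns of BHETA to sum to one, which is exactly what
+                    # err2 measures against np.ones([1, num_columns]) further down.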
misc.add(lambda3, term_lambda3) # on GPU + + # deallocating unnecessary GPU variables + term_OP2.gpudata.free() + term_lambda2.gpudata.free() + term_lambda3.gpudata.free() + + err1 = self.errorCoef(BHETA.get(), THETA) + err2 = self.errorCoef(np.sum(BHETA.get(), axis=0), np.ones([1, self.num_columns])) + + # deallocating unnecessary GPU variables + BHETA.gpudata.free() + + THETA = gpuarray.to_gpu((THETA)) + + # reporting errors + if (self.verbose and (i % self.step == 0)): + print('\t\tIteration = %d, ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e' % (i, err1, err2)) + i += 1 + + THETA = THETA.get() + + Err = [err1, err2] + if(self.verbose): + print ('\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e. \n' % (i, err1, err2)) + + else: + print '\t\tGPU not affine' + + # TODO: Because of some problem with linalg.inv version of scikit-cuda we fix it using np.linalg.inv of numpy + A = np.linalg.inv(misc.add(OP1.get(), OP2.get())) + A_GPU = gpuarray.to_gpu(A) + + while ( err1 > thr1 and i < self.max_iter): + + _lambda2 = gpuarray.to_gpu(lambda2) + + term_OP2 = THETA + linalg.scale(mu2, term_OP2) + + term_OP2 = misc.subtract(term_OP2, _lambda2) + + OP2 = gpuarray.to_gpu(term_OP2.get()) + + + BHETA = linalg.dot(A_GPU, misc.add(OP1 , OP2)) + + linalg.scale(1 / mu2, _lambda2) + term_THETA = gpuarray.to_gpu(_lambda2.get()) + + THETA = misc.add(BHETA,term_THETA) + THETA = self.shrinkL1Lq(THETA.get(),1/mu2) + + THETA = THETA.astype('float32') + + # updating Lagrange multipliers + term_lambda2 = misc.subtract(BHETA, gpuarray.to_gpu(THETA)) + linalg.scale(mu2,term_lambda2) + term_lambda2 = gpuarray.to_gpu(term_lambda2.get()) + lambda2 = misc.add(lambda2, term_lambda2) # on GPU + + err1 = self.errorCoef(BHETA.get(), THETA) + + THETA = gpuarray.to_gpu((THETA)) + + # reporting errors + if (self.verbose and (i % self.step == 0)): + print('\t\tIteration %5.0f, ||Z - C|| = %2.5e' % (i, err1)) + i += 1 + + + THETA = THETA.get() + Err = [err1, err2] + if(self.verbose): + print ('\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e' % (i, err1)) + + else: #CPU version + + # defining penalty parameters e constraint to minimize, lambda and C matrix respectively + THETA = np.zeros([self.num_columns, self.num_columns]) + lambda2 = np.zeros([self.num_columns, self.num_columns]) + + P = self.data.T.dot(self.data) + OP1 = np.multiply(P, mu1) + + if self.affine == True: + + # INITIALIZATION + lambda3 = np.zeros(self.num_columns).T + + A = np.linalg.inv(np.multiply(mu1,P) + np.multiply(mu2, np.eye(self.num_columns, dtype=int)) + np.multiply(mu2, np.ones([self.num_columns,self.num_columns]) )) + + OP3 = np.multiply(mu2, np.ones([self.num_columns, self.num_columns])) + + while ( (err1 > thr1 or err2 > thr1) and i < self.max_iter): + + # updating Bheta + OP2 = np.multiply(THETA - np.divide(lambda2,mu2), mu2) + OP4 = np.matlib.repmat(lambda3, self.num_columns, 1) + BHETA = A.dot(OP1 + OP2 + OP3 + OP4 ) + + # updating C + THETA = BHETA + np.divide(lambda2,mu2) + THETA = self.shrinkL1Lq(THETA, 1/mu2) + + # updating Lagrange multipliers + lambda2 = lambda2 + np.multiply(mu2,BHETA - THETA) + lambda3 = lambda3 + np.multiply(mu2, np.ones([1,self.num_columns]) - np.sum(BHETA,axis=0)) + + err1 = self.errorCoef(BHETA, THETA) + err2 = self.errorCoef(np.sum(BHETA,axis=0), np.ones([1, self.num_columns])) + + # mu1 = min(mu1 * (1 + 10 ^ -5), 10 ^ 2 * mu1p); + # mu2 = min(mu2 * (1 + 10 ^ -5), 10 ^ 2 * mu2p); + + # reporting errors + if (self.verbose and (i % self.step == 0)): + print('\t\tIteration = 
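+                    # (added note) the two commented "mu1/mu2" lines above are MATLAB-style
+                    # adaptive penalty updates; a NumPy equivalent, if re-enabled, would be
+                    # roughly:
+                    #   mu1 = min(mu1 * (1 + 1e-5), 1e2 * mu1p)
+                    #   mu2 = min(mu2 * (1 + 1e-5), 1e2 * mu2p)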
%d, ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e' % (i, err1, err2)) + i += 1 + + Err = [err1, err2] + + if(self.verbose): + print ('\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e, ||1 - C^T 1|| = %2.5e. \n' % (i, err1,err2)) + else: + print '\t\tCPU not affine' + + A = np.linalg.inv(OP1 + np.multiply(mu2, np.eye(self.num_columns, dtype=int))) + + while ( err1 > thr1 and i < self.max_iter): + + # updating Z + OP2 = np.multiply(mu2, THETA) - lambda2 + BHETA = A.dot(OP1 + OP2) + + # updating C + THETA = BHETA + np.divide(lambda2, mu2) + THETA = self.shrinkL1Lq(THETA, 1/mu2) + + # updating Lagrange multipliers + lambda2 = lambda2 + np.multiply(mu2,BHETA - THETA) + + # computing errors + err1 = self.errorCoef(BHETA, THETA) + + # reporting errors + if (self.verbose and (i % self.step == 0)): + print('\t\tIteration %5.0f, ||Z - C|| = %2.5e' % (i, err1)) + i += 1 + + Err = [err1, err2] + if(self.verbose): + print ('\t\tTerminating ADMM at iteration %5.0f, \n ||Z - C|| = %2.5e' % (i, err1)) + + print("\t\t-ADMM- Time = %s seconds" % (time.time() - start_time)) + + return THETA, Err + + def rmRep(self, sInd, thr): + + ''' + This function takes the data matrix and the indices of the representatives and removes the representatives + that are too close to each other + + :param sInd: indices of the representatives + :param thr: threshold for pruning the representatives, typically in [0.9,0.99] + :return: representatives indices + ''' + + Ys = self.data[:, sInd] + + Ns = Ys.shape[1] + d = np.zeros([Ns, Ns]) + + # Computes a the distance matrix for all selected columns by the algorithm + for i in xrange(0,Ns-1): + for j in xrange(i+1,Ns): + d[i,j] = np.linalg.norm(Ys[:,i] - Ys[:,j]) + + d = d + d.T # define symmetric matrix + + dsorti = np.argsort(d,axis=0)[::-1] + dsort = np.flipud(np.sort(d,axis=0)) + + pind = np.arange(0,Ns) + for i in xrange(0, Ns): + if np.any(pind==i) == True: + cum = 0 + t = -1 + while cum <= (thr * np.sum(dsort[:,i])): + t += 1 + cum += dsort[t, i] + + pind = np.setdiff1d(pind, np.setdiff1d( dsorti[t:,i], np.arange(0,i+1), assume_unique=True), assume_unique=True) + + ind = sInd[pind] + + return ind + + def findRep(self,C, thr, norm): + + ''' + This function takes the coefficient matrix with few nonzero rows and computes the indices of the nonzero rows + :param C: NxN coefficient matrix + :param thr: threshold for selecting the nonzero rows of C, typically in [0.9,0.99] + :param norm: value of norm used in the L1/Lq minimization program in {1,2,inf} + :return: the representatives indices on the basis of the ascending norm of the row of C (larger is the norm of + a generic row most representative it is) + ''' + + N = C.shape[0] + + r = np.zeros([1,N]) + + for i in xrange(0, N): + + r[:,i] = np.linalg.norm(C[i,:],norm) + + nrmInd = np.argsort(r)[0][::-1] #descending order + nrm = r[0,nrmInd] + + # pick norm indices basing on the thresholding of the 'cumulative norm's sum' + cssInd = nrmInd[np.cumsum(nrm)/np.sum(nrm) < thr] + + return cssInd, nrmInd + + + def admm(self): + + ''' + ''' + # initializing penalty parameters + self.reg_params = [self.alpha, self.alpha] + + thrS = 0.99 + thrP = 0.95 + + #subtract mean from sample + if self.normalize == True: + self.data = self.data - np.matlib.repmat(np.mean(self.data, axis=1), self.num_columns,1).T + + self.repInd = [] + if (self.PCA == True): + print ('\t\tPerforming PCA...') + pca = PCA(n_components = self.npc) + self.data = pca.fit_transform(self.data) + self.num_columns = self.data.shape[0] + self.num_row = 
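+            # (added note) after the PCA projection the shape attributes are refreshed so the
+            # ADMM step below runs on the self.npc-dimensional representation of the data.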
self.data.shape[0] + self.num_columns = self.data.shape[1] + + + self.C,_ = self.almLasso_mat_fun() + + self.sInd, self.nrmInd = self.findRep(self.C, thrS, self.norm_type) + + # custom procedure for removing redundant indices + # self.repInd = self.rmRep(self.sInd, thrP) + self.repInd = [] + + + return self.nrmInd, self.sInd, self.repInd, self.C + + def plot_sparsness(self): + plt.spy(self.C, markersize=1, precision=0.01) + plt.show() \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29
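
For reference, a minimal sketch of running the class-specific selector outside the k-fold driver
(hypothetical snippet; it assumes the repository layout above, a Python 2 interpreter with the listed
dependencies installed, and a .mat file exposing X and Y exactly as CSFS_SCBA.main() expects):

    import Loader as lr
    import Dataset as ds
    import FeatureSelector as fs

    # load WISCONSIN (scipy appends the .mat extension automatically)
    variables = ['X', 'Y']
    D = lr.Loader(file_path='dataset/BIOLOGICAL/WISCONSIN/WISCONSIN', format='matlab',
                  variables=variables, name='WISCONSIN').getVariables(variables=variables)

    data = ds.Dataset(D['X'], D['Y'])
    data.standardizeDataset()

    # same SCBA parameters as in CSFS_SCBA.main()
    params = {'alpha': 10, 'norm_type': 1, 'max_iter': 3000, 'thr': [10 ** -8],
              'type_indices': 'nrmInd', 'normalize': False, 'GPU': False, 'device': 0,
              'PCA': False, 'verbose': False, 'step': 1, 'affine': False}

    selector = fs.FeatureSelector(name='SCBA', tp='SLB', params=params)
    idx = selector.fit(data.data, data.target)   # feature indices ranked by representativeness
    print(idx[:20])

The returned idx is the same ranking that the k-fold driver slices with idx[0:n_rep] to build each
class-specific feature subset before training the ensemble of classifiers.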