TensorMLP2.py
import os, glob, numpy
os.chdir('/Desktop/malimg_dataset')   # the parent folder with sub-folders
list_fams = os.listdir(os.getcwd())   # vector of strings with family names
no_imgs = []                          # No. of samples per family
for i in range(len(list_fams)):
    os.chdir(list_fams[i])
    len1 = len(glob.glob('*.png'))    # assuming the images are stored as 'png'
    no_imgs.append(len1)
    os.chdir('..')
total = sum(no_imgs)                  # total number of all samples
y = numpy.zeros(total, dtype=int)     # label vector (integer class ids, needed for one-hot encoding later)
temp1 = numpy.zeros(len(no_imgs) + 1)
temp1[1:len(temp1)] = no_imgs         # now temp1 is [0, no_imgs]
temp2 = int(temp1[0])
for jj in range(len(no_imgs)):
    temp3 = temp2 + int(temp1[jj + 1])
    for ii in range(temp2, temp3):
        y[ii] = jj
    temp2 = temp2 + int(temp1[jj + 1])
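# NOTE (added): an equivalent and more direct way to build the same label vector
# would be numpy.repeat, e.g. y = numpy.repeat(numpy.arange(len(no_imgs)), no_imgs),
# which assigns label j to the no_imgs[j] consecutive samples of family j.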
from PIL import Image   # 'import Image' only works with the old standalone PIL package
import leargist
X = numpy.zeros((sum(no_imgs), 320))   # Feature Matrix
cnt = 0
for i in range(len(list_fams)):
    os.chdir(list_fams[i])
    img_list = glob.glob('*.png')      # Getting only 'png' files in a folder
    for j in range(len(img_list)):
        im = Image.open(img_list[j])
        im1 = im.resize((64, 64), Image.ANTIALIAS)   # for faster computation
        des = leargist.color_gist(im1)
        X[cnt] = des[0:320]
        cnt = cnt + 1
    os.chdir('..')
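# NOTE (added): with its default parameters leargist.color_gist returns a
# 960-dimensional descriptor (320 values per colour channel); only the first
# 320 values are kept here as the per-image feature vector.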
import random
from sklearn.cross_validation import StratifiedKFold
from sklearn.utils import shuffle
n_samples, n_features = X.shape
p = list(range(n_samples))   # an index array, 0..n_samples-1
random.seed(random.random())
random.shuffle(p)            # the index array is now shuffled
X, y = X[p], y[p]            # both arrays are now shuffled
kfold = 10                   # no. of folds (better to have this at the start of the code)
skf = StratifiedKFold(y, kfold)
# Stratified k-fold first divides the data into k folds and also makes sure that
# the class distribution in each fold follows the original class distribution.
# Note: in later versions of scikit-learn this class moved to sklearn.model_selection.
skfind = [None] * len(skf)   # (train_indices, test_indices) pair for each fold
cnt = 0
for train_index in skf:
    skfind[cnt] = train_index
    cnt = cnt + 1
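# NOTE (added): sklearn.cross_validation was removed in scikit-learn 0.20.  A sketch
# of the equivalent setup with the newer API (not used below) would be:
#   from sklearn.model_selection import StratifiedKFold
#   skf = StratifiedKFold(n_splits=kfold)
#   skfind = list(skf.split(X, y))   # list of (train_indices, test_indices) pairs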
conf_mat = numpy.zeros((len(no_imgs), len(no_imgs)))   # Initializing the Confusion Matrix (not filled below)
n_neighbors = 1   # better to have this at the start of the code (not used below)
# 10-fold Cross Validation: extract the train/test split of each fold.
# Only the split of the last fold is used by the TensorFlow code further down.
for i in range(kfold):
    train_indices = skfind[i][0]
    test_indices = skfind[i][1]
    clf = []
    X_train = X[train_indices]
    X_val = X[train_indices]   # the training split is reused as the validation split
    y_train = y[train_indices]
    y_val = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
###############################################################################
# Training
# Hyper-parameters would normally be tuned by cross-validation (e.g. with a
# GridSearchCV); that search is skipped here to save time.
import tensorflow as tf
import numpy as np


# This function was copied verbatim from the TensorFlow tutorial at
# https://www.tensorflow.org/versions/master/tutorials/index.html
def dense_to_one_hot(labels_dense, num_classes=10):
    """Convert class labels from scalars to one-hot vectors."""
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot
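# Example (added): dense_to_one_hot(np.array([0, 2, 1]), num_classes=3) returns
#   [[1., 0., 0.],
#    [0., 0., 1.],
#    [0., 1., 0.]]
# num_classes must be at least max(label) + 1; for this script that means the
# number of malware families, len(list_fams).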
# Adapted from the TensorFlow tutorial at
# https://www.tensorflow.org/versions/master/tutorials/index.html
class DataSet(object):
    def __init__(self, images, labels):
        assert images.shape[0] == labels.shape[0], (
            "images.shape: %s labels.shape: %s" % (images.shape, labels.shape))
        self._num_examples = images.shape[0]
        self._images = images
        self._labels = labels
        self._epochs_completed = 0
        self._index_in_epoch = 0

    @property
    def images(self):
        return self._images

    @property
    def labels(self):
        return self._labels

    @property
    def num_examples(self):
        return self._num_examples

    @property
    def epochs_completed(self):
        return self._epochs_completed

    def next_batch(self, batch_size):
        """Return the next `batch_size` examples from this data set."""
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch > self._num_examples:
            # Finished epoch
            self._epochs_completed += 1
            # Shuffle the data
            perm = np.arange(self._num_examples)
            np.random.shuffle(perm)
            self._images = self._images[perm]
            self._labels = self._labels[perm]
            # Start next epoch
            start = 0
            self._index_in_epoch = batch_size
            assert batch_size <= self._num_examples
        end = self._index_in_epoch
        return self._images[start:end], self._labels[start:end]


def read_data_sets(train_images, train_labels, validation_images, validation_labels, test_images, test_labels):
    class DataSets(object):
        pass
    data_sets = DataSets()
    data_sets.train = DataSet(train_images, dense_to_one_hot(train_labels))
    data_sets.validation = DataSet(validation_images, dense_to_one_hot(validation_labels))
    data_sets.test = DataSet(test_images, dense_to_one_hot(test_labels))
    return data_sets
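# NOTE (added): read_data_sets relies on dense_to_one_hot's default of 10 classes.
# If the dataset has a different number of families (the full Malimg dataset has 25),
# num_classes=len(list_fams) would have to be passed explicitly, and the 10-unit
# output layers in the models below resized to match.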
# Adapted from the TensorFlow tutorial at
# https://www.tensorflow.org/versions/master/tutorials/index.html
def tensorFlowBasic(X_train, y_train, X_val, y_val, X_test, y_test):
    sess = tf.InteractiveSession()
    x = tf.placeholder("float", shape=[None, 400])
    y_ = tf.placeholder("float", shape=[None, 10])
    W = tf.Variable(tf.zeros([400, 10]))
    b = tf.Variable(tf.zeros([10]))
    sess.run(tf.initialize_all_variables())
    y = tf.nn.softmax(tf.matmul(x, W) + b)
    cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
    mydata = read_data_sets(X_train, y_train, X_val, y_val, X_test, y_test)
    for i in range(1000):
        batch = mydata.train.next_batch(50)
        train_step.run(feed_dict={x: batch[0], y_: batch[1]})
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    return accuracy.eval(feed_dict={x: mydata.test.images, y_: mydata.test.labels})
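# NOTE (added): the placeholder/weight shapes above assume 400 input features
# (e.g. a flattened 20x20 image) and 10 classes.  Feeding the 320-dimensional GIST
# features built earlier would require adapting the input size (and the 20x20
# reshape in tensorFlowCNN).  A hypothetical call, assuming matching shapes:
#   acc = tensorFlowBasic(X_train, y_train, X_val, y_val, X_test, y_test)
#   print("softmax regression test accuracy: %g" % acc)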
def weight_variable(shape):
    # Small truncated-normal initialization to break symmetry between units
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    # Slightly positive bias to avoid dead ReLU units at the start of training
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def conv2d(x, W):
    # 2-D convolution with stride 1; 'SAME' padding keeps the spatial size
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')


def max_pool_2x2(x):
    # 2x2 max pooling with stride 2; halves each spatial dimension
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1], padding='SAME')
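# Shape bookkeeping (added for reference): with 'SAME' padding and 2x2 pooling,
# a 20x20 input map becomes 10x10 after the first pooling step and 5x5 after the
# second, which is why the dense layer in tensorFlowCNN below flattens to
# 5 * 5 * 64 features (or 10 * 10 * 32 when only one conv layer is used).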
def tensorFlowCNN(X_train, y_train, X_val, y_val, X_test, y_test, add_second_conv_layer=True):
    x = tf.placeholder("float", shape=[None, 400])
    y_ = tf.placeholder("float", shape=[None, 10])
    sess = tf.InteractiveSession()
    # First Convolutional Layer
    W_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])
    x_image = tf.reshape(x, [-1, 20, 20, 1])
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)
    if add_second_conv_layer:
        # Second Convolutional Layer
        W_conv2 = weight_variable([5, 5, 32, 64])
        b_conv2 = bias_variable([64])
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
        h_pool2 = max_pool_2x2(h_conv2)
        # Densely Connected Layer
        W_fc1 = weight_variable([5 * 5 * 64, 1024])
        b_fc1 = bias_variable([1024])
        h_pool2_flat = tf.reshape(h_pool2, [-1, 5 * 5 * 64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
    else:
        # Densely Connected Layer
        W_fc1 = weight_variable([10 * 10 * 32, 1024])
        b_fc1 = bias_variable([1024])
        h_pool1_flat = tf.reshape(h_pool1, [-1, 10 * 10 * 32])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool1_flat, W_fc1) + b_fc1)
    # Dropout
    keep_prob = tf.placeholder("float")
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    # Softmax
    W_fc2 = weight_variable([1024, 10])
    b_fc2 = bias_variable([10])
    y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
    # Train the model
    mydata = read_data_sets(X_train, y_train, X_val, y_val, X_test, y_test)
    cross_entropy = -tf.reduce_sum(y_ * tf.log(y_conv))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    sess.run(tf.initialize_all_variables())
    for i in range(1000):
        batch = mydata.train.next_batch(50)
        if i % 100 == 0:
            train_accuracy = accuracy.eval(feed_dict={
                x: batch[0], y_: batch[1], keep_prob: 1.0})
            print("step %d, training accuracy %g" % (i, train_accuracy))
        train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
    return accuracy.eval(feed_dict={
        x: mydata.test.images, y_: mydata.test.labels, keep_prob: 1.0})
accuracy = tensorFlowCNN(X_train, y_train, X_val, y_val, X_test, y_test)
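# Hypothetical extension (added, not part of the original script): to evaluate every
# fold rather than only the last one, the call above could be moved inside the k-fold
# loop and the per-fold accuracies averaged, e.g.:
#   fold_acc = []
#   for train_indices, test_indices in skfind:
#       fold_acc.append(tensorFlowCNN(X[train_indices], y[train_indices],
#                                     X[train_indices], y[train_indices],
#                                     X[test_indices], y[test_indices]))
#   print("mean 10-fold CV accuracy: %g" % numpy.mean(fold_acc))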