diff --git a/machine_learning/Malware analysis.py b/machine_learning/Malware analysis.py
new file mode 100644
index 0000000..2db292e
--- /dev/null
+++ b/machine_learning/Malware analysis.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# pandas provides the DataFrame structure used to load and manipulate the CSV dataset.
+import pandas as pd
+# NumPy provides efficient multi-dimensional arrays and matrix operations.
+import numpy as np
+import pickle
+import joblib
+import sklearn.ensemble as ske
+from sklearn import model_selection, tree
+from sklearn.feature_selection import SelectFromModel
+from sklearn.naive_bayes import GaussianNB
+from sklearn.metrics import confusion_matrix
+
+data = pd.read_csv('data.csv', sep='|')  # load the dataset into a DataFrame
+# Drop the identifier and label columns (axis=1 means columns) to keep only the feature values.
+X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
+y = data['legitimate'].values  # the 'legitimate' column is the label
+
+# X.shape[1] is the number of feature columns.
+print('Researching important feature based on %i total features\n' % X.shape[1])
+
+# Feature selection using a tree ensemble
+fsel = ske.ExtraTreesClassifier().fit(X, y)
+model = SelectFromModel(fsel, prefit=True)
+X_new = model.transform(X)
+nb_features = X_new.shape[1]  # e.g. 13 features are kept when the shape is (138047, 13)
+
+# Split into training and testing sets; with test_size=0.2, 20% of the samples
+# are held out for testing (138047 * 0.2 is roughly 27610 rows).
+X_train, X_test, y_train, y_test = model_selection.train_test_split(
+    X_new, y, test_size=0.2)
+features = []
+
+print('%i features identified as important:' % nb_features)
+
+# Sort the features by importance (descending) and print the top nb_features;
+# [::-1] reverses the ascending argsort so the most important feature comes first.
+indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
+for f in range(nb_features):
+    print("%d. feature %s (%f)" % (
+        f + 1, data.columns[2 + indices[f]], fsel.feature_importances_[indices[f]]))
+
+# Store the names of the important features; the +2 offset skips the
+# 'Name' and 'md5' columns that precede the feature columns.
+for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
+    features.append(data.columns[2 + f])
+
+print(features)
+
+# Algorithm comparison
+algorithms = {
+    # A random forest is an ensemble of randomly built decision trees; each tree
+    # is a single classifier and the final prediction is taken by majority vote.
+    # n_estimators is the number of trees in the forest.
+    "RandomForest": ske.RandomForestClassifier(n_estimators=50),
+    # Gradient boosting and AdaBoost (Adaptive Boosting) are boosting algorithms:
+    # they combine a set of weak learners into a single strong learner by adding
+    # weak learners iteratively; they differ in how the weak learners are built
+    # at each iteration.
+    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
+    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
+    # Gaussian Naive Bayes is based on Bayes' theorem, i.e. on conditional
+    # probability: the probability of an event given that another has occurred.
+    "GNB": GaussianNB(),
+    # max_depth limits the maximum depth of the decision tree.
+    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10)
+}
+results = {}
+print("\nNow testing algorithms")
+for algo in algorithms:
+    clf = algorithms[algo]
+    clf.fit(X_train, y_train)  # fitting is the training step
+    score = clf.score(X_test, y_test)
+    print("%s : %f %%" % (algo, score * 100))
+    results[algo] = score
+
+# Plot the fitted decision tree (plot_tree only works on a single tree).
+tree.plot_tree(algorithms["DecisionTree"])
+
+winner = max(results, key=results.get)
+print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner] * 100))
+
+# Save the algorithm and the feature list for later predictions.
+# joblib works especially well with the NumPy arrays used by scikit-learn, so
+# depending on the classifier you may get size and speed benefits over pickle;
+# both serializers reload a trained classifier with identical results.
+print('Saving algorithm and feature list in classifier directory...')
+joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
+with open('classifier/features.pkl', 'wb') as f:
+    f.write(pickle.dumps(features))
+print('Saved')
+
+# Identify false positive and false negative rates.
+# A confusion matrix (also called an error matrix) is a table layout that
+# visualises the performance of a supervised learning algorithm.
+clf = algorithms[winner]
+res = clf.predict(X_test)
+mt = confusion_matrix(y_test, res)
+print(mt)
+print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0]))) * 100))
+print("False negative rate : %f %%" % ((mt[1][0] / float(sum(mt[1]))) * 100))
diff --git a/machine_learning/data.zip b/machine_learning/data.zip
new file mode 100644
index 0000000..59820a4
Binary files /dev/null and b/machine_learning/data.zip differ
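For reference, a minimal sketch of how the persisted classifier and feature list could be loaded back for a later prediction. This is not part of the patch: the classifier/ paths follow the script above, while the `extracted` dict and the assumption that a label of 1 means "legitimate" are illustrative placeholders, not code from the repository.

# Hypothetical usage sketch -- not part of the diff above.
# Assumes the training script has already written classifier/classifier.pkl
# and classifier/features.pkl.
import pickle

import joblib
import numpy as np

clf = joblib.load('classifier/classifier.pkl')
with open('classifier/features.pkl', 'rb') as f:
    features = pickle.load(f)

# 'extracted' stands in for a real PE feature extractor; it must supply a value
# for every column name stored in the feature list, in that same order.
extracted = {name: 0.0 for name in features}  # placeholder values
sample = np.array([extracted[name] for name in features]).reshape(1, -1)

label = clf.predict(sample)[0]
print('legitimate' if label == 1 else 'malicious')  # assumes 1 = legitimate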