# Repository: TLS-Malware-Detection-with-Machine-Learning
# File: malware_detection.py
# Branch: master (1 branch)

import pandas as pd
from math import sqrt;
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression;
from sklearn.metrics import accuracy_score, r2_score, confusion_matrix, mean_absolute_error, mean_squared_error, f1_score, log_loss
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.externals import joblib

# Load the raw malicious and benign flow tables from CSV.
malicious_dataset = pd.read_csv('insert malicious file name.csv')
benign_dataset = pd.read_csv('insert benign file name.csv')

# Drop every benign row that has an identical twin. With keep=False all copies
# are flagged, so no representative of a duplicated row survives
# (the original author noted 5380 rows removed).
benign_is_repeated = benign_dataset.duplicated(keep=False)
benign_dataset = benign_dataset[~benign_is_repeated]

# Report how many rows are still flagged as duplicated in each table.
print(benign_dataset.duplicated(keep=False).value_counts())
print(malicious_dataset.duplicated(keep=False).value_counts())

# Stack the malicious and benign flows into a single table.
all_flows = pd.concat([malicious_dataset, benign_dataset])

# Per-column share of missing cells, expressed as a fraction of all rows.
missing_values = all_flows.isnull().sum()
overall_percentage = missing_values / len(all_flows)
print(overall_percentage)

# Work on a random subset so that model training stays fast.
# - The sample size is capped at the number of available rows, because
#   DataFrame.sample raises ValueError when asked for more rows than exist
#   (sampling is without replacement by default).
# - The draw is seeded with the same value as the train/test split so that
#   repeated runs operate on the same subset.
reduced_dataset = all_flows.sample(n=min(70000, len(all_flows)), random_state=42)

# Separate validation flows; any row that occurs more than once is dropped
# entirely (keep=False flags every copy).
validation_dataset = pd.read_csv('insert validation file name')
validation_dataset = validation_dataset[~validation_dataset.duplicated(keep=False)]

# Class balance (malicious vs. benign) of the reduced dataset.
print(reduced_dataset['isMalware'].value_counts())

# Training pool: every column except the label is a feature.
reduced_x = reduced_dataset.drop(columns=['isMalware'])
reduced_y = reduced_dataset['isMalware']

# Validation set: same feature/label separation.
validation_x = validation_dataset.drop(columns=['isMalware'])
validation_y = validation_dataset['isMalware']

# Hold out 20% of the training pool as a test split (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    reduced_x,
    reduced_y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest (tree depth capped at 100) on the training split.
rf_clf = RandomForestClassifier(max_depth=100)
rf_clf.fit(x_train, y_train)

# Held-out test split: confusion matrix followed by accuracy.
rf_prediction = rf_clf.predict(x_test)
conf_m = confusion_matrix(y_test, rf_prediction)
print(conf_m)
rf_test_accuracy = accuracy_score(y_test, rf_prediction)
print('Random Forest Classifier Accuracy score: ', rf_test_accuracy)

# Separate validation dataset: accuracy only.
rf_validation_prediction = rf_clf.predict(validation_x)
rf_validation_accuracy = accuracy_score(rf_validation_prediction, validation_y)
print('Random Forest Classifier with validation set Accuracy score: ', rf_validation_accuracy)

# Fit logistic regression on the same training split.
lm = LogisticRegression(max_iter=70000)
lm.fit(x_train, y_train)

# Test split: error metrics and accuracy of the predicted labels.
predictions = lm.predict(x_test)
test_mse = mean_squared_error(y_test, predictions)
rms = sqrt(test_mse)
test_mae = mean_absolute_error(y_test, predictions)
print('Mean Absolute Error: ', test_mae)
print('Root Mean Squared Error: ', rms)
print('R Squared Score: ', r2_score(y_test, predictions))
print('Accuracy score: ', accuracy_score(predictions, y_test))

# Validation dataset: the same four metrics.
validation_predictions = lm.predict(validation_x)
validation_mse = mean_squared_error(validation_y, validation_predictions)
rms = sqrt(validation_mse)
validation_mae = mean_absolute_error(validation_y, validation_predictions)
print('Mean Absolute Error: ', validation_mae)
print('Root Mean Squared Error: ', rms)
print('R Squared Score: ', r2_score(validation_y, validation_predictions))
print('Accuracy score: ', accuracy_score(validation_predictions, validation_y))

# Confusion matrix of the logistic regression model on the test split.
conf_m_lr = confusion_matrix(y_test, predictions)
print(conf_m_lr)

# Load the cipher-stunted flows.
cipher_stunted_dataset = pd.read_csv('Insert cipher stunted file name')

# Drop every row that occurs more than once (keep=False flags all copies).
cipher_stunted_dataset = cipher_stunted_dataset[~cipher_stunted_dataset.duplicated(keep=False)]

print(cipher_stunted_dataset.duplicated(keep=False).value_counts())

# Per-column share of missing cells in the cipher-stunted flows.
test_missing_values = cipher_stunted_dataset.isnull().sum()
test_overall_percentage = (test_missing_values/cipher_stunted_dataset.isnull().count())

# Features / label separation.
cs_test_x = cipher_stunted_dataset.drop(['isMalware'], axis=1)
cs_test_y = cipher_stunted_dataset['isMalware']

# Logistic regression on the cipher-stunted flows.
cs_test_predictions = lm.predict(cs_test_x)
# Fixed: this used to print the undefined name `test_predictions` (NameError).
print(cs_test_predictions)

conf_m_lr_cs = confusion_matrix(cs_test_y, cs_test_predictions)
print(conf_m_lr_cs)

print('Logistics Regression Accuracy score: ', accuracy_score(cs_test_predictions, cs_test_y))

# Random forest on the same flows.
rf_validation_prediction_2 = rf_clf.predict(cs_test_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(rf_validation_prediction_2, cs_test_y))

# Stored under its own name so the logistic regression matrix above is not
# overwritten by the random forest one.
conf_m_rf_cs = confusion_matrix(cs_test_y, rf_validation_prediction_2)
print(conf_m_rf_cs)

#Function to plot most important features of random forest model

def plot_feature_importance(importance, names, model_type, top_n=50):
    """Draw a horizontal bar chart of the most important features.

    A new matplotlib figure is created and populated; the function does not
    call ``plt.show()``, so the caller decides when to display it.

    Args:
        importance: Sequence of importance scores, one per feature.
        names: Sequence of feature names, aligned with ``importance``.
        model_type: Label used as the prefix of the chart title.
        top_n: Maximum number of features to draw, highest importance first.
            Defaults to 50, the cut-off previously hard-coded as an axis limit.

    Returns:
        None.
    """
    # Pair each feature name with its importance score.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })

    # Rank by decreasing importance and keep only the rows to be drawn.
    # Selecting rows replaces the former plt.ylim(0, 50) crop: a limit at 0
    # cuts through the bar centred on position 0 and leaves blank space when
    # fewer than 50 features exist.
    fi_df = fi_df.sort_values(by='feature_importance', ascending=False).head(top_n)

    # Define size of bar plot
    plt.figure(figsize=(10, 8))
    # Plot Seaborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # Add chart labels; the separating space was missing from the title, which
    # rendered e.g. "RANDOM FORESTFEATURE IMPORTANCE".
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

# Chart the feature importances learned by the random forest.
plot_feature_importance(rf_clf.feature_importances_, x_train.columns, 'RANDOM FOREST')

# For each feature of interest, overlay the histogram of malware flows on the
# histogram of benign flows and display the figure.
for feature in ('num_of_exts', 'Src_Port'):
    x1 = reduced_dataset.loc[reduced_dataset.isMalware == 1, feature]
    x2 = reduced_dataset.loc[reduced_dataset.isMalware == 0, feature]

    kwargs = dict(hist_kws={'alpha': .6}, kde_kws={'linewidth': 2})

    plt.figure(figsize=(10, 7), dpi=80)
    sns.distplot(x1, color="royalblue", label="Malware", **kwargs, kde=False)
    sns.distplot(x2, color="orange", label="Benign", **kwargs, kde=False)

    plt.legend()
    plt.show()

# Persist both fitted models to disk.
# NOTE(review): `joblib` is imported from `sklearn.externals` at the top of this
# file; scikit-learn removed that module in release 0.23. Confirm the pinned
# scikit-learn version or switch the import to the standalone `joblib` package.
joblib.dump(rf_clf, 'random_forest_model.pkl')
joblib.dump(lm, 'logistics_regression_classifier.pkl')

# Reload both models from the files just written. `rf_from_joblib` is reused by
# the stunted-dataset evaluations further down, so a second load is unnecessary.
rf_from_joblib = joblib.load('random_forest_model.pkl')
lm_from_joblib = joblib.load('logistics_regression_classifier.pkl')

# Score each reloaded model on the test split under its own name.
# Fixed: both predictions used to be assigned to `results`, so the random
# forest output was discarded and the logistic regression accuracy was printed
# under the "Random Forest" label.
rf_results = rf_from_joblib.predict(x_test)
lm_results = lm_from_joblib.predict(x_test)
print('Random Forest Classifier Accuracy score: ', accuracy_score(y_test, rf_results))
print('Logistics Regression Accuracy score: ', accuracy_score(y_test, lm_results))

# Cipher + extension stunted flows, scored with the reloaded random forest.
cs_ex_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
cs_ex_stunted_dataset = cs_ex_stunted_dataset[~cs_ex_stunted_dataset.duplicated(keep=False)]
cs_ex_test_x = cs_ex_stunted_dataset.drop(columns=['isMalware'])
cs_ex_test_y = cs_ex_stunted_dataset['isMalware']
results = rf_from_joblib.predict(cs_ex_test_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(cs_ex_test_y, results))

# Cipher + extension + source port stunted flows.
cs_ex_sp_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
cs_ex_sp_stunted_dataset = cs_ex_sp_stunted_dataset[~cs_ex_sp_stunted_dataset.duplicated(keep=False)]
cs_ex_sp_test_x = cs_ex_sp_stunted_dataset.drop(columns=['isMalware'])
cs_ex_sp_test_y = cs_ex_sp_stunted_dataset['isMalware']
results = rf_from_joblib.predict(cs_ex_sp_test_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(cs_ex_sp_test_y, results))

# Cipher + extension + source port + packets out stunted flows.
cs_ex_sp_po_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
cs_ex_sp_po_stunted_dataset = cs_ex_sp_po_stunted_dataset[~cs_ex_sp_po_stunted_dataset.duplicated(keep=False)]
cs_ex_sp_po_test_x = cs_ex_sp_po_stunted_dataset.drop(columns=['isMalware'])
cs_ex_sp_po_test_y = cs_ex_sp_po_stunted_dataset['isMalware']
results = rf_from_joblib.predict(cs_ex_sp_po_test_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(cs_ex_sp_po_test_y, results))

# Stack the original flows together with every stunted variant loaded so far.
combined_sources = [
    malicious_dataset,
    benign_dataset,
    cipher_stunted_dataset,
    cs_ex_stunted_dataset,
    cs_ex_sp_stunted_dataset,
    cs_ex_sp_po_stunted_dataset,
]
new_all_flows = pd.concat(combined_sources)

# Class balance of the combined table.
print(new_all_flows['isMalware'].value_counts())

# Features / label separation for the combined table.
new_all_flows_x = new_all_flows.drop(columns=['isMalware'])
new_all_flows_y = new_all_flows['isMalware']

# Hold out 20% of the combined table as a test split (fixed seed).
combined_x_train, combined_x_test, combined_y_train, combined_y_test = train_test_split(
    new_all_flows_x,
    new_all_flows_y,
    test_size=0.2,
    random_state=42,
)

# Random forest fitted on the combined training split.
combined_rf_clf = RandomForestClassifier(max_depth=100)
combined_rf_clf.fit(combined_x_train, combined_y_train)

# Combined test split: confusion matrix followed by accuracy.
combined_rf_prediction = combined_rf_clf.predict(combined_x_test)
combined_conf_m = confusion_matrix(combined_y_test, combined_rf_prediction)
print(combined_conf_m)
combined_rf_accuracy = accuracy_score(combined_y_test, combined_rf_prediction)
print('Random Forest Classifier Accuracy score: ', combined_rf_accuracy)

# Logistic regression fitted on the same combined training split.
combined_lm = LogisticRegression(max_iter=100000)
combined_lm.fit(combined_x_train, combined_y_train)

# Combined test split: error metrics and accuracy of the predicted labels.
combined_lm_predictions = combined_lm.predict(combined_x_test)
combined_mse = mean_squared_error(combined_y_test, combined_lm_predictions)
combined_rms = sqrt(combined_mse)
combined_mae = mean_absolute_error(combined_y_test, combined_lm_predictions)
print('Mean Absolute Error: ', combined_mae)
print('Root Mean Squared Error: ', combined_rms)
print('R Squared Score: ', r2_score(combined_y_test, combined_lm_predictions))
print('Accuracy score: ', accuracy_score(combined_lm_predictions, combined_y_test))

# Load the unseen sample of benign flows used by the two checks below.
# Fixed: `sample_benign_flows_x` / `sample_benign_flows_y` were used without
# ever being defined, which raised NameError.
# NOTE(review): the file name is a placeholder in the same style as the other
# read_csv calls in this script — point it at the real sample file. The label
# column is assumed to be 'isMalware', as in every other dataset here.
sample_benign_flows = pd.read_csv('Insert sample benign file name')
sample_benign_flows_x = sample_benign_flows.drop(['isMalware'], axis=1)
sample_benign_flows_y = sample_benign_flows['isMalware']

combined_rf_prediction_2 = combined_rf_clf.predict(sample_benign_flows_x)

combined_lm_predictions_2 = combined_lm.predict(sample_benign_flows_x)

# Testing on unseen sample benign dataset for combined rf classifier
print('Random Forest Classifier Accuracy score: ', accuracy_score(sample_benign_flows_y, combined_rf_prediction_2))

# Testing on unseen sample benign dataset for logistics regression classifier
# (the label used to say "Random Forest Classifier" for this score as well).
print('Logistics Regression Accuracy score: ', accuracy_score(sample_benign_flows_y, combined_lm_predictions_2))

# Cipher stunted flows, scored with the combined random forest.
new_cs_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
new_cs_stunted_dataset = new_cs_stunted_dataset[~new_cs_stunted_dataset.duplicated(keep=False)]
new_cs_stunted_dataset_x = new_cs_stunted_dataset.drop(columns=['isMalware'])
new_cs_stunted_dataset_y = new_cs_stunted_dataset['isMalware']
results = combined_rf_clf.predict(new_cs_stunted_dataset_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(new_cs_stunted_dataset_y, results))

# Cipher + extension stunted flows.
new_cs_ex_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
new_cs_ex_stunted_dataset = new_cs_ex_stunted_dataset[~new_cs_ex_stunted_dataset.duplicated(keep=False)]
new_cs_ex_stunted_dataset_x = new_cs_ex_stunted_dataset.drop(columns=['isMalware'])
new_cs_ex_stunted_dataset_y = new_cs_ex_stunted_dataset['isMalware']
results = combined_rf_clf.predict(new_cs_ex_stunted_dataset_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(new_cs_ex_stunted_dataset_y, results))

# Cipher, extension, and source port stunted results (combined random forest).
new_cs_ex_sp_stunted_malicious_dataset = pd.read_csv('Insert cipher stunted file name')
# Drop every row that occurs more than once (keep=False flags all copies).
new_cs_ex_sp_stunted_malicious_dataset = new_cs_ex_sp_stunted_malicious_dataset[~new_cs_ex_sp_stunted_malicious_dataset.duplicated(keep=False)]
new_cs_ex_sp_stunted_malicious_dataset_x = new_cs_ex_sp_stunted_malicious_dataset.drop(['isMalware'], axis=1)
# Fixed: the labels used to be read from `new_cs_ex_sp_stunted_dataset`, a name
# that is never defined (NameError); they come from the dataset loaded above.
new_cs_ex_sp_stunted_malicious_dataset_y = new_cs_ex_sp_stunted_malicious_dataset['isMalware']
results = combined_rf_clf.predict(new_cs_ex_sp_stunted_malicious_dataset_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(new_cs_ex_sp_stunted_malicious_dataset_y, results))

# Cipher + extension + source port + packets out stunted flows, scored with
# the combined random forest.
new_cs_ex_sp_po_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
new_cs_ex_sp_po_stunted_dataset = new_cs_ex_sp_po_stunted_dataset[~new_cs_ex_sp_po_stunted_dataset.duplicated(keep=False)]
new_cs_ex_sp_po_stunted_dataset_x = new_cs_ex_sp_po_stunted_dataset.drop(columns=['isMalware'])
new_cs_ex_sp_po_stunted_dataset_y = new_cs_ex_sp_po_stunted_dataset['isMalware']
results = combined_rf_clf.predict(new_cs_ex_sp_po_stunted_dataset_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(new_cs_ex_sp_po_stunted_dataset_y, results))

# Combined logistic regression on the cipher + extension stunted flows.
lm_predictions_cs_ex = combined_lm.predict(new_cs_ex_stunted_dataset_x)
cs_ex_accuracy = accuracy_score(lm_predictions_cs_ex, new_cs_ex_stunted_dataset_y)
print('Accuracy score: ', cs_ex_accuracy)

# ... on the cipher + extension + source port stunted flows.
lm_predictions_cs_ex_sp = combined_lm.predict(new_cs_ex_sp_stunted_malicious_dataset_x)
cs_ex_sp_accuracy = accuracy_score(lm_predictions_cs_ex_sp, new_cs_ex_sp_stunted_malicious_dataset_y)
print('Accuracy score: ', cs_ex_sp_accuracy)

# ... on the cipher + extension + source port + packets out stunted flows.
lm_predictions_cs_ex_sp_po = combined_lm.predict(new_cs_ex_sp_po_stunted_dataset_x)
cs_ex_sp_po_accuracy = accuracy_score(lm_predictions_cs_ex_sp_po, new_cs_ex_sp_po_stunted_dataset_y)
print('Accuracy score: ', cs_ex_sp_po_accuracy)

# Persist both models trained on the combined dataset.
joblib.dump(combined_rf_clf, 'new_random_forest_model.pkl')
joblib.dump(combined_lm, 'new_logistics_regression_classifier.pkl')