1 Star 0 Fork 0

吴政琪/TLS-Malware-Detection-with-Machine-Learning

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
malware_detection.py 13.94 KB
一键复制 编辑 原始数据 按行查看 历史
WalterDiong 提交于 2020-08-15 00:09 . Update malware_detection.py
import pandas as pd
from math import sqrt;
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression;
from sklearn.metrics import accuracy_score, r2_score, confusion_matrix, mean_absolute_error, mean_squared_error, f1_score, log_loss
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.externals import joblib
# Importing datasets (the file names are placeholders to be replaced with real CSV paths)
malicious_dataset = pd.read_csv('insert malicious file name.csv')
benign_dataset = pd.read_csv('insert benign file name.csv')
# Removing duplicated rows from benign_dataset (5380 rows removed).
# duplicated(keep=False) flags every member of a duplicate group, so all
# copies of a repeated row are dropped, not only the extra occurrences.
benign_dataset = benign_dataset[~benign_dataset.duplicated(keep=False)]
# Report how many rows are still flagged as duplicates in each dataset
print(benign_dataset.duplicated(keep=False).value_counts())
print(malicious_dataset.duplicated(keep=False).value_counts())
# Combining both datasets together
all_flows = pd.concat([malicious_dataset, benign_dataset])
# Inspecting the combined dataset for columns with missing values.
# The printed value is the fraction (0..1) of missing entries per column.
missing_values = all_flows.isnull().sum()
overall_percentage = (missing_values/all_flows.isnull().count())
print(overall_percentage)
# Reducing the size of the dataset to reduce the amount of time taken in training models.
# The sample size is capped at the number of available rows because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement).
reduced_dataset = all_flows.sample(min(70000, len(all_flows)))
# Separate validation dataset, de-duplicated the same way as the benign data
validation_dataset = pd.read_csv('insert validation file name')
validation_dataset = validation_dataset[~validation_dataset.duplicated(keep=False)]
# Examining the distribution of Malicious and Benign flows in the reduced dataset
print(reduced_dataset['isMalware'].value_counts())
# Isolating independent and dependent variables for training dataset
reduced_y = reduced_dataset['isMalware']
reduced_x = reduced_dataset.drop(['isMalware'], axis=1)
# Isolating independent and dependent variables for validation dataset
validation_y = validation_dataset['isMalware']
validation_x = validation_dataset.drop(['isMalware'], axis=1)
# Splitting the reduced dataset into training (80%) and test (20%) data
x_train, x_test, y_train, y_test = train_test_split(reduced_x, reduced_y, test_size=0.2, random_state=42)
# Fit a random forest (max_depth=100) on the training split
rf_clf = RandomForestClassifier(max_depth=100).fit(x_train, y_train)
# Evaluate on the held-out test split: confusion matrix first, then accuracy
rf_prediction = rf_clf.predict(x_test)
conf_m = confusion_matrix(y_test, rf_prediction)
print(conf_m)
print(
    'Random Forest Classifier Accuracy score: ',
    accuracy_score(y_test, rf_prediction),
)
# Score the same fitted model on the separately loaded validation dataset
rf_validation_prediction = rf_clf.predict(validation_x)
print(
    'Random Forest Classifier with validation set Accuracy score: ',
    accuracy_score(rf_validation_prediction, validation_y),
)
# Training the logistic regression classifier
def _print_lr_metrics(y_true, y_pred):
    """Print MAE, RMSE, R squared and accuracy for y_pred against y_true.

    Returns the root mean squared error so the caller can keep it.
    """
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    print('Mean Absolute Error: ', mean_absolute_error(y_true, y_pred))
    print('Root Mean Squared Error: ',rmse)
    print('R Squared Score: ', r2_score(y_true, y_pred))
    print('Accuracy score: ', accuracy_score(y_pred, y_true))
    return rmse


lm = LogisticRegression(max_iter=70000).fit(x_train, y_train)
# Metrics on the held-out test split
predictions = lm.predict(x_test)
rms = _print_lr_metrics(y_test, predictions)
# Metrics on the validation dataset
validation_predictions = lm.predict(validation_x)
rms = _print_lr_metrics(validation_y, validation_predictions)
# Confusion matrix for the test split predictions
conf_m_lr = confusion_matrix(y_test, predictions)
print(conf_m_lr)
# Importing the cipher stunted dataset (placeholder path)
cipher_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
# Dropping every row that has an exact duplicate elsewhere in the dataset
# (keep=False flags all members of a duplicate group)
cipher_stunted_dataset = cipher_stunted_dataset[~cipher_stunted_dataset.duplicated(keep=False)]
print(cipher_stunted_dataset.duplicated(keep=False).value_counts())
# Per-column missing-value counts and fractions (computed but not printed)
test_missing_values = cipher_stunted_dataset.isnull().sum()
test_overall_percentage = (test_missing_values/cipher_stunted_dataset.isnull().count())
# Isolating independent and dependent variables
cs_test_x = cipher_stunted_dataset.drop(['isMalware'], axis=1)
cs_test_y = cipher_stunted_dataset['isMalware']
# Logistic regression on the cipher stunted data
cs_test_predictions = lm.predict(cs_test_x)
# Print the predictions just computed; the original referenced the undefined
# name `test_predictions`, which raised NameError.
print(cs_test_predictions)
conf_m_lr_cs = confusion_matrix(cs_test_y, cs_test_predictions)
print(conf_m_lr_cs)
print('Logistics Regression Accuracy score: ', accuracy_score(cs_test_predictions, cs_test_y))
# Random forest on the same data
rf_validation_prediction_2 = rf_clf.predict(cs_test_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(rf_validation_prediction_2, cs_test_y))
# Stored under its own name so the logistic regression matrix above is not overwritten
conf_m_rf_cs = confusion_matrix(cs_test_y, rf_validation_prediction_2)
print(conf_m_rf_cs)
#Function to plot most important features of random forest model
def plot_feature_importance(importance,names,model_type):
    """Draw a bar chart of feature importances, sorted largest first.

    Importance values go on the x-axis and feature names on the y-axis.
    A new 10x8 figure is created but not shown; the caller decides when to
    call plt.show().

    Args:
        importance: Sequence of importance scores, one per feature.
        names: Sequence of feature names, aligned with ``importance``.
        model_type: Text used as the prefix of the chart title.
    """
    # Create arrays from feature importance and feature names
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    # Create a DataFrame pairing each name with its importance
    fi_df = pd.DataFrame({
        'feature_names': feature_names,
        'feature_importance': feature_importance,
    })
    # Sort the DataFrame in order of decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False, inplace=True)
    # Define size of bar plot
    plt.figure(figsize=(10, 8))
    # Plot Seaborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # NOTE(review): presumably meant to restrict the view to the first 50
    # features; confirm the resulting axis orientation is the intended one.
    plt.ylim(0, 50)
    # Add chart labels; the separating space keeps the prefix and the suffix
    # from running together (previously "RANDOM FORESTFEATURE IMPORTANCE").
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
# Chart the fitted random forest's importances, labelled with the training columns
plot_feature_importance(rf_clf.feature_importances_, x_train.columns, 'RANDOM FOREST')
# One figure per feature: overlaid histograms of malware vs. benign flows
for feature in ('num_of_exts', 'Src_Port'):
    x1 = reduced_dataset.loc[reduced_dataset['isMalware'] == 1, feature]
    x2 = reduced_dataset.loc[reduced_dataset['isMalware'] == 0, feature]
    kwargs = {'hist_kws': {'alpha': .6}, 'kde_kws': {'linewidth': 2}}
    plt.figure(figsize=(10, 7), dpi=80)
    sns.distplot(x1, kde=False, color="royalblue", label="Malware", **kwargs)
    sns.distplot(x2, kde=False, color="orange", label="Benign", **kwargs)
    plt.legend()
    plt.show()
# Saving the random forest and logistic regression models
joblib.dump(rf_clf, 'random_forest_model.pkl')
joblib.dump(lm, 'logistics_regression_classifier.pkl')
# Load both models back from the files just written
rf_from_joblib = joblib.load('random_forest_model.pkl')
lm_from_joblib = joblib.load('logistics_regression_classifier.pkl')
# Use the loaded models to make predictions on the test split.
# Each model's predictions get their own variable: the original stored both
# in `results`, so the random forest output was overwritten and the logistic
# regression accuracy was printed under the random forest label.
rf_results = rf_from_joblib.predict(x_test)
lm_results = lm_from_joblib.predict(x_test)
print('Random Forest Classifier Accuracy score: ', accuracy_score(y_test, rf_results))
print('Logistics Regression Accuracy score: ', accuracy_score(y_test, lm_results))
# Reload the persisted random forest model from disk
rf_from_joblib = joblib.load('random_forest_model.pkl')

# Cipher and extension stunted dataset tests
cs_ex_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
cs_ex_stunted_dataset = cs_ex_stunted_dataset[~cs_ex_stunted_dataset.duplicated(keep=False)]
cs_ex_test_x = cs_ex_stunted_dataset.drop(columns=['isMalware'])
cs_ex_test_y = cs_ex_stunted_dataset['isMalware']
results = rf_from_joblib.predict(cs_ex_test_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(cs_ex_test_y, results))

# Cipher, extension, and source port stunted results
cs_ex_sp_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
cs_ex_sp_stunted_dataset = cs_ex_sp_stunted_dataset[~cs_ex_sp_stunted_dataset.duplicated(keep=False)]
cs_ex_sp_test_x = cs_ex_sp_stunted_dataset.drop(columns=['isMalware'])
cs_ex_sp_test_y = cs_ex_sp_stunted_dataset['isMalware']
results = rf_from_joblib.predict(cs_ex_sp_test_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(cs_ex_sp_test_y, results))

# Cipher, extension, source port, and packet out stunted results
cs_ex_sp_po_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
cs_ex_sp_po_stunted_dataset = cs_ex_sp_po_stunted_dataset[~cs_ex_sp_po_stunted_dataset.duplicated(keep=False)]
cs_ex_sp_po_test_x = cs_ex_sp_po_stunted_dataset.drop(columns=['isMalware'])
cs_ex_sp_po_test_y = cs_ex_sp_po_stunted_dataset['isMalware']
results = rf_from_joblib.predict(cs_ex_sp_po_test_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(cs_ex_sp_po_test_y, results))
# Combining all datasets together (original flows plus every stunted variant)
new_all_flows = pd.concat([malicious_dataset, benign_dataset, cipher_stunted_dataset, cs_ex_stunted_dataset, cs_ex_sp_stunted_dataset, cs_ex_sp_po_stunted_dataset])
# Examining combined dataset
print(new_all_flows['isMalware'].value_counts())
# Isolating independent and dependent variables for new combined dataset
new_all_flows_y = new_all_flows['isMalware']
new_all_flows_x = new_all_flows.drop(['isMalware'], axis=1)
# Splitting the combined dataset into training (80%) and test (20%) data
combined_x_train, combined_x_test, combined_y_train, combined_y_test = train_test_split(new_all_flows_x, new_all_flows_y, test_size=0.2, random_state=42)
# Training new RF classifier on the combined data
combined_rf_clf = RandomForestClassifier(max_depth=100)
combined_rf_clf.fit(combined_x_train, combined_y_train)
combined_rf_prediction = combined_rf_clf.predict(combined_x_test)
combined_conf_m = confusion_matrix(combined_y_test, combined_rf_prediction)
print(combined_conf_m)
print('Random Forest Classifier Accuracy score: ', accuracy_score(combined_y_test, combined_rf_prediction))
# Training and testing the combined logistic regression classifier
combined_lm = LogisticRegression(max_iter=100000)
combined_lm.fit(combined_x_train,combined_y_train)
combined_lm_predictions = combined_lm.predict(combined_x_test)
combined_rms = sqrt(mean_squared_error(combined_y_test, combined_lm_predictions))
print('Mean Absolute Error: ', mean_absolute_error(combined_y_test, combined_lm_predictions))
print('Root Mean Squared Error: ',combined_rms)
print('R Squared Score: ', r2_score(combined_y_test, combined_lm_predictions))
print('Accuracy score: ', accuracy_score(combined_lm_predictions, combined_y_test))
# Loading the unseen sample benign dataset.
# NOTE(review): the original used sample_benign_flows_x / sample_benign_flows_y
# without ever defining them (NameError). They are loaded here from a
# placeholder path, following the convention of the other datasets; confirm
# the intended file and whether it should also be de-duplicated.
sample_benign_flows = pd.read_csv('insert sample benign file name.csv')
sample_benign_flows_x = sample_benign_flows.drop(['isMalware'], axis=1)
sample_benign_flows_y = sample_benign_flows['isMalware']
combined_rf_prediction_2 = combined_rf_clf.predict(sample_benign_flows_x)
combined_lm_predictions_2 = combined_lm.predict(sample_benign_flows_x)
# Testing on unseen sample benign dataset for combined rf classifier
print('Random Forest Classifier Accuracy score: ', accuracy_score(sample_benign_flows_y, combined_rf_prediction_2))
# Testing on unseen sample benign dataset for logistic regression classifier
# (label corrected: this score was previously printed as "Random Forest")
print('Logistics Regression Accuracy score: ', accuracy_score(sample_benign_flows_y, combined_lm_predictions_2))
# Cipher stunted results (combined random forest)
new_cs_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
new_cs_stunted_dataset = new_cs_stunted_dataset[~new_cs_stunted_dataset.duplicated(keep=False)]
new_cs_stunted_dataset_x = new_cs_stunted_dataset.drop(['isMalware'], axis=1)
new_cs_stunted_dataset_y = new_cs_stunted_dataset['isMalware']
results = combined_rf_clf.predict(new_cs_stunted_dataset_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(new_cs_stunted_dataset_y, results))
# Cipher and extension stunted results
new_cs_ex_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
new_cs_ex_stunted_dataset = new_cs_ex_stunted_dataset[~new_cs_ex_stunted_dataset.duplicated(keep=False)]
new_cs_ex_stunted_dataset_x = new_cs_ex_stunted_dataset.drop(['isMalware'], axis=1)
new_cs_ex_stunted_dataset_y = new_cs_ex_stunted_dataset['isMalware']
results = combined_rf_clf.predict(new_cs_ex_stunted_dataset_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(new_cs_ex_stunted_dataset_y, results))
# Cipher, extension, and source port stunted results
new_cs_ex_sp_stunted_malicious_dataset = pd.read_csv('Insert cipher stunted file name')
new_cs_ex_sp_stunted_malicious_dataset = new_cs_ex_sp_stunted_malicious_dataset[~new_cs_ex_sp_stunted_malicious_dataset.duplicated(keep=False)]
new_cs_ex_sp_stunted_malicious_dataset_x = new_cs_ex_sp_stunted_malicious_dataset.drop(['isMalware'], axis=1)
# Labels come from the same de-duplicated frame as the features; the original
# read them from the undefined name `new_cs_ex_sp_stunted_dataset` (NameError).
new_cs_ex_sp_stunted_malicious_dataset_y = new_cs_ex_sp_stunted_malicious_dataset['isMalware']
results = combined_rf_clf.predict(new_cs_ex_sp_stunted_malicious_dataset_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(new_cs_ex_sp_stunted_malicious_dataset_y, results))
# Cipher, extension, source port, and packets out stunted results
new_cs_ex_sp_po_stunted_dataset = pd.read_csv('Insert cipher stunted file name')
new_cs_ex_sp_po_stunted_dataset = new_cs_ex_sp_po_stunted_dataset[~new_cs_ex_sp_po_stunted_dataset.duplicated(keep=False)]
new_cs_ex_sp_po_stunted_dataset_x = new_cs_ex_sp_po_stunted_dataset.drop(['isMalware'], axis=1)
new_cs_ex_sp_po_stunted_dataset_y = new_cs_ex_sp_po_stunted_dataset['isMalware']
results = combined_rf_clf.predict(new_cs_ex_sp_po_stunted_dataset_x)
print('Random Forest Classifier Accuracy score: ', accuracy_score(new_cs_ex_sp_po_stunted_dataset_y, results))
# Combined logistic regression accuracy on the same stunted datasets
lm_predictions_cs_ex = combined_lm.predict(new_cs_ex_stunted_dataset_x)
print('Accuracy score: ', accuracy_score(lm_predictions_cs_ex, new_cs_ex_stunted_dataset_y))
lm_predictions_cs_ex_sp = combined_lm.predict(new_cs_ex_sp_stunted_malicious_dataset_x)
print('Accuracy score: ', accuracy_score(lm_predictions_cs_ex_sp, new_cs_ex_sp_stunted_malicious_dataset_y))
lm_predictions_cs_ex_sp_po = combined_lm.predict(new_cs_ex_sp_po_stunted_dataset_x)
print('Accuracy score: ', accuracy_score(lm_predictions_cs_ex_sp_po, new_cs_ex_sp_po_stunted_dataset_y))
# Saving the models trained on the combined dataset
joblib.dump(combined_rf_clf, 'new_random_forest_model.pkl')
joblib.dump(combined_lm, 'new_logistics_regression_classifier.pkl')
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/wu_zhengqi/TLS-Malware-Detection-with-Machine-Learning.git
[email protected]:wu_zhengqi/TLS-Malware-Detection-with-Machine-Learning.git
wu_zhengqi
TLS-Malware-Detection-with-Machine-Learning
TLS-Malware-Detection-with-Machine-Learning
master

搜索帮助