# methods.py — 流火Antares/software_defect_prediction (Gitee)
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn import model_selection
from sklearn import metrics
import utils as ut
import numpy as np
import matplotlib.pyplot as plt  # prefer pyplot over the deprecated pylab interface


def run_method(method, X, y, n_classifiers=-1, fs_functions=None, score_name="AUC", experiment="0"):
    # Experiment "0": compare the quality of different feature-selection algorithms
    if experiment == "0":
        # Build an SVM that serves as the common evaluator for the feature-selection
        # methods (fs_functions may be None, in which case the features are left untouched)
        svm = SVC(class_weight="balanced", probability=True)  # these parameter settings may deserve further tuning
        for fs in (fs_functions or []):  # guard against fs_functions=None
            print("FEATURE SELECTION: %s\n" % fs)
            if fs in ["pearson", "fisher"]:
                print("Ranking features using %s ..." % fs)
                # Score each feature and sort from high to low:
                # ft_ranks holds the indices of the sorted features,
                # scores holds the sorted score values
                ft_ranks, scores = ut.rank_features(np.array(X), y, corr=fs)
                # Grow the feature set from the best-ranked feature downwards and
                # train/test on each subset.
                # step_size is the number of features added per step;
                # scores is the list of resulting scores;
                # selected_features is a list of the feature subsets chosen at each step
                scores, selected_features = ut.compute_feature_curve(svm, X, y,
                                                                     ft_ranks=ft_ranks,
                                                                     step_size=1,
                                                                     score_name=score_name)
            elif fs == "greedy":
                scores, selected_features = ut.greedy_selection(svm, X, y, score_name=score_name)
            # Plot the results
            plt.plot(selected_features, scores, label=fs)
    # Experiment "1": compare the models (RF, W-SVM, APE); note that fs_functions
    # must contain exactly one feature-selection method here
    elif experiment == "1":
        if len(fs_functions) == 1:
            for model in method:
                if model == "RF":
                    print("MODEL: RANDOM FOREST")
                    rfc = RandomForestClassifier(random_state=0)  # build the random forest model
                    scores, selected_features = evaluate_rf(rfc, X, y, fs_functions[0], score_name)
                elif model == "W_SVM":
                    print("MODEL: W_SVM")
                    w_svm = SVC(class_weight="balanced", probability=True)
                    scores, selected_features = evaluate_rf(w_svm, X, y, fs_functions[0], score_name)
                    # classifiers = []
                    # for c in [1, 10, 100, 500, 1000]:
                    #     for w in [{1: 5}, {1: 10}, {1: 15}, {1: 20}, {1: 25}]:
                    #         classifiers += [SVC(probability=True, C=c, class_weight=w)]
                    # if n_classifiers == -1:
                    #     scores, selected_features = evaluate_w_svm(classifiers, X, y, 5, fs_functions[0], score_name)
                elif model == "APE":
                    print("MODEL: APE")
                    ape_classifiers = [SVC(probability=True),
                                       MultinomialNB(alpha=0.001),
                                       BernoulliNB(alpha=0.001),
                                       RandomForestClassifier(n_estimators=20),
                                       GradientBoostingClassifier(n_estimators=300),
                                       SGDClassifier(alpha=.0001, loss='log_loss', n_iter_no_change=50,
                                                     penalty="elasticnet", max_iter=10000),
                                       LogisticRegression(penalty='l2', max_iter=10000)]
                    scores, selected_features = evaluate_ape(ape_classifiers, X, y, 7, fs_functions[0], score_name)
                # Plot the results
                plt.plot(selected_features, scores, label=model)
        else:
            print("Only one feature-selection algorithm may be specified!")
    # Experiment "2": study how the number of base classifiers affects a
    # homogeneous APE model
    elif experiment == "2":
        classifiers = []
        for c in [1, 10, 100, 500, 1000]:
            for w in [{1: 5}, {1: 10}, {1: 15}, {1: 20}, {1: 25}]:
                classifiers += [SVC(probability=True, C=c, class_weight=w)]
        scores, x_values = ensemble_forward_pass(classifiers, X, y, n_classifiers, mode=0)
        plt.plot(x_values, scores, label="homogeneous APE")
    # Experiment "3": study how the number of base classifiers affects a
    # heterogeneous APE model
    elif experiment == "3":
        classifiers = [LogisticRegression(penalty='l2', max_iter=10000),
                       KNeighborsClassifier(),
                       RandomForestClassifier(n_estimators=20),
                       tree.DecisionTreeClassifier(),
                       SVC(probability=True),
                       MultinomialNB(alpha=0.001),
                       GradientBoostingClassifier(n_estimators=300),
                       AdaBoostClassifier(),
                       GaussianNB(),
                       BernoulliNB(alpha=0.001),
                       SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet")]
        scores, x_values = ensemble_forward_pass(classifiers, X, y, n_classifiers, mode=0)
        plt.plot(x_values, scores, label="heterogeneous APE")
    # Experiment "4": study how different combinations of base classifiers
    # affect a heterogeneous APE model
    elif experiment == "4":
        classifiers_a = [BernoulliNB(alpha=0.001),
                         tree.DecisionTreeClassifier(),
                         LogisticRegression(penalty='l2', max_iter=10000),
                         SVC(probability=True),
                         SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet"),
                         GradientBoostingClassifier(n_estimators=300)]
        classifiers_b = [MultinomialNB(alpha=0.001),
                         tree.DecisionTreeClassifier(),
                         LogisticRegression(penalty='l2', max_iter=10000),
                         SVC(probability=True),
                         KNeighborsClassifier(),
                         AdaBoostClassifier()]
        classifiers_c = [BernoulliNB(alpha=0.001),
                         MultinomialNB(alpha=0.001),
                         tree.DecisionTreeClassifier(),
                         RandomForestClassifier(n_estimators=20),
                         KNeighborsClassifier(),
                         SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet")]
        classifiers_d = [GaussianNB(),
                         RandomForestClassifier(n_estimators=20),
                         GradientBoostingClassifier(n_estimators=300),
                         SVC(probability=True),
                         tree.DecisionTreeClassifier(),
                         AdaBoostClassifier()]
        classifiers_e = [BernoulliNB(alpha=0.001),
                         GaussianNB(),
                         RandomForestClassifier(n_estimators=20),
                         SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet"),
                         GradientBoostingClassifier(n_estimators=300),
                         AdaBoostClassifier()]
        classifiers_f = [RandomForestClassifier(n_estimators=20),
                         SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet"),
                         LogisticRegression(penalty='l2', max_iter=10000),
                         SVC(probability=True),
                         AdaBoostClassifier(),
                         KNeighborsClassifier()]
        classifiers_list = [classifiers_a, classifiers_b, classifiers_c, classifiers_d, classifiers_e, classifiers_f]
        scores = []
        labels = ["Combination A", "Combination B", "Combination C", "Combination D", "Combination E", "Combination F"]
        for i in range(6):
            scores += [ensemble_forward_pass(classifiers_list[i], X, y, 6, mode=1)]
        plt.bar(labels, scores, label="AUC")
        plt.yticks(np.arange(0.90, 1.01, step=0.02))
        plt.ylim(0.89, 1.02)  # the ymin/ymax keywords were removed in newer matplotlib
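

# For reference, a minimal sketch of the Fisher-score ranking that
# ut.rank_features(..., corr="fisher") plausibly performs; utils.py is not
# shown on this page, so treat the helper below as an illustrative assumption,
# not the project's actual implementation.
def fisher_score_sketch(X, y):
    """Rank features by Fisher score for a binary target (hypothetical helper)."""
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    pos, neg = X[y == 1], X[y == 0]
    # Between-class separation over within-class spread, computed per feature
    numer = (pos.mean(axis=0) - neg.mean(axis=0)) ** 2
    denom = pos.var(axis=0) + neg.var(axis=0) + 1e-12  # avoid division by zero
    scores = numer / denom
    ft_ranks = np.argsort(scores)[::-1]  # feature indices, best first
    return ft_ranks, scores[ft_ranks]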


# Evaluate a single model under the given feature-selection method; despite the
# name, any sklearn classifier works (run_method reuses it for W-SVM above)
def evaluate_rf(model, X, y, fs_function, score_name="AUC"):
    if fs_function in ["pearson", "fisher"]:
        # Get each feature's rank and score
        ft_ranks, scores = ut.rank_features(np.array(X), y, corr=fs_function)
        # Grow the feature set by step_size at a time and record the model's
        # performance for each number of retained features
        scores, selected_features = ut.compute_feature_curve(model, X, y,
                                                             ft_ranks=ft_ranks,
                                                             step_size=1,
                                                             score_name=score_name)
    elif fs_function == "greedy":
        scores, selected_features = ut.greedy_selection(model, X, y, score_name=score_name)
    else:
        raise ValueError("unknown fs_function: %s" % fs_function)  # avoid returning unbound names
    return scores, selected_features
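

# Similarly, a minimal sketch of the greedy forward selection that
# ut.greedy_selection presumably performs: repeatedly add the single feature
# that most improves cross-validated AUC. Hypothetical, for illustration only.
def greedy_selection_sketch(model, X, y, max_features=10):
    X = np.asarray(X, dtype=float)
    chosen, curve = [], []
    remaining = list(range(X.shape[1]))
    for _ in range(min(max_features, len(remaining))):
        best_ft, best_score = None, -np.inf
        for ft in remaining:
            cols = chosen + [ft]
            score = model_selection.cross_val_score(
                model, X[:, cols], y, cv=5, scoring="roc_auc").mean()
            if score > best_score:
                best_ft, best_score = ft, score
        chosen.append(best_ft)
        remaining.remove(best_ft)
        curve.append(best_score)  # score with len(chosen) features selected
    return curve, list(range(1, len(chosen) + 1))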


# Evaluate an APE model's performance under different feature sets
def evaluate_ape(classifiers, X, y, n_classifiers, fs_function, score_name="AUC"):
    # The following line should arguably be moved into utils:
    # clf_list = ut.EnsembleClassifiers(classifiers)
    # Note that [pearson, fisher] and [greedy] follow very different logic,
    # so they need separate helper functions
    if fs_function in ["pearson", "fisher"]:
        ft_ranks, scores = ut.rank_features(np.array(X), y, corr=fs_function)
        scores, selected_features = ut.compute_ape_feature_curve(classifiers, n_classifiers, X, y,
                                                                 ft_ranks=ft_ranks,
                                                                 step_size=1,
                                                                 score_name=score_name)
    elif fs_function == "greedy":
        scores, selected_features = ut.compute_ape_feature_curve_with_greedy(classifiers, n_classifiers, X, y,
                                                                             score_name=score_name)
    else:
        raise ValueError("unknown fs_function: %s" % fs_function)  # avoid returning unbound names
    return scores, selected_features
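

# ut.EnsembleClassifiers is defined in utils.py, which is not shown here.
# Judging from its use in ensemble_forward_pass below (fit(X, y, i) followed by
# predict(X)), it plausibly trains the first i + 1 base classifiers and
# soft-votes by averaging their predicted probabilities. A minimal sketch under
# that assumption:
class EnsembleClassifiersSketch:
    def __init__(self, classifiers):
        self.classifiers = classifiers
        self.active = 0  # number of base classifiers currently in use

    def fit(self, X, y, i):
        self.active = i + 1
        for clf in self.classifiers[:self.active]:
            clf.fit(X, y)

    def predict(self, X):
        # Average the positive-class probabilities and threshold at 0.5
        proba = np.mean([clf.predict_proba(X)[:, 1]
                         for clf in self.classifiers[:self.active]], axis=0)
        return (proba >= 0.5).astype(int)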


# Evaluate the effect of the number or the composition of an APE model's base
# classifiers, using the AUC metric.
# mode=0 studies the number of base classifiers and returns the scores for every ensemble size;
# mode=1 studies the composition and returns only the score at the requested size
def ensemble_forward_pass(classifiers, X, y, n_classifiers=None, mode=0):
    clf_list = ut.EnsembleClassifiers(classifiers)
    auc_scores = np.zeros(n_classifiers)
    for i in range(n_classifiers):
        # 10-fold stratified cross-validation
        skf = model_selection.StratifiedKFold(n_splits=10)
        scores = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf_list.fit(X_train, y_train, i)
            y_pred = clf_list.predict(X_test)
            scores += [metrics.roc_auc_score(y_test, y_pred)]
        auc_scores[i] = np.mean(scores)
        print("Score: %.3f, n_classifiers: %d" % (auc_scores[i], i + 1))
    if mode == 0:
        return auc_scores, np.arange(n_classifiers) + 1
    elif mode == 1:
        return auc_scores[n_classifiers - 1]
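

# A minimal usage sketch; the synthetic data below is an illustrative stand-in,
# not part of this module (real callers would load a defect dataset instead).
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = rng.random((200, 20))         # non-negative features (MultinomialNB requires them)
    y_demo = rng.integers(0, 2, size=200)  # stand-in binary defect labels
    # Compare RF, W-SVM and APE under Fisher-score feature selection
    run_method(["RF", "W_SVM", "APE"], X_demo, y_demo,
               fs_functions=["fisher"], score_name="AUC", experiment="1")
    plt.legend()
    plt.show()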