from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn import model_selection
from sklearn import metrics
import utils as ut
import numpy as np
import matplotlib.pyplot as plt
def run_method(method, X, y, n_classifiers=-1, fs_functions=None, score_name="AUC", experiment="0"):
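    """Run one of four experiments and plot the results.

    experiment "0" compares feature selection methods on a fixed SVM;
    "1" compares the models (RF / W-SVM / APE) under a single feature
    selection method; "2" and "3" study the effect of the number of base
    classifiers on a homogeneous / heterogeneous APE ensemble; "4"
    compares base classifier combinations at a fixed ensemble size.
    """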
    # Experiment "0": compare feature selection algorithms against one another.
    if experiment == "0":
        # An SVM used to assess the different feature selection methods
        # (fs_functions may of course be None, in which case no feature
        # selection is applied).
        svm = SVC(class_weight="balanced", probability=True)  # these parameter settings may warrant further tuning
for fs in fs_functions:
print("FEATURE SELECTION: %s\n" % fs)
if fs in ["pearson", "fisher"]:
print("Ranking features using %s ..." % fs)
                # Score every feature, then sort from high to low:
                # ft_ranks holds the feature indices after sorting,
                # scores holds the corresponding sorted scores (computed by a
                # chosen statistical criterion).
ft_ranks, scores = ut.rank_features(np.array(X), y, corr=fs)
                # Select features from highest score to lowest, then train and
                # test on the datasets built from those features:
                # step_size is the number of features added per step,
                # scores is the list of resulting scores,
                # selected_features is the list of feature sets selected at
                # each step (each itself a list).
scores, selected_features = ut.compute_feature_curve(svm, X, y,
ft_ranks=ft_ranks,
step_size=1,
score_name=score_name)
elif fs == "greedy":
scores, selected_features = ut.greedy_selection(svm, X, y, score_name=score_name)
            # plot the results
plt.plot(selected_features, scores, label=fs)
    # Experiment "1": compare the models (RF, W-SVM, APE). Note that
    # fs_functions must contain exactly one entry here.
elif experiment == "1":
if len(fs_functions) == 1:
for model in method:
if model == "RF":
print("MODEL: RANDOM FOREST")
                    rfc = RandomForestClassifier(random_state=0)  # build the model: a random forest
scores, selected_features = evaluate_rf(rfc, X, y, fs_functions[0], score_name)
elif model == "W_SVM":
print("MODEL: W_SVM")
w_svm = SVC(class_weight="balanced", probability=True)
scores, selected_features = evaluate_rf(w_svm, X, y, fs_functions[0], score_name)
# classifiers = []
# for c in [1, 10, 100, 500, 1000]:
# for w in [{1: 5}, {1: 10}, {1: 15}, {1: 20}, {1: 25}]:
# classifiers += [SVC(probability=True, C=c, class_weight=w)]
# if n_classifiers == -1:
# scores, selected_features = evaluate_w_svm(classifiers, X, y, 5, fs_functions[0], score_name)
elif model == "APE":
print("MODEL: APE")
ape_classifiers = [SVC(probability=True),
MultinomialNB(alpha=0.001),
BernoulliNB(alpha=0.001),
RandomForestClassifier(n_estimators=20),
GradientBoostingClassifier(n_estimators=300),
SGDClassifier(alpha=.0001, loss='log_loss', n_iter_no_change=50,
penalty="elasticnet", max_iter=10000),
LogisticRegression(penalty='l2', max_iter=10000)]
scores, selected_features = evaluate_ape(ape_classifiers, X, y, 7, fs_functions[0], score_name)
                # plot the results
plt.plot(selected_features, scores, label=model)
        else:
            print("Only one feature selection algorithm may be specified!")
    # Experiment "2": study how the number of base classifiers affects a homogeneous APE ensemble.
elif experiment == "2":
classifiers = []
for c in [1, 10, 100, 500, 1000]:
for w in [{1: 5}, {1: 10}, {1: 15}, {1: 20}, {1: 25}]:
classifiers += [SVC(probability=True, C=c, class_weight=w)]
scores, x_values = ensemble_forward_pass(classifiers, X, y, n_classifiers, mode=0)
        plt.plot(x_values, scores, label="Homogeneous APE")
    # Experiment "3": study how the number of base classifiers affects a heterogeneous APE ensemble.
elif experiment == "3":
classifiers = [LogisticRegression(penalty='l2', max_iter=10000),
KNeighborsClassifier(),
RandomForestClassifier(n_estimators=20),
tree.DecisionTreeClassifier(),
SVC(probability=True),
MultinomialNB(alpha=0.001),
GradientBoostingClassifier(n_estimators=300),
AdaBoostClassifier(),
GaussianNB(),
BernoulliNB(alpha=0.001),
SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet")]
scores, x_values = ensemble_forward_pass(classifiers, X, y, n_classifiers, mode=0)
        plt.plot(x_values, scores, label="Heterogeneous APE")
    # Experiment "4": study how different base classifier combinations affect a heterogeneous APE ensemble.
elif experiment == "4":
classifiers_a = [BernoulliNB(alpha=0.001),
tree.DecisionTreeClassifier(),
LogisticRegression(penalty='l2', max_iter=10000),
SVC(probability=True),
SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet"),
GradientBoostingClassifier(n_estimators=300)]
classifiers_b = [MultinomialNB(alpha=0.001),
tree.DecisionTreeClassifier(),
LogisticRegression(penalty='l2', max_iter=10000),
SVC(probability=True),
KNeighborsClassifier(),
AdaBoostClassifier()]
classifiers_c = [BernoulliNB(alpha=0.001),
MultinomialNB(alpha=0.001),
tree.DecisionTreeClassifier(),
RandomForestClassifier(n_estimators=20),
KNeighborsClassifier(),
SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet")]
classifiers_d = [GaussianNB(),
RandomForestClassifier(n_estimators=20),
GradientBoostingClassifier(n_estimators=300),
SVC(probability=True),
tree.DecisionTreeClassifier(),
AdaBoostClassifier()]
classifiers_e = [BernoulliNB(alpha=0.001),
GaussianNB(),
RandomForestClassifier(n_estimators=20),
SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet"),
GradientBoostingClassifier(n_estimators=300),
AdaBoostClassifier()]
classifiers_f = [RandomForestClassifier(n_estimators=20),
SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet"),
LogisticRegression(penalty='l2', max_iter=10000),
SVC(probability=True),
AdaBoostClassifier(),
KNeighborsClassifier()]
classifiers_list = [classifiers_a, classifiers_b, classifiers_c, classifiers_d, classifiers_e, classifiers_f]
scores = []
        labels = ["Combination A", "Combination B", "Combination C", "Combination D", "Combination E", "Combination F"]
for i in range(6):
scores += [ensemble_forward_pass(classifiers_list[i], X, y, 6, mode=1)]
plt.bar(labels, scores, label="AUC")
plt.yticks(np.arange(0.90, 1.01, step=0.02))
        plt.ylim(0.89, 1.02)  # the ymin/ymax keyword arguments were removed in matplotlib 3.0
# Evaluate a single model over feature subsets. Despite the name, this works
# for any sklearn-style classifier; it is reused for the weighted SVM above.
def evaluate_rf(model, X, y, fs_function, score_name="AUC"):
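    """Evaluate `model` over growing feature subsets chosen by `fs_function`."""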
if fs_function in ["pearson", "fisher"]:
        # rank and score every feature
ft_ranks, scores = ut.rank_features(np.array(X), y, corr=fs_function)
        # Grow the feature set by step_size features at a time and record the
        # model's performance at each number of retained features.
scores, selected_features = ut.compute_feature_curve(model, X, y,
ft_ranks=ft_ranks,
step_size=1,
score_name=score_name)
elif fs_function == "greedy":
scores, selected_features = ut.greedy_selection(model, X, y, score_name=score_name)
return scores, selected_features
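
# For reference, a minimal sketch of what ut.rank_features might compute for
# corr="fisher" on a binary target: the classic Fisher score
# (mu1 - mu0)^2 / (s1^2 + s0^2) per feature, sorted high to low. This is an
# illustration only; the real implementation lives in utils.py and may differ.
def _fisher_rank_sketch(X, y):
    X, y = np.asarray(X, dtype=float), np.asarray(y)
    X0, X1 = X[y == 0], X[y == 1]
    # between-class separation over within-class spread, per feature
    fisher = (X1.mean(axis=0) - X0.mean(axis=0)) ** 2 / (X1.var(axis=0) + X0.var(axis=0) + 1e-12)
    ft_ranks = np.argsort(fisher)[::-1]  # feature indices, best first
    return ft_ranks, fisher[ft_ranks]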
# Evaluate the APE model's performance on different feature subsets.
def evaluate_ape(classifiers, X, y, n_classifiers, fs_function, score_name="AUC"):
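    """Evaluate an APE ensemble built from `classifiers` over feature subsets
    chosen by `fs_function`."""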
    # The line below arguably belongs in utils:
    # clf_list = ut.EnsembleClassifiers(classifiers)
    # Note that the logic for ["pearson", "fisher"] differs substantially from
    # "greedy", so they are handled by different functions.
if fs_function in ["pearson", "fisher"]:
ft_ranks, scores = ut.rank_features(np.array(X), y, corr=fs_function)
scores, selected_features = ut.compute_ape_feature_curve(classifiers, n_classifiers, X, y,
ft_ranks=ft_ranks,
step_size=1,
score_name=score_name)
elif fs_function == "greedy":
scores, selected_features = ut.compute_ape_feature_curve_with_greedy(classifiers, n_classifiers, X, y,
score_name=score_name)
return scores, selected_features
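
# A minimal sketch of the greedy forward selection that ut.greedy_selection
# presumably performs: at each step, add the single remaining feature that
# maximizes cross-validated AUC. Illustrative only; the utils version may
# differ in its scoring and stopping criteria.
def _greedy_selection_sketch(model, X, y, max_features=10):
    X, y = np.asarray(X), np.asarray(y)
    chosen, scores = [], []
    for _ in range(max_features):
        best_j, best_score = None, -np.inf
        for j in (j for j in range(X.shape[1]) if j not in chosen):
            score = model_selection.cross_val_score(
                model, X[:, chosen + [j]], y, cv=5, scoring="roc_auc").mean()
            if score > best_score:
                best_j, best_score = j, score
        chosen.append(best_j)
        scores.append(best_score)
    return scores, list(range(1, len(chosen) + 1))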
# Evaluate the number or the composition of the APE model's base classifiers, using the AUC metric.
# mode=0 studies the number of base classifiers and returns the score at every ensemble size.
# mode=1 studies the composition of the ensemble and returns only the score at the requested size.
def ensemble_forward_pass(classifiers, X, y, n_classifiers=None, mode=0):
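    """Measure mean AUC as base classifiers are added one at a time.

    n_classifiers is required despite its None default. With mode=0 the
    function returns the AUC at every ensemble size from 1 to
    n_classifiers together with those sizes; with mode=1 it returns only
    the AUC at size n_classifiers.
    """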
clf_list = ut.EnsembleClassifiers(classifiers)
auc_scores = np.zeros(n_classifiers)
for i in range(n_classifiers):
        # 10-fold stratified cross-validation
skf = model_selection.StratifiedKFold(n_splits=10)
scores = []
for train_index, test_index in skf.split(X, y):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
clf_list.fit(X_train, y_train, i)
y_pred = clf_list.predict(X_test)
scores += [metrics.roc_auc_score(y_test, y_pred)]
auc_scores[i] = np.mean(scores)
print("Score: %.3f, n_classifiers: %d" % (auc_scores[i], i + 1))
if mode == 0:
return auc_scores, np.arange(n_classifiers) + 1
elif mode == 1:
return auc_scores[n_classifiers - 1]
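
# A possible driver, assuming utils.py (imported as ut above) provides
# rank_features, compute_feature_curve, greedy_selection and
# EnsembleClassifiers. The synthetic dataset below is a stand-in for the
# real data, which is not part of this module.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.preprocessing import MinMaxScaler

    X, y = make_classification(n_samples=200, n_features=30, n_informative=8,
                               weights=[0.8, 0.2], random_state=0)
    # MultinomialNB and BernoulliNB require non-negative inputs
    X = MinMaxScaler().fit_transform(X)

    plt.figure()
    # Experiment "3": heterogeneous APE vs. number of base classifiers
    run_method(method=None, X=X, y=y, n_classifiers=11, experiment="3")
    plt.xlabel("Number of base classifiers")
    plt.ylabel("AUC")
    plt.legend()
    plt.show()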