# methods.py — 流火Antares/software_defect_prediction (Gitee)
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn import model_selection
from sklearn import metrics
import utils as ut
import numpy as np
import matplotlib.pyplot as plt  # prefer pyplot over the deprecated pylab interface


def run_method(method, X, y, n_classifiers=-1, fs_functions=None, score_name="AUC", experiment="0"):
    # Experiment "0": compare the quality of different feature-selection algorithms
    if experiment == "0":
        # Build an SVM that serves as the common evaluator for the feature-selection
        # methods (fs_functions may be None, in which case the features are left untouched)
        svm = SVC(class_weight="balanced", probability=True)  # these parameter settings may deserve further tuning
        for fs in (fs_functions or []):  # guard against fs_functions=None
            print("FEATURE SELECTION: %s\n" % fs)
            if fs in ["pearson", "fisher"]:
                print("Ranking features using %s ..." % fs)
                # Score each feature and sort from high to low:
                # ft_ranks holds the indices of the sorted features,
                # scores holds the sorted score values
                ft_ranks, scores = ut.rank_features(np.array(X), y, corr=fs)
                # Grow the feature set from the best-ranked feature downwards and
                # train/test on each subset.
                # step_size is the number of features added per step;
                # scores is the list of resulting scores;
                # selected_features is a list of the feature subsets chosen at each step
                scores, selected_features = ut.compute_feature_curve(svm, X, y,
                                                                     ft_ranks=ft_ranks,
                                                                     step_size=1,
                                                                     score_name=score_name)
            elif fs == "greedy":
                scores, selected_features = ut.greedy_selection(svm, X, y, score_name=score_name)
            # Plot the results
            plt.plot(selected_features, scores, label=fs)
    # Experiment "1": compare the models (RF, W-SVM, APE); note that fs_functions
    # must contain exactly one feature-selection method here
    elif experiment == "1":
        if len(fs_functions) == 1:
            for model in method:
                if model == "RF":
                    print("MODEL: RANDOM FOREST")
                    rfc = RandomForestClassifier(random_state=0)  # build the random forest model
                    scores, selected_features = evaluate_rf(rfc, X, y, fs_functions[0], score_name)
                elif model == "W_SVM":
                    print("MODEL: W_SVM")
                    w_svm = SVC(class_weight="balanced", probability=True)
                    scores, selected_features = evaluate_rf(w_svm, X, y, fs_functions[0], score_name)
                    # classifiers = []
                    # for c in [1, 10, 100, 500, 1000]:
                    #     for w in [{1: 5}, {1: 10}, {1: 15}, {1: 20}, {1: 25}]:
                    #         classifiers += [SVC(probability=True, C=c, class_weight=w)]
                    # if n_classifiers == -1:
                    #     scores, selected_features = evaluate_w_svm(classifiers, X, y, 5, fs_functions[0], score_name)
                elif model == "APE":
                    print("MODEL: APE")
                    ape_classifiers = [SVC(probability=True),
                                       MultinomialNB(alpha=0.001),
                                       BernoulliNB(alpha=0.001),
                                       RandomForestClassifier(n_estimators=20),
                                       GradientBoostingClassifier(n_estimators=300),
                                       SGDClassifier(alpha=.0001, loss='log_loss', n_iter_no_change=50,
                                                     penalty="elasticnet", max_iter=10000),
                                       LogisticRegression(penalty='l2', max_iter=10000)]
                    scores, selected_features = evaluate_ape(ape_classifiers, X, y, 7, fs_functions[0], score_name)
                # Plot the results
                plt.plot(selected_features, scores, label=model)
        else:
            print("Only one feature-selection algorithm may be specified!")
    # Experiment "2": study how the number of base classifiers affects a
    # homogeneous APE model
    elif experiment == "2":
        classifiers = []
        for c in [1, 10, 100, 500, 1000]:
            for w in [{1: 5}, {1: 10}, {1: 15}, {1: 20}, {1: 25}]:
                classifiers += [SVC(probability=True, C=c, class_weight=w)]
        scores, x_values = ensemble_forward_pass(classifiers, X, y, n_classifiers, mode=0)
        plt.plot(x_values, scores, label="homogeneous APE")
    # Experiment "3": study how the number of base classifiers affects a
    # heterogeneous APE model
    elif experiment == "3":
        classifiers = [LogisticRegression(penalty='l2', max_iter=10000),
                       KNeighborsClassifier(),
                       RandomForestClassifier(n_estimators=20),
                       tree.DecisionTreeClassifier(),
                       SVC(probability=True),
                       MultinomialNB(alpha=0.001),
                       GradientBoostingClassifier(n_estimators=300),
                       AdaBoostClassifier(),
                       GaussianNB(),
                       BernoulliNB(alpha=0.001),
                       SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet")]
        scores, x_values = ensemble_forward_pass(classifiers, X, y, n_classifiers, mode=0)
        plt.plot(x_values, scores, label="heterogeneous APE")
    # Experiment "4": study how different combinations of base classifiers
    # affect a heterogeneous APE model
    elif experiment == "4":
        classifiers_a = [BernoulliNB(alpha=0.001),
                         tree.DecisionTreeClassifier(),
                         LogisticRegression(penalty='l2', max_iter=10000),
                         SVC(probability=True),
                         SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet"),
                         GradientBoostingClassifier(n_estimators=300)]
        classifiers_b = [MultinomialNB(alpha=0.001),
                         tree.DecisionTreeClassifier(),
                         LogisticRegression(penalty='l2', max_iter=10000),
                         SVC(probability=True),
                         KNeighborsClassifier(),
                         AdaBoostClassifier()]
        classifiers_c = [BernoulliNB(alpha=0.001),
                         MultinomialNB(alpha=0.001),
                         tree.DecisionTreeClassifier(),
                         RandomForestClassifier(n_estimators=20),
                         KNeighborsClassifier(),
                         SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet")]
        classifiers_d = [GaussianNB(),
                         RandomForestClassifier(n_estimators=20),
                         GradientBoostingClassifier(n_estimators=300),
                         SVC(probability=True),
                         tree.DecisionTreeClassifier(),
                         AdaBoostClassifier()]
        classifiers_e = [BernoulliNB(alpha=0.001),
                         GaussianNB(),
                         RandomForestClassifier(n_estimators=20),
                         SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet"),
                         GradientBoostingClassifier(n_estimators=300),
                         AdaBoostClassifier()]
        classifiers_f = [RandomForestClassifier(n_estimators=20),
                         SGDClassifier(alpha=.0001, loss='log_loss', max_iter=50, penalty="elasticnet"),
                         LogisticRegression(penalty='l2', max_iter=10000),
                         SVC(probability=True),
                         AdaBoostClassifier(),
                         KNeighborsClassifier()]
        classifiers_list = [classifiers_a, classifiers_b, classifiers_c, classifiers_d, classifiers_e, classifiers_f]
        scores = []
        labels = ["Combination A", "Combination B", "Combination C", "Combination D", "Combination E", "Combination F"]
        for i in range(6):
            scores += [ensemble_forward_pass(classifiers_list[i], X, y, 6, mode=1)]
        plt.bar(labels, scores, label="AUC")
        plt.yticks(np.arange(0.90, 1.01, step=0.02))
        plt.ylim(0.89, 1.02)  # the ymin/ymax keywords were removed in newer matplotlib
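

# For reference, a minimal sketch of the Fisher-score ranking that
# ut.rank_features(..., corr="fisher") plausibly performs; utils.py is not
# shown on this page, so treat the helper below as an illustrative assumption,
# not the project's actual implementation.
def fisher_score_sketch(X, y):
    """Rank features by Fisher score for a binary target (hypothetical helper)."""
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    pos, neg = X[y == 1], X[y == 0]
    # Between-class separation over within-class spread, computed per feature
    numer = (pos.mean(axis=0) - neg.mean(axis=0)) ** 2
    denom = pos.var(axis=0) + neg.var(axis=0) + 1e-12  # avoid division by zero
    scores = numer / denom
    ft_ranks = np.argsort(scores)[::-1]  # feature indices, best first
    return ft_ranks, scores[ft_ranks]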


# Evaluate a single model under the given feature-selection method; despite the
# name, any sklearn classifier works (run_method reuses it for W-SVM above)
def evaluate_rf(model, X, y, fs_function, score_name="AUC"):
    if fs_function in ["pearson", "fisher"]:
        # Get each feature's rank and score
        ft_ranks, scores = ut.rank_features(np.array(X), y, corr=fs_function)
        # Grow the feature set by step_size at a time and record the model's
        # performance for each number of retained features
        scores, selected_features = ut.compute_feature_curve(model, X, y,
                                                             ft_ranks=ft_ranks,
                                                             step_size=1,
                                                             score_name=score_name)
    elif fs_function == "greedy":
        scores, selected_features = ut.greedy_selection(model, X, y, score_name=score_name)
    else:
        raise ValueError("unknown fs_function: %s" % fs_function)  # avoid returning unbound names
    return scores, selected_features
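

# Similarly, a minimal sketch of the greedy forward selection that
# ut.greedy_selection presumably performs: repeatedly add the single feature
# that most improves cross-validated AUC. Hypothetical, for illustration only.
def greedy_selection_sketch(model, X, y, max_features=10):
    X = np.asarray(X, dtype=float)
    chosen, curve = [], []
    remaining = list(range(X.shape[1]))
    for _ in range(min(max_features, len(remaining))):
        best_ft, best_score = None, -np.inf
        for ft in remaining:
            cols = chosen + [ft]
            score = model_selection.cross_val_score(
                model, X[:, cols], y, cv=5, scoring="roc_auc").mean()
            if score > best_score:
                best_ft, best_score = ft, score
        chosen.append(best_ft)
        remaining.remove(best_ft)
        curve.append(best_score)  # score with len(chosen) features selected
    return curve, list(range(1, len(chosen) + 1))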


# Evaluate an APE model's performance under different feature sets
def evaluate_ape(classifiers, X, y, n_classifiers, fs_function, score_name="AUC"):
    # The following line should arguably be moved into utils:
    # clf_list = ut.EnsembleClassifiers(classifiers)
    # Note that [pearson, fisher] and [greedy] follow very different logic,
    # so they need separate helper functions
    if fs_function in ["pearson", "fisher"]:
        ft_ranks, scores = ut.rank_features(np.array(X), y, corr=fs_function)
        scores, selected_features = ut.compute_ape_feature_curve(classifiers, n_classifiers, X, y,
                                                                 ft_ranks=ft_ranks,
                                                                 step_size=1,
                                                                 score_name=score_name)
    elif fs_function == "greedy":
        scores, selected_features = ut.compute_ape_feature_curve_with_greedy(classifiers, n_classifiers, X, y,
                                                                             score_name=score_name)
    else:
        raise ValueError("unknown fs_function: %s" % fs_function)  # avoid returning unbound names
    return scores, selected_features
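

# ut.EnsembleClassifiers is defined in utils.py, which is not shown here.
# Judging from its use in ensemble_forward_pass below (fit(X, y, i) followed by
# predict(X)), it plausibly trains the first i + 1 base classifiers and
# soft-votes by averaging their predicted probabilities. A minimal sketch under
# that assumption:
class EnsembleClassifiersSketch:
    def __init__(self, classifiers):
        self.classifiers = classifiers
        self.active = 0  # number of base classifiers currently in use

    def fit(self, X, y, i):
        self.active = i + 1
        for clf in self.classifiers[:self.active]:
            clf.fit(X, y)

    def predict(self, X):
        # Average the positive-class probabilities and threshold at 0.5
        proba = np.mean([clf.predict_proba(X)[:, 1]
                         for clf in self.classifiers[:self.active]], axis=0)
        return (proba >= 0.5).astype(int)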


# Evaluate the effect of the number or the composition of an APE model's base
# classifiers, using the AUC metric.
# mode=0 studies the number of base classifiers and returns the scores for every ensemble size;
# mode=1 studies the composition and returns only the score at the requested size
def ensemble_forward_pass(classifiers, X, y, n_classifiers=None, mode=0):
    clf_list = ut.EnsembleClassifiers(classifiers)
    auc_scores = np.zeros(n_classifiers)
    for i in range(n_classifiers):
        # 10-fold stratified cross-validation
        skf = model_selection.StratifiedKFold(n_splits=10)
        scores = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf_list.fit(X_train, y_train, i)
            y_pred = clf_list.predict(X_test)
            scores += [metrics.roc_auc_score(y_test, y_pred)]
        auc_scores[i] = np.mean(scores)
        print("Score: %.3f, n_classifiers: %d" % (auc_scores[i], i + 1))
    if mode == 0:
        return auc_scores, np.arange(n_classifiers) + 1
    elif mode == 1:
        return auc_scores[n_classifiers - 1]
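

# A minimal usage sketch; the synthetic data below is an illustrative stand-in,
# not part of this module (real callers would load a defect dataset instead).
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = rng.random((200, 20))         # non-negative features (MultinomialNB requires them)
    y_demo = rng.integers(0, 2, size=200)  # stand-in binary defect labels
    # Compare RF, W-SVM and APE under Fisher-score feature selection
    run_method(["RF", "W_SVM", "APE"], X_demo, y_demo,
               fs_functions=["fisher"], score_name="AUC", experiment="1")
    plt.legend()
    plt.show()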