Loading [MathJax]/jax/output/HTML-CSS/jax.js
1 Star 3 Fork 2

shiyuxin233/My Machine Learning

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
Machine Learning 04 Linear Regression.ipynb 330.73 KB
一键复制 编辑 原始数据 按行查看 历史
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline

# Apply seaborn's default theme to every matplotlib figure created below.
sns.set()
# Use the SimHei font family for all text; the plot titles in this notebook
# are Chinese (assumes SimHei is installed on this machine -- TODO confirm).
plt.rc('font', family='SimHei')
# Draw minus signs with the ASCII hyphen instead of the Unicode minus;
# presumably needed because the chosen font lacks that glyph.
plt.rc('axes', unicode_minus=False)

线性回归

基函数回归

通过使用基函数对数据进行预处理,可以将线性的模型转变为非线性的模型,常用的基函数有多项式基函数与高斯基函数;

多项式基函数

使用sklearn提供的PolynomialFeatures可以很方便地进行多项式拟合; 例如,这里使用一个7次的多项式模型来拟合一个带有噪声的正弦波;

# Fit a degree-7 polynomial to a noisy sine wave with ordinary least squares.
# Training inputs are uniform on [0, 2*pi); both arrays are column vectors
# because scikit-learn expects 2-D inputs.
x_fit = (np.random.rand(100) * 2 * np.pi)[:, np.newaxis]
y_fit = np.sin(x_fit) + 0.2 * np.random.rand(100)[:, np.newaxis]

plt.figure(figsize=(10, 10))
plt.plot(x_fit, y_fit, 'o')

# The polynomial expansion is fitted once, on the training data only.
poly = PolynomialFeatures(degree=7)
model = LinearRegression(fit_intercept=True)
model.fit(poly.fit_transform(x_fit), y_fit)

print(f'斜率拟合结果:\n{model.coef_}\n')
print(f'截距拟合结果:\n{model.intercept_}\n')

# Evaluate on a grid slightly wider than the training range so the plot also
# shows how the polynomial behaves just outside the data.
x_grid = np.linspace(-1, 7, 100)
# Use transform(), not fit_transform(): evaluation data must go through the
# already-fitted transformer rather than refit it.
res = model.predict(poly.transform(x_grid[:, np.newaxis]))

plt.plot(x_grid, res)
plt.title('使用多项式基函数拟合正弦波(线性回归)')
斜率拟合结果:
[[ 0.00000000e+00  7.62873777e-01  4.94286344e-01 -5.73239384e-01
   1.56268610e-01 -1.90622234e-02  1.26236424e-03 -4.29223064e-05]]

截距拟合结果:
[0.11910499]

Text(0.5, 1.0, '使用多项式基函数拟合正弦波(线性回归)')

高斯基函数

高斯基函数使用若干个高斯分布来对数据进行拟合,sklearn默认不提供已经实现的高斯基函数类,但是我们可以自己写:

class GaussianFeatures(BaseEstimator, TransformerMixin):
    """Expand inputs into uniformly spaced Gaussian basis features.

    ``fit`` places ``N`` centers evenly between the minimum and maximum of the
    training input and derives a single shared width from the spacing between
    neighbouring centers. ``transform`` evaluates every Gaussian at every
    sample.

    Parameters
    ----------
    N : int
        Number of Gaussian centers, i.e. number of output features. Must be
        at least 2 because the width is derived from the gap between the
        first two centers.
    width_factor : float, default 2.0
        Multiplier applied to the center spacing to obtain the width.

    Attributes
    ----------
    centers_ : numpy.ndarray of shape (N,), or None before ``fit``
        Locations of the Gaussian centers.
    width_ : float, or None before ``fit``
        Shared width of all Gaussians.
    """

    def __init__(self, N: int, width_factor: float = 2.0):
        self.N = N
        self.width_factor = width_factor
        # NOTE(review): scikit-learn's convention is to create trailing-
        # underscore attributes only inside fit(); they are pre-set to None
        # here to keep the existing pre-fit attribute access working.
        self.centers_ = None
        self.width_ = None

    @staticmethod
    def _gauss_basis(x, y, width, axis=None):
        """Return ``exp(-0.5 * sum(((x - y) / width) ** 2))`` summed over ``axis``."""
        arg = (x - y) / width
        return np.exp(-0.5 * np.sum(arg ** 2, axis))

    def fit(self, x, y=None):
        """Compute the centers and the shared width from ``x``.

        Parameters
        ----------
        x : array-like
            Training input; only its overall minimum and maximum are used.
        y : ignored
            Present so the transformer can be used inside a pipeline.

        Returns
        -------
        GaussianFeatures
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``N`` is smaller than 2.
        """
        # The width is taken from centers_[1] - centers_[0], so fewer than two
        # centers would otherwise fail with an opaque IndexError.
        if self.N < 2:
            raise ValueError(
                f"GaussianFeatures requires N >= 2 to derive a width, got N={self.N!r}"
            )
        # Accept lists / DataFrames as well as ndarrays.
        x = np.asarray(x)
        self.centers_ = np.linspace(x.min(), x.max(), self.N)
        self.width_ = self.width_factor * (self.centers_[1] - self.centers_[0])
        return self

    def transform(self, x):
        """Evaluate every Gaussian basis function at every row of ``x``.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            Input to expand; must be 2-D.

        Returns
        -------
        numpy.ndarray of shape (n_samples, N)
            One column per Gaussian center.
        """
        x = np.asarray(x)
        # Add a trailing axis so each value broadcasts against all N centers,
        # then sum the squared distances over the feature axis (axis=1).
        return self._gauss_basis(x[:, :, np.newaxis], self.centers_, self.width_, axis=1)


# Noisy sine-wave samples. The inputs are drawn first and the noise second,
# while both arrays are still 1-D so the addition stays element-wise.
x_fit = np.random.rand(100) * 2 * np.pi
y_fit = np.sin(x_fit) + 0.2 * np.random.rand(100)

# scikit-learn expects 2-D inputs, so turn everything into column vectors.
x_fit, y_fit = x_fit.reshape(-1, 1), y_fit.reshape(-1, 1)
x_test = np.linspace(0, 2 * np.pi, 200).reshape(-1, 1)

# 20 Gaussian basis functions followed by ordinary least squares.
model = make_pipeline(GaussianFeatures(20), LinearRegression())
res = model.fit(x_fit, y_fit).predict(x_test)

# Training samples as markers, fitted curve as a line.
plt.figure(figsize=(10, 10))
plt.plot(x_fit, y_fit, 'o')
plt.plot(x_test, res)
plt.title('使用高斯基函数拟合正弦波(线性回归)')
Text(0.5, 1.0, '使用高斯基函数拟合正弦波(线性回归)')

正则化

在线性回归中,如果使用过于复杂的模型会造成过拟合,所以我们需要对较大的模型参数进行抑制,即正则化

岭回归(L2范数正则化)

岭回归的本质就是在线性回归原有的损失函数上加上下面的惩罚项: $P = \alpha \sum_{n=1}^{N} \theta_n^2$

参数α由用户自行设置,用于控制对损失函数的惩罚力度(α越大,对较大系数的抑制越强); 书上对岭回归解释得不详细,入门阶段可以先参考别人写的博客; sklearn中的岭回归由Ridge类实现;

# Compare plain least squares with ridge regression on the same Gaussian basis.
# The basis size is named once so the pipelines and the coefficient plots
# cannot drift apart.
n_basis = 30

# 60 noisy sine samples on [-pi, pi) and a dense evaluation grid.
x_fit = np.random.rand(60) * 2 * np.pi - np.pi
y_fit = np.sin(x_fit) + 0.3 * np.random.rand(60)
x_test = np.linspace(-np.pi, np.pi, 200)
# scikit-learn expects 2-D inputs, so turn everything into column vectors.
x_fit, y_fit, x_test = x_fit[:, np.newaxis], y_fit[:, np.newaxis], x_test[:, np.newaxis]

model1 = make_pipeline(GaussianFeatures(n_basis), LinearRegression())
model2 = make_pipeline(GaussianFeatures(n_basis), Ridge(alpha=0.2))

model1.fit(x_fit, y_fit)
model2.fit(x_fit, y_fit)
res1, res2 = model1.predict(x_test), model2.predict(x_test)

# Four stacked panels: fit and coefficients for each of the two models.
fig, axs = plt.subplots(4, 1, figsize=(15, 20))
ax_linear = axs[0]  # type: plt.Axes
ax_coef_linear = axs[1]  # type: plt.Axes
ax_ridge = axs[2]  # type: plt.Axes
ax_coef_ridge = axs[3]  # type: plt.Axes

ax_linear.plot(x_fit, y_fit, 'o')
ax_linear.plot(x_test, res1)
ax_linear.set_xlim(-3.5, 3.5)
ax_linear.set_ylim(-1.5, 1.5)
ax_linear.set_title('简单线性回归结果')

# Pipeline steps are (name, estimator) pairs: step 0 is the basis expansion,
# step 1 the regressor. reshape(-1, 1) turns the coefficients into a column
# whatever the basis size or the regressor's coef_ layout is.
basis1, regressor1 = model1.steps[0][1], model1.steps[1][1]
ax_coef_linear.plot(basis1.centers_[:, np.newaxis], regressor1.coef_.reshape(-1, 1))
ax_coef_linear.set_title('简单线性回归振幅')

ax_ridge.plot(x_fit, y_fit, 'o')
ax_ridge.plot(x_test, res2)
ax_ridge.set_xlim(-3.5, 3.5)
ax_ridge.set_ylim(-1.5, 1.5)
ax_ridge.set_title('岭回归结果')

basis2, regressor2 = model2.steps[0][1], model2.steps[1][1]
ax_coef_ridge.plot(basis2.centers_[:, np.newaxis], regressor2.coef_.reshape(-1, 1))
ax_coef_ridge.set_title('岭回归振幅')
Text(0.5, 1.0, '岭回归振幅')

Lasso回归

公式: $P = \alpha \sum_{n=1}^{N} |\theta_n|$

Lasso回归通常用于构建稀疏模型(即相关性很小的特征的系数会被压缩为0,而不只是接近0)

# Compare ridge (L2) with Lasso (L1) regularisation on the same Gaussian basis.
# The basis size is named once so the pipelines and the coefficient plots
# cannot drift apart.
n_basis = 30

# 60 noisy sine samples on [-pi, pi) and a dense evaluation grid.
x_fit = np.random.rand(60) * 2 * np.pi - np.pi
y_fit = np.sin(x_fit) + 0.3 * np.random.rand(60)
x_test = np.linspace(-np.pi, np.pi, 200)
# scikit-learn expects 2-D inputs, so turn everything into column vectors.
x_fit, y_fit, x_test = x_fit[:, np.newaxis], y_fit[:, np.newaxis], x_test[:, np.newaxis]

# model1 is the ridge baseline, model2 the Lasso model; alpha is passed by
# keyword so the regularisation strength is explicit.
model1 = make_pipeline(GaussianFeatures(n_basis), Ridge(alpha=0.2))
model2 = make_pipeline(GaussianFeatures(n_basis), Lasso(alpha=0.002))

model1.fit(x_fit, y_fit)
model2.fit(x_fit, y_fit)
res1, res2 = model1.predict(x_test), model2.predict(x_test)

# Four stacked panels: fit and coefficients for each of the two models.
# The axes are named after the model they actually display.
fig, axs = plt.subplots(4, 1, figsize=(15, 20))
ax_ridge = axs[0]  # type: plt.Axes
ax_coef_ridge = axs[1]  # type: plt.Axes
ax_lasso = axs[2]  # type: plt.Axes
ax_coef_lasso = axs[3]  # type: plt.Axes

ax_ridge.plot(x_fit, y_fit, 'o')
ax_ridge.plot(x_test, res1)
ax_ridge.set_xlim(-3.5, 3.5)
ax_ridge.set_ylim(-1.5, 1.5)
ax_ridge.set_title('岭回归结果')

# Pipeline steps are (name, estimator) pairs: step 0 is the basis expansion,
# step 1 the regressor. reshape(-1, 1) turns the coefficients into a column
# whatever the basis size or the regressor's coef_ layout is.
basis1, regressor1 = model1.steps[0][1], model1.steps[1][1]
ax_coef_ridge.plot(basis1.centers_[:, np.newaxis], regressor1.coef_.reshape(-1, 1))
ax_coef_ridge.set_title('岭回归振幅')

ax_lasso.plot(x_fit, y_fit, 'o')
ax_lasso.plot(x_test, res2)
ax_lasso.set_xlim(-3.5, 3.5)
ax_lasso.set_ylim(-1.5, 1.5)
ax_lasso.set_title('Lasso回归结果')

basis2, regressor2 = model2.steps[0][1], model2.steps[1][1]
ax_coef_lasso.plot(basis2.centers_[:, np.newaxis], regressor2.coef_.reshape(-1, 1))
ax_coef_lasso.set_title('Lasso回归振幅')
Text(0.5, 1.0, 'Lasso回归振幅')
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/shiyuxin233/My-Machine-Learning.git
git@gitee.com:shiyuxin233/My-Machine-Learning.git
shiyuxin233
My-Machine-Learning
My Machine Learning
master

搜索帮助

371d5123 14472233 46e8bd33 14472233