A Hands-On Implementation of Logistic Regression
In the article "The Logistic Regression Algorithm" we covered the theory behind logistic regression. In this article, we will follow that theory and implement the algorithm by hand in Python, to deepen our understanding, and then compare the results against scikit-learn's logistic regression tool.
I. Problem Description
Once again, we start from a concrete problem. We use the Iris dataset, a classic dataset from the UCI Machine Learning Repository. It describes iris flowers with four features, petal length and width and sepal length and width, plus a fifth column giving the species. The three species are: setosa, versicolor, and virginica. The dataset is available at the following link:
http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
To simplify the problem, we use only rows 50-150 of the dataset, i.e. the 100 samples belonging to the versicolor and virginica classes, and we take petal length and petal width as the input variables (features), matching the columns used in the code below. In other words, we are solving a binary classification problem.
First, let's look at how these two iris classes are distributed:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load the Iris dataset
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'category']
print(df.head())
Output (the first five rows of iris.data):

   sepal_length  sepal_width  petal_length  petal_width     category
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
# Scatter plot of the two iris classes
print('----------------Iris scatter plot-------------------')
plt.scatter(df.iloc[50:100, 2], df.iloc[50:100, 3], color='red', marker='o', label='versicolor')    # middle 50 samples
plt.scatter(df.iloc[100:150, 2], df.iloc[100:150, 3], color='blue', marker='x', label='virginica')  # last 50 samples
plt.xlabel('petal length')
plt.ylabel('petal width')
plt.legend(loc=2)
plt.show()
II. Implementing Logistic Regression in Python
1. The sigmoid function
\[
g(z) = \frac{1}{1 + e ^ {-z}}
\]
def sigmoid(z):
    """
    Sigmoid function
    :param z: m x 1 matrix
    :return: m x 1 matrix
    """
    return 1.0 / (1 + np.exp(-z))
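A quick sanity check (not part of the original code): the sigmoid should map 0 to 0.5 and saturate toward 0 and 1 at the tails.

# sigmoid(0) = 0.5; large |z| pushes the output to the asymptotes
z = np.array([[-10.0], [0.0], [10.0]])
print(sigmoid(z))  # roughly [[4.54e-05], [0.5], [0.9999546]]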
2. The model function
\[
h(X)=g(X \theta)
\]
def model(theta, X):
    """
    Model (hypothesis) function
    :param theta: n x 1 matrix
    :param X: m x n matrix
    :return: m x 1 matrix
    """
    return sigmoid(X.dot(theta))
3. The cost function
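The cost is the cross-entropy (log) loss; in the vectorized form that the code below computes:

\[
J(\theta) = -\frac{1}{m} \left[ y^T \log h(X) + (1 - y)^T \log \left(1 - h(X)\right) \right]
\]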
def cost_function(theta, X, y):
    """
    Cost function (cross-entropy loss)
    :param theta: n x 1 matrix
    :param X: m x n matrix
    :param y: m x 1 matrix
    :return: scalar
    """
    m = y.size
    h = sigmoid(X.dot(theta))
    J = -(1.0 / m) * ((np.log(h)).T.dot(y) + (np.log(1 - h)).T.dot(1 - y))
    return J[0, 0]
4. Gradient descent
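The vectorized gradient of the cost with respect to theta, and the gradient-descent update rule (alpha is the learning rate), which the two functions below implement:

\[
\frac{\partial J(\theta)}{\partial \theta} = \frac{1}{m} X^T \left( h(X) - y \right), \qquad \theta := \theta - \alpha \, \frac{\partial J(\theta)}{\partial \theta}
\]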
def derivative(theta, X, y):
    """
    Partial derivative of the cost with respect to theta
    :param theta: n x 1 matrix
    :param X: m x n matrix
    :param y: m x 1 matrix
    :return: n x 1 matrix
    """
    m = y.size
    h = sigmoid(X.dot(theta))
    grad = (1.0 / m) * X.T.dot(h - y)
    return grad


def gradient(grad_theta, theta, alpha):
    """
    Theta update step
    :param grad_theta: gradient of the cost w.r.t. theta (n x 1 matrix)
    :param theta: current theta (n x 1 matrix)
    :param alpha: learning rate
    :return: updated theta (n x 1 matrix)
    """
    return theta - alpha * grad_theta
5. The logistic regression algorithm
def logistic_regression(X, y):
    """
    Logistic regression training via gradient descent
    :param X: m x n matrix
    :param y: m x 1 matrix
    :return: cost history, theta (n x 1 matrix), iteration count
    """
    start = time.time()
    m = X.shape[0]                     # number of samples
    n = X.shape[1]                     # number of features
    y = y.reshape(m, 1)
    cost_record = []                   # history of cost values
    alpha = 0.1                        # learning rate
    maxiters = 1000                    # maximum number of iterations
    theta = np.zeros(n).reshape(n, 1)  # initial weights
    cost_val = cost_function(theta, X, y)
    cost_record.append(cost_val)
    iters = 0
    while True:
        if iters >= maxiters:
            break
        grad = derivative(theta, X, y)
        # update the weights
        new_theta = gradient(grad, theta, alpha)
        theta = new_theta
        cost_update = cost_function(new_theta, X, y)
        cost_val = cost_update
        cost_record.append(cost_val)
        iters += 1
    end = time.time()
    print("cost time: %f s" % (end - start))
    return cost_record, theta, iters
6. The prediction function
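Prediction simply thresholds the probability produced by the model function (0.5 by default):

\[
\hat{y} =
\begin{cases}
1, & h(x) \ge 0.5 \\
0, & h(x) < 0.5
\end{cases}
\]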
def predict(theta, X, threshold=0.5):
    """
    Prediction function
    :param theta: the learned theta
    :param X: input variables
    :param threshold: classification threshold
    :return: predicted labels (0/1)
    """
    p = model(theta, X) >= threshold
    return p.astype('int')
Finally, we train the logistic regression model on the dataset, using an 80/20 train/test split.
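Note: the training script below calls a min_max helper for feature scaling that is never defined in the original text. Here is a minimal sketch of what it presumably does (column-wise min-max normalization to [0, 1]); treat the exact definition as an assumption:

def min_max(X):
    """Assumed helper: scale each feature column to the [0, 1] range."""
    # subtract each column's minimum and divide by that column's range
    return (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))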
""" 通过判定花萼长度,花萼宽度,花瓣长度,花瓣宽度的尺寸大小来识别鸢尾花的类别 前四列分别为:花萼长度(sepal),花萼宽度,花瓣长度(petal),花瓣宽度 第5列为鸢尾花的类别(包括Setosa,Versicolour,Virginica三类) """ # 加载Iris数据集 df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None) df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'category'] print df.head() # 鸢尾花的分布图 print '----------------鸢尾花的分布图-------------------' plt.scatter(df.iloc[50:100, 2], df.iloc[50:100, 3],color='red', marker='o', label='versicolor') # 中间50个样本的散点图 plt.scatter(df.iloc[100:150, 2], df.iloc[100:150, 3],color='blue', marker='x', label='virginica') # 后50个样本的散点图 plt.xlabel('petal length') plt.ylabel('petal width') plt.legend(loc=2) plt.show() print '----------------数据预处理-------------------' df = df.iloc[50:150, :] df.loc[df['category'] == 'Iris-versicolor', 'category'] = 0 df.loc[df['category'] == 'Iris-virginica', 'category'] = 1 df['category'] = df['category'].astype('int') print df.head() print df.shape print df.dtypes # 取其中两个特征,即:花瓣长度,花瓣宽度,即第2列和第3列 X = df[['petal_length', 'petal_width']].values y = df[['category']].values X_std = np.c_[np.ones((df.shape[0], 1)), min_max(X)] X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.2, random_state=0) print X_train.shape print X_test.shape cost_record, theta, iters = logistic_regression(X_train, y_train) print '----------------训练结果-------------------' print "代价函数数组:", cost_record print "theta参数:", theta print "迭代次数:", iters print '-----------------测试-----------------' train_right_counts = 0 for i in range(X_train.shape[0]): original_val = y_train[i, :] predict_val = predict(theta, X_train[i, :]) # print "Original:", original_val, " Predict:", predict_val, ("o" if original_val == predict_val else "x") if original_val == predict_val: train_right_counts += 1 print "训练集准确率:", ((train_right_counts * 1.0) / X_train.shape[0]) test_right_counts = 0 for i in range(X_test.shape[0]): original_val = y_test[i, :] predict_val = predict(theta, X_test[i, :]) # print "Original:", original_val, " Predict:", predict_val, ("o" if original_val == predict_val else "x") if original_val == predict_val: test_right_counts += 1 print "测试集准确率:", ((test_right_counts * 1.0) / X_test.shape[0]) # 代价函数值收敛趋势图 plt.plot(np.arange(len(cost_record)-1), cost_record[1:], 'r') plt.show()
The trained model performs quite well on held-out data: the test-set accuracy reaches 95%.
Convergence curve of the cost function:
III. scikit-learn's Logistic Regression Tool
# -*- coding: UTF-8 -*-
import pandas as pd
from sklearn.model_selection import train_test_split

"""
Classify the iris species from the sepal and petal measurements.
The first four columns are: sepal length, sepal width, petal length, petal width.
The fifth column is the species (Setosa, Versicolour or Virginica).
"""
# Load the Iris dataset as a DataFrame
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'category']
df = df.iloc[50:150, :]
df.loc[df['category'] == 'Iris-versicolor', 'category'] = 0
df.loc[df['category'] == 'Iris-virginica', 'category'] = 1
df['category'] = df['category'].astype('int')

# Use two of the features: petal length and petal width (columns 2 and 3)
X = df[['petal_length', 'petal_width']].values
y = df[['category']].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardize the features (feature scaling)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train.ravel())
print(lr.coef_)

print('-----------------Testing-----------------')
train_right_counts = 0
for i in range(X_train_std.shape[0]):
    original_val = y_train[i, :]
    predict_val = lr.predict(X_train_std[i, :].reshape(1, -1))
    if original_val == predict_val:
        train_right_counts += 1
print("training-set accuracy:", ((train_right_counts * 1.0) / X_train_std.shape[0]))

test_right_counts = 0
for i in range(X_test_std.shape[0]):
    original_val = y_test[i, :]
    predict_val = lr.predict(X_test_std[i, :].reshape(1, -1))
    if original_val == predict_val:
        test_right_counts += 1
print("test-set accuracy:", ((test_right_counts * 1.0) / X_test_std.shape[0]))
In the test results, the test-set accuracy is identical to that of our hand-written logistic regression; only the training-set accuracy is slightly higher.
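As a side note, the per-sample counting loops in both scripts can be replaced with scikit-learn's accuracy_score. A minimal equivalent, assuming lr, X_train_std, X_test_std, y_train and y_test from the script above:

from sklearn.metrics import accuracy_score

# accuracy_score compares true labels with predictions and returns the fraction correct
print("training-set accuracy:", accuracy_score(y_train.ravel(), lr.predict(X_train_std)))
print("test-set accuracy:", accuracy_score(y_test.ravel(), lr.predict(X_test_std)))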