Overall workflow
Data preprocessing: standardize the features, then append a column of ones as the bias term
Training: gradient descent, translating the math formulas into code (the exact formulas are written out below)
Prediction
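For reference, the cost and update rule that the training step implements (in `cost_function` and `gradient_step` below) are the standard batch gradient descent equations for mean squared error, with design matrix X (bias column included), labels y, m samples, and learning rate α:

```latex
J(\theta) = \frac{1}{2m}\,(X\theta - y)^\top (X\theta - y),
\qquad
\theta \leftarrow \theta - \frac{\alpha}{m}\, X^\top (X\theta - y)
```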
Model code
```python
import numpy as np


# Standardization: zero-mean, unit-variance scaling of each feature.
# Returns the standardized features along with the mean and standard
# deviation, which are reused to transform new data at prediction time.
def standard(feats):
    new_feats = np.copy(feats).astype(float)
    mean = np.mean(new_feats, axis=0)
    std = np.std(new_feats, axis=0)
    std[std == 0] = 1  # avoid division by zero for constant features
    new_feats = (new_feats - mean) / std
    return new_feats, mean, std


class LinearRegression:
    def __init__(self, data, labels):
        # Standardize the training data
        new_data, mean, std = standard(data)
        # Store the mean and std for later prediction
        self.mean = mean
        self.std = std
        # m samples, n original features
        m, n = new_data.shape
        # Prepend a column of ones to the feature matrix as the bias term
        X = np.hstack((np.ones((m, 1)), new_data))  # shape (m, n+1)
        self.X = X           # training features, (m, n+1)
        self.y = labels      # training labels, (m, 1)
        self.m = m           # number of samples
        self.n = n + 1       # number of features (including the bias)
        # Initialize the parameters theta
        self.theta = np.zeros((self.n, 1))

    def train(self, alpha, num_iterations=500):
        """Run gradient descent.

        :param alpha: learning rate
        :param num_iterations: number of iterations
        :return: the learned theta and the loss history of each iteration
        """
        cost_history = []
        for _ in range(num_iterations):
            self.gradient_step(alpha)
            cost_history.append(self.cost_function())
        return self.theta, cost_history

    def gradient_step(self, alpha):
        # Compute the predictions
        predictions = self.X.dot(self.theta)   # shape (m, 1)
        # Compute the errors
        delta = predictions - self.y           # shape (m, 1)
        # Compute the gradient and update theta
        grad = (self.X.T.dot(delta)) / self.m  # shape (n+1, 1)
        self.theta -= alpha * grad

    def cost_function(self):
        # Loss under the current theta
        delta = self.X.dot(self.theta) - self.y  # shape (m, 1)
        return float((delta.T.dot(delta)) / (2 * self.m))

    def predict(self, data):
        """Predict on new data.

        :param data: new data, shape (m_new, n)
        :return: predictions, shape (m_new, 1)
        """
        # Make sure the input is a 2-D array
        data = np.array(data, ndmin=2)
        # Standardize with the mean and std from training
        new_data = (data - self.mean) / self.std
        # Add the bias column
        m_new = new_data.shape[0]
        X_new = np.hstack((np.ones((m_new, 1)), new_data))
        # Return the predictions
        return X_new.dot(self.theta)
```
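A quick sanity check on synthetic data can catch shape bugs before touching the real dataset. This is a minimal sketch: it assumes the class above is saved as linear_regression.py (the same module name the test script below imports), and the line y = 2x + 1 is made up for illustration:

```python
import numpy as np

from linear_regression import LinearRegression  # assumes the class above lives here

# Synthetic data: y = 2x + 1 plus a little Gaussian noise
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=(100, 1))
y = 2 * x + 1 + rng.normal(0, 0.1, size=(100, 1))

model = LinearRegression(x, y)
theta, cost_history = model.train(alpha=0.1, num_iterations=500)

print('final loss:', cost_history[-1])             # should settle near the noise floor
print('prediction at x=5:', model.predict([[5]]))  # should be close to 11
```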
Test code
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from linear_regression import LinearRegression

data = pd.read_csv('../data/world-happiness-report-2017.csv')

# 80/20 train/test split
train_data = data.sample(frac=0.8)
test_data = data.drop(train_data.index)

input_param_name = 'Economy..GDP.per.Capita.'
output_param_name = 'Happiness.Score'

# Extract the GDP-per-capita values and the corresponding happiness scores
x_train = train_data[[input_param_name]].values
y_train = train_data[[output_param_name]].values
x_test = test_data[input_param_name].values
y_test = test_data[output_param_name].values

num_iterations = 500
learning_rate = 0.01

# Training: x_train holds the GDP values, y_train the happiness scores
linear_regression = LinearRegression(x_train, y_train)
# Learning rate and number of training iterations
(theta, cost_history) = linear_regression.train(learning_rate, num_iterations)

print('Initial loss:', cost_history[0])
print('Loss after training:', cost_history[-1])

plt.plot(range(num_iterations), cost_history)
plt.xlabel('Iter')
plt.ylabel('cost')
plt.title('GD')
plt.show()

predictions_num = 100
# 100 evenly spaced values from the min to the max, reshaped into a column vector
x_predictions = np.linspace(x_train.min(), x_train.max(), predictions_num).reshape(predictions_num, 1)
y_predictions = linear_regression.predict(x_predictions)

plt.scatter(x_train, y_train, label='Train data')
plt.scatter(x_test, y_test, label='Test data')
plt.plot(x_predictions, y_predictions, 'r', label='Prediction')
plt.xlabel(input_param_name)
plt.ylabel(output_param_name)
plt.title('Happy')
plt.legend()
plt.show()
```
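The script plots the test split but never scores it. A minimal follow-up sketch, reusing the variables from the script above, that reports the mean squared error on the held-out data:

```python
# Evaluate on the held-out split; x_test and y_test are 1-D above,
# so reshape them into column vectors first.
y_test_pred = linear_regression.predict(x_test.reshape(-1, 1))
test_mse = float(np.mean((y_test_pred - y_test.reshape(-1, 1)) ** 2))
print('Test MSE:', test_mse)
```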