Some differences between BGD, SGD and MBGD

This post walks through how batch gradient descent (BGD), stochastic gradient descent (SGD), and mini-batch gradient descent (MBGD) differ in implementation.
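
All three methods share the same parameter update and differ only in how much data goes into each gradient estimate. In the notation used by the code below, where theta = [w, b] and each sample is X_j = [x_j, 1], the update is

$$
\theta \leftarrow \theta - \alpha \cdot \frac{1}{|B|} \sum_{j \in B} \left(\theta^{\mathsf{T}} X_j - y_j\right) X_j
$$

with B being the full training set for BGD, a random sample (or small random subset) drawn fresh for every update for SGD, and one fixed-length slice of the shuffled data for MBGD.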

Batch gradient descent (BGD)

import numpy as np

# Batch gradient descent (BGD)
def batch_gradient_descent(x, y, learn_rate, epoches):
    """
    :param x: input x, each row is [x_value, 1] (the second column is the bias term)
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :return: the fitted theta = [w, b]
    """
    theta = np.array([0.0, 0.0])
    for i in range(epoches):
        loss = [0.0, 0.0]
        # plug in every sample to compute the (averaged) gradient
        m = len(y)
        for j in range(m):
            loss[0] = loss[0] + (theta[0] * x[j, 0] + theta[1] * x[j, 1] - y[j]) * x[j, 0] / m
            loss[1] = loss[1] + (theta[0] * x[j, 0] + theta[1] * x[j, 1] - y[j]) / m
        # update theta
        theta[0] = theta[0] - learn_rate * loss[0]
        theta[1] = theta[1] - learn_rate * loss[1]
    return theta
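
For reference, the same full-batch update can be written in vectorized NumPy form. This is a minimal sketch of my own (the function name is not from the original post), assuming x already carries the bias column as in the loop version:

def batch_gradient_descent_vectorized(x, y, learn_rate, epoches):
    """One full-data gradient step per epoch: grad = (1/m) * X^T (X theta - y)."""
    theta = np.zeros(2)
    m = len(y)
    for _ in range(epoches):
        grad = x.T @ (x @ theta - y) / m
        theta -= learn_rate * grad
    return theta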

Stochastic gradient descent (SGD)

# This is not true stochastic gradient descent -- in SGD the data should be re-sampled
# on every iteration. Sub-sampling once up front is still a useful trick, though.
def stochastic_gradient_descent_false(x, y, learn_rate, epoches, stochastic_rate):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param stochastic_rate: fraction of the data to keep
    :return: the fitted theta
    """
    shufflle_data = np.column_stack((y, x))
    np.random.shuffle(shufflle_data)
    stochastic_count = int(len(y) * stochastic_rate)
    # take a random subset of the data for the gradient steps, e.g. 100 random samples
    y = shufflle_data[:stochastic_count, 0]
    x = shufflle_data[:stochastic_count, 1:3]
    return batch_gradient_descent(x, y, learn_rate, epoches)


# True stochastic gradient descent re-samples the data on every iteration
def stochastic_gradient_descent_true(x, y, learn_rate, epoches, stochastic_rate):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param stochastic_rate: fraction of the data to sample on each iteration
    :return: the fitted theta
    """
    theta = np.array([0.0, 0.0])
    for i in range(epoches):
        shufflle_data = np.column_stack((y, x))
        np.random.shuffle(shufflle_data)
        stochastic_count = int(len(y) * stochastic_rate)
        # take a fresh random subset for this iteration; sample into new variables
        # so the original x and y are not shrunk on every pass
        y_sample = shufflle_data[:stochastic_count, 0]
        x_sample = shufflle_data[:stochastic_count, 1:3]
        # compute the gradient on the sampled subset
        loss = [0.0, 0.0]
        m = len(y_sample)
        for j in range(m):
            loss[0] = loss[0] + (theta[0] * x_sample[j, 0] + theta[1] * x_sample[j, 1] - y_sample[j]) * x_sample[j, 0] / m
            loss[1] = loss[1] + (theta[0] * x_sample[j, 0] + theta[1] * x_sample[j, 1] - y_sample[j]) / m
        # update theta
        theta[0] = theta[0] - learn_rate * loss[0]
        theta[1] = theta[1] - learn_rate * loss[1]
    return theta
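
Note that even the "true" version above draws a random subset per iteration rather than a single sample. A per-sample variant, closer to the textbook definition of SGD, might look like the following sketch (the function name is mine, not from the original post; a smaller learning rate is usually needed for per-sample updates):

def stochastic_gradient_descent_per_sample(x, y, learn_rate, epoches):
    """Update theta with one randomly chosen sample at a time."""
    theta = np.array([0.0, 0.0])
    m = len(y)
    for _ in range(epoches):
        order = np.random.permutation(m)  # reshuffle every epoch
        for j in order:
            error = theta[0] * x[j, 0] + theta[1] * x[j, 1] - y[j]
            theta[0] -= learn_rate * error * x[j, 0]
            theta[1] -= learn_rate * error * x[j, 1]
    return theta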

Mini-batch gradient descent (MBGD)

def mini_batch_gradient_descent(x, y, learn_rate, epoches, mini_length):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param mini_length: mini-batch length
    :return: the fitted theta
    """
    theta = np.array([0.0, 0.0])
    # shuffle the data (optional)
    shufflle_data = np.column_stack((y, x))
    np.random.shuffle(shufflle_data)
    y = shufflle_data[:, 0]
    x = shufflle_data[:, 1:3]
    for i in range(epoches):
        # walk over the data slice by slice: [0, mini_length), [mini_length, 2*mini_length), ...
        # and use each slice for one update step
        for start in range(0, len(y), mini_length):
            x_batch = x[start:start + mini_length]
            y_batch = y[start:start + mini_length]
            m = len(y_batch)
            loss = [0.0, 0.0]
            for j in range(m):
                loss[0] = loss[0] + (theta[0] * x_batch[j, 0] + theta[1] * x_batch[j, 1] - y_batch[j]) * x_batch[j, 0] / m
                loss[1] = loss[1] + (theta[0] * x_batch[j, 0] + theta[1] * x_batch[j, 1] - y_batch[j]) / m
            # update theta once per mini-batch
            theta[0] = theta[0] - learn_rate * loss[0]
            theta[1] = theta[1] - learn_rate * loss[1]
    return theta
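
The same per-batch update can also be written with NumPy slicing and a vectorized gradient. A minimal sketch of my own, assuming the bias column is already stacked into x:

def mini_batch_gradient_descent_vectorized(x, y, learn_rate, epoches, mini_length):
    """One vectorized gradient step per mini-batch slice."""
    theta = np.zeros(2)
    for _ in range(epoches):
        for start in range(0, len(y), mini_length):
            x_batch = x[start:start + mini_length]
            y_batch = y[start:start + mini_length]
            grad = x_batch.T @ (x_batch @ theta - y_batch) / len(y_batch)
            theta -= learn_rate * grad
    return theta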

Experiment code

# -*- coding: utf-8 -*-
# @Date   : 2017/9/8
# @Author : ryanbing (legotime@qq.com)
import datetime
import inspect  # used below to dispatch on a function's parameters

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(1)
x = 10 * rng.rand(500)
y = 3 * x + 2 + rng.randn(500)
# plt.scatter(x, y)
# plt.show()

# Find w and b in y = wx + b; the true values are w = 3 and b = 2.
# For the computation we treat it as y = WX, where W = [w, b] and X = [x, 1].T

# Batch gradient descent (BGD)
def batch_gradient_descent(x, y, learn_rate, epoches):
    """
    :param x: input x, each row is [x_value, 1]
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :return: (elapsed time, fitted theta)
    """
    start_time = datetime.datetime.now()
    theta = np.array([0.0, 0.0])
    for i in range(epoches):
        loss = [0.0, 0.0]
        # plug in every sample to compute the gradient
        m = len(y)
        for j in range(m):
            loss[0] = loss[0] + (theta[0] * x[j, 0] + theta[1] * x[j, 1] - y[j]) * x[j, 0] / m
            loss[1] = loss[1] + (theta[0] * x[j, 0] + theta[1] * x[j, 1] - y[j]) / m
        # update theta
        theta[0] = theta[0] - learn_rate * loss[0]
        theta[1] = theta[1] - learn_rate * loss[1]
    end_time = datetime.datetime.now()
    return end_time - start_time, theta

# This is not true stochastic gradient descent -- in SGD the data should be re-sampled
# on every iteration. Sub-sampling once up front is still a useful trick, though.
def stochastic_gradient_descent_false(x, y, learn_rate, epoches, stochastic_rate):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param stochastic_rate: fraction of the data to keep
    :return: (elapsed time, fitted theta)
    """
    start_time = datetime.datetime.now()
    shufflle_data = np.column_stack((y, x))
    np.random.shuffle(shufflle_data)
    stochastic_count = int(len(y) * stochastic_rate)
    # take a random subset of the data for the gradient steps, e.g. 100 random samples
    y = shufflle_data[:stochastic_count, 0]
    x = shufflle_data[:stochastic_count, 1:3]
    # run plain batch gradient descent on the sampled subset
    _, theta = batch_gradient_descent(x, y, learn_rate, epoches)
    end_time = datetime.datetime.now()
    return end_time - start_time, theta

# True stochastic gradient descent re-samples the data on every iteration
def stochastic_gradient_descent_true(x, y, learn_rate, epoches, stochastic_rate):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param stochastic_rate: fraction of the data to sample on each iteration
    :return: (elapsed time, fitted theta)
    """
    start_time = datetime.datetime.now()
    theta = np.array([0.0, 0.0])
    for i in range(epoches):
        shufflle_data = np.column_stack((y, x))
        np.random.shuffle(shufflle_data)
        stochastic_count = int(len(y) * stochastic_rate)
        # take a fresh random subset for this iteration; sample into new variables
        # so the original x and y are not shrunk on every pass
        y_sample = shufflle_data[:stochastic_count, 0]
        x_sample = shufflle_data[:stochastic_count, 1:3]
        # compute the gradient on the sampled subset
        loss = [0.0, 0.0]
        m = len(y_sample)
        for j in range(m):
            loss[0] = loss[0] + (theta[0] * x_sample[j, 0] + theta[1] * x_sample[j, 1] - y_sample[j]) * x_sample[j, 0] / m
            loss[1] = loss[1] + (theta[0] * x_sample[j, 0] + theta[1] * x_sample[j, 1] - y_sample[j]) / m
        # update theta
        theta[0] = theta[0] - learn_rate * loss[0]
        theta[1] = theta[1] - learn_rate * loss[1]
    end_time = datetime.datetime.now()
    return end_time - start_time, theta

# Mini-batch gradient descent (MBGD)
def mini_batch_gradient_descent(x, y, learn_rate, epoches, mini_length):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param mini_length: mini-batch length
    :return: (elapsed time, fitted theta)
    """
    start_time = datetime.datetime.now()
    theta = np.array([0.0, 0.0])
    # shuffle the data (optional)
    shufflle_data = np.column_stack((y, x))
    np.random.shuffle(shufflle_data)
    y = shufflle_data[:, 0]
    x = shufflle_data[:, 1:3]
    for i in range(epoches):
        # walk over the data slice by slice: [0, mini_length), [mini_length, 2*mini_length), ...
        # and use each slice for one update step
        for start in range(0, len(y), mini_length):
            x_batch = x[start:start + mini_length]
            y_batch = y[start:start + mini_length]
            m = len(y_batch)
            loss = [0.0, 0.0]
            for j in range(m):
                loss[0] = loss[0] + (theta[0] * x_batch[j, 0] + theta[1] * x_batch[j, 1] - y_batch[j]) * x_batch[j, 0] / m
                loss[1] = loss[1] + (theta[0] * x_batch[j, 0] + theta[1] * x_batch[j, 1] - y_batch[j]) / m
            # update theta once per mini-batch
            theta[0] = theta[0] - learn_rate * loss[0]
            theta[1] = theta[1] - learn_rate * loss[1]
    end_time = datetime.datetime.now()
    return end_time - start_time, theta

def contro_func(func, **kwargs):
    """
    :param func: the gradient descent function to run
    :param kwargs: the parameters that func needs
    :return: whatever func returns
    """
    x = kwargs.get('x', None)
    y = kwargs.get('y', None)
    learn_rate = kwargs.get('learn_rate', None)
    epoches = kwargs.get('epoches', None)
    stochastic_rate = kwargs.get('stochastic_rate', None)
    mini_length = kwargs.get('mini_length', None)
    # dispatch on the extra parameter that func actually declares, so callers
    # may pass both stochastic_rate and mini_length and still reach every method
    params = inspect.signature(func).parameters
    if 'stochastic_rate' in params:
        return func(x, y, learn_rate, epoches, stochastic_rate)
    if 'mini_length' in params:
        return func(x, y, learn_rate, epoches, mini_length)
    return func(x, y, learn_rate, epoches)

def show_trend():
    # collect the fitted theta and the elapsed time for each method and epoch count,
    # so that convergence and run time can be compared afterwards
    rng = np.random.RandomState(1)
    x = 10 * rng.rand(500)
    y = 3 * x + 2 + rng.randn(500)
    x = np.array([x, np.ones(500)]).T
    learn_rate = 0.01
    stochastic_rate = 0.4
    mini_length = 10
    results = {}
    for func in [batch_gradient_descent, stochastic_gradient_descent_false,
                 stochastic_gradient_descent_true, mini_batch_gradient_descent]:
        tmp = []
        for epoches in [1, 10, 100, 1000, 10000, 100000]:
            tmp.append(contro_func(func, x=x, y=y, learn_rate=learn_rate, stochastic_rate=stochastic_rate,
                                   mini_length=mini_length, epoches=epoches))
        results[func.__name__] = tmp
    return results

if __name__ == '__main__':
    # test(func=func, x=1, y=2, learn_rate=3, epoches=4, stochastic_rate=5)
    # print(batch_gradient_descent(np.array([x, np.ones(500)]).T, y, learn_rate=0.01, epoches=100000))
    # [ 1.14378512 0.17288215]
    # [ 3.18801281 0.50870366]
    # [ 3.18602557 0.806018 ]
    # [ 3.03276102 1.84267445]
    # [ 3.01449298 1.96623647]
    # [ 3.01449298 1.96623647]
    # print(stochastic_gradient_descent_false(np.array([x, np.ones(500)]).T, y, learn_rate=0.01, epoches=100, stochastic_rate=0.4))
    # [ 1.11939055 0.16949282]
    # [ 3.19877639 0.50404936]
    # [ 3.20921332 0.78698163]
    # [ 3.04720128 1.82412805]
    # [ 3.01920995 1.89883629]
    # [ 2.98281143 2.15226071]
    # print(stochastic_gradient_descent_true(np.array([x, np.ones(500)]).T, y, learn_rate=0.01, epoches=1000, stochastic_rate=1))
    # print(mini_batch_gradient_descent(np.array([x, np.ones(500)]).T, y, learn_rate=0.01, epoches=100, mini_length=10))
    # [ 0.94630842 0.14845568]
    # [ 0.8811451 0.15444328]
    # [ 3.18337012 0.51049921]
    # [ 3.14833317 0.79174635]
    # [ 3.03507147 1.87931184]
    pass  # all calls above are left commented out; the block needs at least one statement
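
show_trend only gathers the (elapsed time, theta) pairs; plotting them is left open in the original script. One way to look at the trend, as a sketch of my own (it assumes show_trend returns the results dict exactly as written above), could be:

def plot_trend(results, epoch_list=(1, 10, 100, 1000, 10000, 100000)):
    """Plot how far each method's estimate of w is from the true value w = 3."""
    for name, runs in results.items():
        w_errors = [abs(theta[0] - 3.0) for _, theta in runs]
        plt.plot(epoch_list, w_errors, marker='o', label=name)
    plt.xscale('log')
    plt.xlabel('epoches')
    plt.ylabel('|w - 3|')
    plt.legend()
    plt.show()

Calling plot_trend(show_trend()) would then draw one convergence curve per method.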