Some differences between BGD, SGD and MBGD

This post walks through how batch gradient descent (BGD), stochastic gradient descent (SGD), and mini-batch gradient descent (MBGD) differ in implementation.
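
All three methods share the same parameter update and differ only in how much data goes into each gradient estimate. In the notation used by the code below, where theta = [w, b] and each sample is X_j = [x_j, 1], the update is

$$
\theta \leftarrow \theta - \alpha \cdot \frac{1}{|B|} \sum_{j \in B} \left(\theta^{\mathsf{T}} X_j - y_j\right) X_j
$$

with B being the full training set for BGD, a random sample (or small random subset) drawn fresh for every update for SGD, and one fixed-length slice of the shuffled data for MBGD.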

Batch gradient descent (BGD)

import numpy as np

# Batch gradient descent (BGD)
def batch_gradient_descent(x, y, learn_rate, epoches):
    """
    :param x: input x, each row is [x_value, 1] (the second column is the bias term)
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :return: the fitted theta = [w, b]
    """
    theta = np.array([0.0, 0.0])
    for i in range(epoches):
        loss = [0.0, 0.0]
        # plug in every sample to compute the (averaged) gradient
        m = len(y)
        for j in range(m):
            loss[0] = loss[0] + (theta[0] * x[j, 0] + theta[1] * x[j, 1] - y[j]) * x[j, 0] / m
            loss[1] = loss[1] + (theta[0] * x[j, 0] + theta[1] * x[j, 1] - y[j]) / m
        # update theta
        theta[0] = theta[0] - learn_rate * loss[0]
        theta[1] = theta[1] - learn_rate * loss[1]
    return theta
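
For reference, the same full-batch update can be written in vectorized NumPy form. This is a minimal sketch of my own (the function name is not from the original post), assuming x already carries the bias column as in the loop version:

def batch_gradient_descent_vectorized(x, y, learn_rate, epoches):
    """One full-data gradient step per epoch: grad = (1/m) * X^T (X theta - y)."""
    theta = np.zeros(2)
    m = len(y)
    for _ in range(epoches):
        grad = x.T @ (x @ theta - y) / m
        theta -= learn_rate * grad
    return theta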

Stochastic gradient descent (SGD)

# This is not true stochastic gradient descent -- in SGD the data should be re-sampled
# on every iteration. Sub-sampling once up front is still a useful trick, though.
def stochastic_gradient_descent_false(x, y, learn_rate, epoches, stochastic_rate):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param stochastic_rate: fraction of the data to keep
    :return: the fitted theta
    """
    shufflle_data = np.column_stack((y, x))
    np.random.shuffle(shufflle_data)
    stochastic_count = int(len(y) * stochastic_rate)
    # take a random subset of the data for the gradient steps, e.g. 100 random samples
    y = shufflle_data[:stochastic_count, 0]
    x = shufflle_data[:stochastic_count, 1:3]
    return batch_gradient_descent(x, y, learn_rate, epoches)


# True stochastic gradient descent re-samples the data on every iteration
def stochastic_gradient_descent_true(x, y, learn_rate, epoches, stochastic_rate):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param stochastic_rate: fraction of the data to sample on each iteration
    :return: the fitted theta
    """
    theta = np.array([0.0, 0.0])
    for i in range(epoches):
        shufflle_data = np.column_stack((y, x))
        np.random.shuffle(shufflle_data)
        stochastic_count = int(len(y) * stochastic_rate)
        # take a fresh random subset for this iteration; sample into new variables
        # so the original x and y are not shrunk on every pass
        y_sample = shufflle_data[:stochastic_count, 0]
        x_sample = shufflle_data[:stochastic_count, 1:3]
        # compute the gradient on the sampled subset
        loss = [0.0, 0.0]
        m = len(y_sample)
        for j in range(m):
            loss[0] = loss[0] + (theta[0] * x_sample[j, 0] + theta[1] * x_sample[j, 1] - y_sample[j]) * x_sample[j, 0] / m
            loss[1] = loss[1] + (theta[0] * x_sample[j, 0] + theta[1] * x_sample[j, 1] - y_sample[j]) / m
        # update theta
        theta[0] = theta[0] - learn_rate * loss[0]
        theta[1] = theta[1] - learn_rate * loss[1]
    return theta
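
Note that even the "true" version above draws a random subset per iteration rather than a single sample. A per-sample variant, closer to the textbook definition of SGD, might look like the following sketch (the function name is mine, not from the original post; a smaller learning rate is usually needed for per-sample updates):

def stochastic_gradient_descent_per_sample(x, y, learn_rate, epoches):
    """Update theta with one randomly chosen sample at a time."""
    theta = np.array([0.0, 0.0])
    m = len(y)
    for _ in range(epoches):
        order = np.random.permutation(m)  # reshuffle every epoch
        for j in order:
            error = theta[0] * x[j, 0] + theta[1] * x[j, 1] - y[j]
            theta[0] -= learn_rate * error * x[j, 0]
            theta[1] -= learn_rate * error * x[j, 1]
    return theta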

Mini-batch gradient descent (MBGD)

def mini_batch_gradient_descent(x, y, learn_rate, epoches, mini_length):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param mini_length: mini-batch length
    :return: the fitted theta
    """
    theta = np.array([0.0, 0.0])
    # shuffle the data (optional)
    shufflle_data = np.column_stack((y, x))
    np.random.shuffle(shufflle_data)
    y = shufflle_data[:, 0]
    x = shufflle_data[:, 1:3]
    for i in range(epoches):
        # walk over the data slice by slice: [0, mini_length), [mini_length, 2*mini_length), ...
        # and use each slice for one update step
        for start in range(0, len(y), mini_length):
            x_batch = x[start:start + mini_length]
            y_batch = y[start:start + mini_length]
            m = len(y_batch)
            loss = [0.0, 0.0]
            for j in range(m):
                loss[0] = loss[0] + (theta[0] * x_batch[j, 0] + theta[1] * x_batch[j, 1] - y_batch[j]) * x_batch[j, 0] / m
                loss[1] = loss[1] + (theta[0] * x_batch[j, 0] + theta[1] * x_batch[j, 1] - y_batch[j]) / m
            # update theta once per mini-batch
            theta[0] = theta[0] - learn_rate * loss[0]
            theta[1] = theta[1] - learn_rate * loss[1]
    return theta
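
The same per-batch update can also be written with NumPy slicing and a vectorized gradient. A minimal sketch of my own, assuming the bias column is already stacked into x:

def mini_batch_gradient_descent_vectorized(x, y, learn_rate, epoches, mini_length):
    """One vectorized gradient step per mini-batch slice."""
    theta = np.zeros(2)
    for _ in range(epoches):
        for start in range(0, len(y), mini_length):
            x_batch = x[start:start + mini_length]
            y_batch = y[start:start + mini_length]
            grad = x_batch.T @ (x_batch @ theta - y_batch) / len(y_batch)
            theta -= learn_rate * grad
    return theta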

Experiment code

# -*- coding: utf-8 -*-
# @Date   : 2017/9/8
# @Author : ryanbing (legotime@qq.com)
import datetime
import inspect  # used below to dispatch on a function's parameters

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(1)
x = 10 * rng.rand(500)
y = 3 * x + 2 + rng.randn(500)
# plt.scatter(x, y)
# plt.show()

# Find w and b in y = wx + b; the true values are w = 3 and b = 2.
# For the computation we treat it as y = WX, where W = [w, b] and X = [x, 1].T

# Batch gradient descent (BGD)
def batch_gradient_descent(x, y, learn_rate, epoches):
    """
    :param x: input x, each row is [x_value, 1]
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :return: (elapsed time, fitted theta)
    """
    start_time = datetime.datetime.now()
    theta = np.array([0.0, 0.0])
    for i in range(epoches):
        loss = [0.0, 0.0]
        # plug in every sample to compute the gradient
        m = len(y)
        for j in range(m):
            loss[0] = loss[0] + (theta[0] * x[j, 0] + theta[1] * x[j, 1] - y[j]) * x[j, 0] / m
            loss[1] = loss[1] + (theta[0] * x[j, 0] + theta[1] * x[j, 1] - y[j]) / m
        # update theta
        theta[0] = theta[0] - learn_rate * loss[0]
        theta[1] = theta[1] - learn_rate * loss[1]
    end_time = datetime.datetime.now()
    return end_time - start_time, theta

# This is not true stochastic gradient descent -- in SGD the data should be re-sampled
# on every iteration. Sub-sampling once up front is still a useful trick, though.
def stochastic_gradient_descent_false(x, y, learn_rate, epoches, stochastic_rate):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param stochastic_rate: fraction of the data to keep
    :return: (elapsed time, fitted theta)
    """
    start_time = datetime.datetime.now()
    shufflle_data = np.column_stack((y, x))
    np.random.shuffle(shufflle_data)
    stochastic_count = int(len(y) * stochastic_rate)
    # take a random subset of the data for the gradient steps, e.g. 100 random samples
    y = shufflle_data[:stochastic_count, 0]
    x = shufflle_data[:stochastic_count, 1:3]
    # run plain batch gradient descent on the sampled subset
    _, theta = batch_gradient_descent(x, y, learn_rate, epoches)
    end_time = datetime.datetime.now()
    return end_time - start_time, theta

# True stochastic gradient descent re-samples the data on every iteration
def stochastic_gradient_descent_true(x, y, learn_rate, epoches, stochastic_rate):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param stochastic_rate: fraction of the data to sample on each iteration
    :return: (elapsed time, fitted theta)
    """
    start_time = datetime.datetime.now()
    theta = np.array([0.0, 0.0])
    for i in range(epoches):
        shufflle_data = np.column_stack((y, x))
        np.random.shuffle(shufflle_data)
        stochastic_count = int(len(y) * stochastic_rate)
        # take a fresh random subset for this iteration; sample into new variables
        # so the original x and y are not shrunk on every pass
        y_sample = shufflle_data[:stochastic_count, 0]
        x_sample = shufflle_data[:stochastic_count, 1:3]
        # compute the gradient on the sampled subset
        loss = [0.0, 0.0]
        m = len(y_sample)
        for j in range(m):
            loss[0] = loss[0] + (theta[0] * x_sample[j, 0] + theta[1] * x_sample[j, 1] - y_sample[j]) * x_sample[j, 0] / m
            loss[1] = loss[1] + (theta[0] * x_sample[j, 0] + theta[1] * x_sample[j, 1] - y_sample[j]) / m
        # update theta
        theta[0] = theta[0] - learn_rate * loss[0]
        theta[1] = theta[1] - learn_rate * loss[1]
    end_time = datetime.datetime.now()
    return end_time - start_time, theta

# Mini-batch gradient descent (MBGD)
def mini_batch_gradient_descent(x, y, learn_rate, epoches, mini_length):
    """
    :param x: input x
    :param y: input y
    :param learn_rate: learning rate
    :param epoches: number of iterations
    :param mini_length: mini-batch length
    :return: (elapsed time, fitted theta)
    """
    start_time = datetime.datetime.now()
    theta = np.array([0.0, 0.0])
    # shuffle the data (optional)
    shufflle_data = np.column_stack((y, x))
    np.random.shuffle(shufflle_data)
    y = shufflle_data[:, 0]
    x = shufflle_data[:, 1:3]
    for i in range(epoches):
        # walk over the data slice by slice: [0, mini_length), [mini_length, 2*mini_length), ...
        # and use each slice for one update step
        for start in range(0, len(y), mini_length):
            x_batch = x[start:start + mini_length]
            y_batch = y[start:start + mini_length]
            m = len(y_batch)
            loss = [0.0, 0.0]
            for j in range(m):
                loss[0] = loss[0] + (theta[0] * x_batch[j, 0] + theta[1] * x_batch[j, 1] - y_batch[j]) * x_batch[j, 0] / m
                loss[1] = loss[1] + (theta[0] * x_batch[j, 0] + theta[1] * x_batch[j, 1] - y_batch[j]) / m
            # update theta once per mini-batch
            theta[0] = theta[0] - learn_rate * loss[0]
            theta[1] = theta[1] - learn_rate * loss[1]
    end_time = datetime.datetime.now()
    return end_time - start_time, theta

def contro_func(func, **kwargs):
    """
    :param func: the gradient descent function to run
    :param kwargs: the parameters that func needs
    :return: whatever func returns
    """
    x = kwargs.get('x', None)
    y = kwargs.get('y', None)
    learn_rate = kwargs.get('learn_rate', None)
    epoches = kwargs.get('epoches', None)
    stochastic_rate = kwargs.get('stochastic_rate', None)
    mini_length = kwargs.get('mini_length', None)
    # dispatch on the extra parameter that func actually declares, so callers
    # may pass both stochastic_rate and mini_length and still reach every method
    params = inspect.signature(func).parameters
    if 'stochastic_rate' in params:
        return func(x, y, learn_rate, epoches, stochastic_rate)
    if 'mini_length' in params:
        return func(x, y, learn_rate, epoches, mini_length)
    return func(x, y, learn_rate, epoches)

def show_trend():
    # collect the fitted theta and the elapsed time for each method and epoch count,
    # so that convergence and run time can be compared afterwards
    rng = np.random.RandomState(1)
    x = 10 * rng.rand(500)
    y = 3 * x + 2 + rng.randn(500)
    x = np.array([x, np.ones(500)]).T
    learn_rate = 0.01
    stochastic_rate = 0.4
    mini_length = 10
    results = {}
    for func in [batch_gradient_descent, stochastic_gradient_descent_false,
                 stochastic_gradient_descent_true, mini_batch_gradient_descent]:
        tmp = []
        for epoches in [1, 10, 100, 1000, 10000, 100000]:
            tmp.append(contro_func(func, x=x, y=y, learn_rate=learn_rate, stochastic_rate=stochastic_rate,
                                   mini_length=mini_length, epoches=epoches))
        results[func.__name__] = tmp
    return results

if __name__ == '__main__':
    # test(func=func, x=1, y=2, learn_rate=3, epoches=4, stochastic_rate=5)
    # print(batch_gradient_descent(np.array([x, np.ones(500)]).T, y, learn_rate=0.01, epoches=100000))
    # [ 1.14378512 0.17288215]
    # [ 3.18801281 0.50870366]
    # [ 3.18602557 0.806018 ]
    # [ 3.03276102 1.84267445]
    # [ 3.01449298 1.96623647]
    # [ 3.01449298 1.96623647]
    # print(stochastic_gradient_descent_false(np.array([x, np.ones(500)]).T, y, learn_rate=0.01, epoches=100, stochastic_rate=0.4))
    # [ 1.11939055 0.16949282]
    # [ 3.19877639 0.50404936]
    # [ 3.20921332 0.78698163]
    # [ 3.04720128 1.82412805]
    # [ 3.01920995 1.89883629]
    # [ 2.98281143 2.15226071]
    # print(stochastic_gradient_descent_true(np.array([x, np.ones(500)]).T, y, learn_rate=0.01, epoches=1000, stochastic_rate=1))
    # print(mini_batch_gradient_descent(np.array([x, np.ones(500)]).T, y, learn_rate=0.01, epoches=100, mini_length=10))
    # [ 0.94630842 0.14845568]
    # [ 0.8811451 0.15444328]
    # [ 3.18337012 0.51049921]
    # [ 3.14833317 0.79174635]
    # [ 3.03507147 1.87931184]
    pass  # all calls above are left commented out; the block needs at least one statement
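
show_trend only gathers the (elapsed time, theta) pairs; plotting them is left open in the original script. One way to look at the trend, as a sketch of my own (it assumes show_trend returns the results dict exactly as written above), could be:

def plot_trend(results, epoch_list=(1, 10, 100, 1000, 10000, 100000)):
    """Plot how far each method's estimate of w is from the true value w = 3."""
    for name, runs in results.items():
        w_errors = [abs(theta[0] - 3.0) for _, theta in runs]
        plt.plot(epoch_list, w_errors, marker='o', label=name)
    plt.xscale('log')
    plt.xlabel('epoches')
    plt.ylabel('|w - 3|')
    plt.legend()
    plt.show()

Calling plot_trend(show_trend()) would then draw one convergence curve per method.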