import numpy as np

# Layer helpers (conv_relu_pool_forward, affine_relu_forward, affine_forward,
# softmax_loss, and their backward counterparts) come from the cs231n
# assignment code.
from cs231n.layers import *
from cs231n.fast_layers import *
from cs231n.layer_utils import *


class ThreeLayerConvNet(object):
    """
    A three-layer convolutional network with the following architecture:

    conv - relu - 2x2 max pool - affine - relu - affine - softmax

    The network operates on minibatches of data that have shape (N, C, H, W)
    consisting of N images, each with height H and width W and with C input
    channels.
    """
    def __init__(self, input_dim=(3, 32, 32), num_filters=32, filter_size=7,
                 hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0.0,
                 dtype=np.float32):
        """
        Initialize a new network.

        Inputs:
        - input_dim: Tuple (C, H, W) giving size of input data
        - num_filters: Number of filters to use in the convolutional layer
        - filter_size: Size of filters to use in the convolutional layer
        - hidden_dim: Number of units to use in the fully-connected hidden layer
        - num_classes: Number of scores to produce from the final affine layer
        - weight_scale: Scalar giving standard deviation for random
          initialization of weights
        - reg: Scalar giving L2 regularization strength
        - dtype: numpy datatype to use for computation
        """
        self.params = {}
        self.reg = reg
        self.dtype = dtype
        C, H, W = input_dim
        # Conv filters: (num_filters, C, filter_size, filter_size).
        self.params['W1'] = weight_scale * np.random.randn(
            num_filters, C, filter_size, filter_size)
        self.params['b1'] = np.zeros(num_filters)
        # Hidden affine layer operates on the flattened 2x2-pooled conv output,
        # which has (H // 2) * (W // 2) * num_filters dimensions.
        self.params['W2'] = weight_scale * np.random.randn(
            (H // 2) * (W // 2) * num_filters, hidden_dim)
        self.params['b2'] = np.zeros(hidden_dim)
        # Output affine layer maps hidden_dim features to num_classes scores.
        self.params['W3'] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params['b3'] = np.zeros(num_classes)
        for k, v in self.params.items():
            self.params[k] = v.astype(dtype)
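    # Worked shape check for the defaults (illustrative): input_dim=(3, 32, 32)
    # with num_filters=32 gives a conv/pool output of 32 x 16 x 16, so W2 has
    # 16 * 16 * 32 = 8192 input dimensions and W2.shape == (8192, 100).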
    def loss(self, X, y=None):
        """
        Evaluate loss and gradient for the three-layer convolutional network.

        Input / output: Same API as TwoLayerNet in fc_net.py.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']
        filter_size = W1.shape[2]
        conv_param = {'stride': 1, 'pad': (filter_size - 1) // 2}
        pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
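        # Note (illustrative): with stride 1 and pad = (filter_size - 1) // 2,
        # the conv output width is 1 + (W + 2 * pad - filter_size) = W for odd
        # filter_size, so the convolution preserves the spatial size; the
        # 2x2/stride-2 pool then halves H and W, matching the W2 shape chosen
        # in __init__.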
        # Forward pass: conv - relu - 2x2 max pool - affine - relu - affine.
        conv_forward_out_1, cache_forward_1 = conv_relu_pool_forward(
            X, W1, b1, conv_param, pool_param)
        affine_out_2, cache_forward_2 = affine_relu_forward(
            conv_forward_out_1, W2, b2)
        scores, cache_forward_3 = affine_forward(affine_out_2, W3, b3)
        if y is None:
            return scores
        # Backward pass: softmax loss, then backprop through each layer.
        grads = {}
        loss, dscore = softmax_loss(scores, y)
        da2, grads['W3'], grads['b3'] = affine_backward(dscore, cache_forward_3)
        da1, grads['W2'], grads['b2'] = affine_relu_backward(da2, cache_forward_2)
        _, grads['W1'], grads['b1'] = conv_relu_pool_backward(da1, cache_forward_1)

        # L2 regularization; the 0.5 factor cancels the 2 in the gradient.
        loss += 0.5 * self.reg * (np.sum(W1 ** 2) + np.sum(W2 ** 2) + np.sum(W3 ** 2))
        grads['W1'] += self.reg * W1
        grads['W2'] += self.reg * W2
        grads['W3'] += self.reg * W3

        return loss, grads
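
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original class): builds
# the network with its default hyperparameters and runs both the test-time
# and training-time paths of loss() on random data. Assumes the cs231n layer
# helpers imported above are available.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    np.random.seed(0)
    model = ThreeLayerConvNet()
    X = np.random.randn(5, 3, 32, 32).astype(np.float32)
    y = np.random.randint(10, size=5)

    scores = model.loss(X)          # test time: (5, 10) class scores
    loss, grads = model.loss(X, y)  # training: scalar loss + gradient dict
    print('scores shape:', scores.shape)  # -> (5, 10)
    print('loss:', loss)                  # ~ ln(10) = 2.3 for small weights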