CS231n Assignment 2: Convolutional Networks

This is the point in the assignments where we really get into CNN territory.

First, write the forward pass with the most basic loops.

def conv_forward_naive(x, w, b, conv_param):
    """
    A naive implementation of the forward pass for a convolutional layer.

    The input consists of N data points, each with C channels, height H and
    width W. We convolve each input with F different filters, where each filter
    spans all C channels and has height HH and width WW.

    Input:
    - x: Input data of shape (N, C, H, W)
    - w: Filter weights of shape (F, C, HH, WW)
    - b: Biases, of shape (F,)
    - conv_param: A dictionary with the following keys:
      - 'stride': The number of pixels between adjacent receptive fields in the
        horizontal and vertical directions.
      - 'pad': The number of pixels that will be used to zero-pad the input.

    Returns a tuple of:
    - out: Output data, of shape (N, F, H', W') where H' and W' are given by
      H' = 1 + (H + 2 * pad - HH) / stride
      W' = 1 + (W + 2 * pad - WW) / stride
    - cache: (x, w, b, conv_param)
    """
    out = None
    ###########################################################################
    # TODO: Implement the convolutional forward pass.                         #
    # Hint: you can use the function np.pad for padding.                      #
    ###########################################################################
    # N samples, C channels, height H, width W
    N, C, H, W = x.shape
    # F filters, C channels, kernel height HH, kernel width WW
    F, C, HH, WW = w.shape
    # stride
    stride = conv_param['stride']
    # number of pixels of zero-padding
    pad = conv_param['pad']

    # spatial size of the output after convolution
    new_H = 1 + int((H + 2 * pad - HH) / stride)
    new_W = 1 + int((W + 2 * pad - WW) / stride)
    out = np.zeros([N, F, new_H, new_W])

    # convolve each of the N samples
    for n in range(N):
        for f in range(F):
            # start from the bias so it is added exactly once per output pixel
            conv_newH_new_W = np.ones([new_H, new_W]) * b[f]
            for c in range(C):
                # zero-pad the original image x
                padded_x = np.lib.pad(x[n, c], pad_width=pad, mode='constant', constant_values=0)
                # each output pixel is the kernel multiplied elementwise with the
                # corresponding window of the padded input, summed up
                for i in range(new_H):
                    for j in range(new_W):
                        conv_newH_new_W[i, j] += np.sum(
                            padded_x[i * stride:i * stride + HH, j * stride:j * stride + WW] * w[f, c, :, :])
            # accumulating over the C channels gives out for one image and one filter
            out[n, f] = conv_newH_new_W

    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    cache = (x, w, b, conv_param)
    return out, cache
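
A quick sanity check of the output-shape formula. The shapes and parameter values below are arbitrary, made up just for illustration:

import numpy as np

# hypothetical tiny input: 2 images, 3 channels, 8x8 pixels
x = np.random.randn(2, 3, 8, 8)
# 4 filters of size 3x3, each spanning all 3 channels
w = np.random.randn(4, 3, 3, 3)
b = np.random.randn(4)
conv_param = {'stride': 2, 'pad': 1}

out, _ = conv_forward_naive(x, w, b, conv_param)
# H' = 1 + (8 + 2*1 - 3) / 2 = 4, and likewise for W'
print(out.shape)  # (2, 4, 4, 4)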

The backward pass:

def conv_backward_naive(dout, cache):
    """
    A naive implementation of the backward pass for a convolutional layer.

    Inputs:
    - dout: Upstream derivatives.
    - cache: A tuple of (x, w, b, conv_param) as in conv_forward_naive

    Returns a tuple of:
    - dx: Gradient with respect to x
    - dw: Gradient with respect to w
    - db: Gradient with respect to b
    """
    dx, dw, db = None, None, None
    ###########################################################################
    # TODO: Implement the convolutional backward pass.                        #
    ###########################################################################
    # unpack the cache
    x, w, b, conv_param = cache
    pad = conv_param['pad']
    stride = conv_param['stride']
    F, C, HH, WW = w.shape
    N, C, H, W = x.shape
    N, F, new_H, new_W = dout.shape

    # replay the convolution: first zero-pad x
    padded_x = np.lib.pad(x,
                          ((0, 0), (0, 0), (pad, pad), (pad, pad)),
                          mode='constant',
                          constant_values=0)
    padded_dx = np.zeros_like(padded_x)  # gradient w.r.t. the padded x; strip the padding at the end to get dx
    dw = np.zeros_like(w)
    db = np.zeros_like(b)

    for n in range(N):       # n-th image
        for f in range(F):   # f-th filter
            for i in range(new_H):
                for j in range(new_W):
                    # dw sums the contribution of every output pixel, because all
                    # output pixels share the same filter weights
                    db[f] += dout[n, f, i, j]  # d(out)/d(b) is 1, so db just accumulates dout
                    dw[f] += padded_x[n, :, i*stride : HH + i*stride, j*stride : WW + j*stride] * dout[n, f, i, j]
                    padded_dx[n, :, i*stride : HH + i*stride, j*stride : WW + j*stride] += w[f] * dout[n, f, i, j]
    # strip the padding
    dx = padded_dx[:, :, pad:pad + H, pad:pad + W]

    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return dx, dw, db
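
To convince yourself the backward pass is correct, compare it against a numerical gradient. Below is a minimal, self-contained central-difference check; the shapes and the scalar function np.sum(out * dout) are chosen arbitrarily (the assignment notebook does the same kind of check with its own helper):

import numpy as np

def num_grad(f, a, h=1e-5):
    # central-difference numerical gradient of the scalar function f w.r.t. array a
    grad = np.zeros_like(a)
    it = np.nditer(a, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old = a[idx]
        a[idx] = old + h
        fp = f()
        a[idx] = old - h
        fm = f()
        a[idx] = old
        grad[idx] = (fp - fm) / (2 * h)
        it.iternext()
    return grad

x = np.random.randn(2, 3, 6, 6)
w = np.random.randn(2, 3, 3, 3)
b = np.random.randn(2)
dout = np.random.randn(2, 2, 6, 6)
conv_param = {'stride': 1, 'pad': 1}

out, cache = conv_forward_naive(x, w, b, conv_param)
dx, dw, db = conv_backward_naive(dout, cache)

# scalar function whose gradient with respect to out is exactly dout
f = lambda: np.sum(conv_forward_naive(x, w, b, conv_param)[0] * dout)
print(np.max(np.abs(dw - num_grad(f, w))))  # should be tiny, around 1e-8 or less
print(np.max(np.abs(dx - num_grad(f, x))))
print(np.max(np.abs(db - num_grad(f, b))))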

Next, the max pool layer:

def max_pool_forward_naive(x, pool_param):
    """
    A naive implementation of the forward pass for a max pooling layer.

    Inputs:
    - x: Input data, of shape (N, C, H, W)
    - pool_param: dictionary with the following keys:
      - 'pool_height': The height of each pooling region
      - 'pool_width': The width of each pooling region
      - 'stride': The distance between adjacent pooling regions

    Returns a tuple of:
    - out: Output data
    - cache: (x, pool_param)
    """
    out = None
    ###########################################################################
    # TODO: Implement the max pooling forward pass                            #
    ###########################################################################
    N, C, H, W = x.shape
    pool_height = pool_param['pool_height']  # height of the pooling window
    pool_width = pool_param['pool_width']    # width of the pooling window
    pool_stride = pool_param['stride']       # stride of the pooling window
    new_H = 1 + int((H - pool_height) / pool_stride)  # output height
    new_W = 1 + int((W - pool_width) / pool_stride)   # output width
    out = np.zeros([N, C, new_H, new_W])
    for n in range(N):
        for c in range(C):
            for i in range(new_H):
                for j in range(new_W):
                    out[n, c, i, j] = np.max(
                        x[n, c, i*pool_stride : i*pool_stride+pool_height, j*pool_stride : j*pool_stride+pool_width])

    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    cache = (x, pool_param)
    return out, cache


def max_pool_backward_naive(dout, cache):
    """
    A naive implementation of the backward pass for a max pooling layer.

    Inputs:
    - dout: Upstream derivatives
    - cache: A tuple of (x, pool_param) as in the forward pass.

    Returns:
    - dx: Gradient with respect to x
    """
    dx = None
    ###########################################################################
    # TODO: Implement the max pooling backward pass                           #
    ###########################################################################
    # only the max element of each pooling window receives the upstream gradient
    x, pool_param = cache
    N, C, H, W = x.shape
    pool_height = pool_param['pool_height']
    pool_width = pool_param['pool_width']
    pool_stride = pool_param['stride']
    new_H = 1 + int((H - pool_height) / pool_stride)
    new_W = 1 + int((W - pool_width) / pool_stride)
    dx = np.zeros_like(x)
    for n in range(N):
        for c in range(C):
            for i in range(new_H):
                for j in range(new_W):
                    window = x[n, c, i * pool_stride: i * pool_stride + pool_height,
                               j * pool_stride: j * pool_stride + pool_width]
                    # the mask (window == max) is 1 at the argmax and 0 elsewhere
                    dx[n, c, i * pool_stride: i * pool_stride + pool_height,
                       j * pool_stride: j * pool_stride + pool_width] = (window == np.max(window)) * dout[n, c, i, j]

    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return dx
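
A tiny worked example (the input values are made up): pooling a 4x4 input with a 2x2 window and stride 2 keeps one maximum per window, and the backward pass routes the upstream gradient only to those positions.

import numpy as np

x = np.arange(16, dtype=float).reshape(1, 1, 4, 4)  # a single-channel 4x4 "image": 0..15
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

out, cache = max_pool_forward_naive(x, pool_param)
print(out[0, 0])
# [[ 5.  7.]
#  [13. 15.]]

dx = max_pool_backward_naive(np.ones_like(out), cache)
print(dx[0, 0])  # 1 at the argmax of each 2x2 window, 0 everywhere else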

The above are only naive implementations to get a feel for the basic convolution and max-pool layers. They are not what gets used in practice, because much faster versions exist.
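
One common way to speed up the forward pass is the im2col trick: unfold every receptive field into a column, so the whole convolution becomes a single matrix multiply. The sketch below only illustrates the idea; it is not the assignment's conv_forward_fast, and the name conv_forward_im2col is made up here.

import numpy as np

def conv_forward_im2col(x, w, b, conv_param):
    # same interface assumptions as conv_forward_naive above
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    stride, pad = conv_param['stride'], conv_param['pad']
    new_H = 1 + (H + 2 * pad - HH) // stride
    new_W = 1 + (W + 2 * pad - WW) // stride

    xp = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')
    # one column per (image, output position), each holding a C*HH*WW receptive field
    cols = np.zeros((C * HH * WW, N * new_H * new_W))
    col = 0
    for n in range(N):
        for i in range(new_H):
            for j in range(new_W):
                patch = xp[n, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
                cols[:, col] = patch.reshape(-1)
                col += 1
    # (F, C*HH*WW) @ (C*HH*WW, N*new_H*new_W) -> (F, N*new_H*new_W)
    res = w.reshape(F, -1) @ cols + b.reshape(F, 1)
    out = res.reshape(F, N, new_H, new_W).transpose(1, 0, 2, 3)
    return out

The backward pass can reuse the same column matrix (the inverse operation is usually called col2im); the assignment's fast layers implement this with compiled helpers.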

The fast versions are then used to define the "sandwich" layers:

def conv_relu_forward(x, w, b, conv_param):
    """
    A convenience layer that performs a convolution followed by a ReLU.

    Inputs:
    - x: Input to the convolutional layer
    - w, b, conv_param: Weights and parameters for the convolutional layer

    Returns a tuple of:
    - out: Output from the ReLU
    - cache: Object to give to the backward pass
    """
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
    out, relu_cache = relu_forward(a)
    cache = (conv_cache, relu_cache)
    return out, cache


def conv_relu_backward(dout, cache):
    """
    Backward pass for the conv-relu convenience layer.
    """
    conv_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = conv_backward_fast(da, conv_cache)
    return dx, dw, db
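
The ConvNet in cnn.py below also calls conv_relu_pool_forward / conv_relu_pool_backward. They are not shown in this post, but they follow the same sandwich pattern; a minimal sketch, assuming the same fast conv, ReLU, and pooling helpers:

def conv_relu_pool_forward(x, w, b, conv_param, pool_param):
    # conv -> relu -> max pool, chaining the fast layers
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
    s, relu_cache = relu_forward(a)
    out, pool_cache = max_pool_forward_fast(s, pool_param)
    cache = (conv_cache, relu_cache, pool_cache)
    return out, cache


def conv_relu_pool_backward(dout, cache):
    # undo the three stages in reverse order
    conv_cache, relu_cache, pool_cache = cache
    ds = max_pool_backward_fast(dout, pool_cache)
    da = relu_backward(ds, relu_cache)
    dx, dw, db = conv_backward_fast(da, conv_cache)
    return dx, dw, db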

cnn.py implements the three-layer ConvNet:


class ThreeLayerConvNet(object):
    """
    A three-layer convolutional network with the following architecture:

    conv - relu - 2x2 max pool - affine - relu - affine - softmax

    The network operates on minibatches of data that have shape (N, C, H, W)
    consisting of N images, each with height H and width W and with C input
    channels.
    """

    def __init__(self, input_dim=(3, 32, 32), num_filters=32, filter_size=7,
                 hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0.0,
                 dtype=np.float32):
        """
        Initialize a new network.

        Inputs:
        - input_dim: Tuple (C, H, W) giving size of input data
        - num_filters: Number of filters to use in the convolutional layer
        - filter_size: Size of filters to use in the convolutional layer
        - hidden_dim: Number of units to use in the fully-connected hidden layer
        - num_classes: Number of scores to produce from the final affine layer.
        - weight_scale: Scalar giving standard deviation for random initialization
          of weights.
        - reg: Scalar giving L2 regularization strength
        - dtype: numpy datatype to use for computation.
        """
        self.params = {}
        self.reg = reg
        self.dtype = dtype

        ############################################################################
        # TODO: Initialize weights and biases for the three-layer convolutional    #
        # network. Weights should be initialized from a Gaussian with standard     #
        # deviation equal to weight_scale; biases should be initialized to zero.   #
        # All weights and biases should be stored in the dictionary self.params.   #
        # Store weights and biases for the convolutional layer using the keys 'W1' #
        # and 'b1'; use keys 'W2' and 'b2' for the weights and biases of the       #
        # hidden affine layer, and keys 'W3' and 'b3' for the weights and biases   #
        # of the output affine layer.                                              #
        ############################################################################
        C, H, W = input_dim
        # W1: parameters of the conv layer
        self.params['W1'] = weight_scale * np.random.randn(num_filters, C, filter_size, filter_size)
        self.params['b1'] = np.zeros(num_filters)
        # W2: flattened max-pool output -> hidden layer. The conv layer preserves
        # H x W (stride 1, pad = (filter_size - 1) // 2) and the 2x2 pool halves both.
        self.params['W2'] = weight_scale * np.random.randn(int(H / 2) * int(W / 2) * num_filters, hidden_dim)
        self.params['b2'] = np.zeros(hidden_dim)
        # W3: hidden layer -> output scores
        self.params['W3'] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params['b3'] = np.zeros(num_classes)
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        for k, v in self.params.items():
            self.params[k] = v.astype(dtype)


    def loss(self, X, y=None):
        """
        Evaluate loss and gradient for the three-layer convolutional network.

        Input / output: Same API as TwoLayerNet in fc_net.py.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']

        # pass conv_param to the forward pass for the convolutional layer
        filter_size = W1.shape[2]
        conv_param = {'stride': 1, 'pad': (filter_size - 1) // 2}

        # pass pool_param to the forward pass for the max-pooling layer
        pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the three-layer convolutional net,  #
        # computing the class scores for X and storing them in the scores          #
        # variable.                                                                #
        ############################################################################
        conv_forward_out_1, cache_forward_1 = conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)
        affine_out_2, cache_forward_2 = affine_relu_forward(conv_forward_out_1, W2, b2)
        scores, cache_forward_3 = affine_forward(affine_out_2, W3, b3)

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        if y is None:
            return scores

        loss, grads = 0, {}
        ############################################################################
        # TODO: Implement the backward pass for the three-layer convolutional net, #
        # storing the loss and gradients in the loss and grads variables. Compute  #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        ############################################################################
        loss, dscore = softmax_loss(scores, y)
        # da2: gradient with respect to affine_out_2
        da2, grads['W3'], grads['b3'] = affine_backward(dscore, cache_forward_3)
        # da1: gradient with respect to the conv-relu-pool output
        da1, grads['W2'], grads['b2'] = affine_relu_backward(da2, cache_forward_2)
        _, grads['W1'], grads['b1'] = conv_relu_pool_backward(da1, cache_forward_1)

        loss += 0.5 * self.reg * (np.sum(W1 ** 2) + np.sum(W2 ** 2) + np.sum(W3 ** 2))

        grads['W1'] += self.reg * W1
        grads['W2'] += self.reg * W2
        grads['W3'] += self.reg * W3

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
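
A quick smoke test on random data (all numbers below are arbitrary). At initialization, with small weights, the softmax data loss should come out near ln(10) ≈ 2.3 for 10 classes, plus a small regularization term:

import numpy as np

model = ThreeLayerConvNet(input_dim=(3, 32, 32), num_filters=8, filter_size=3,
                          hidden_dim=50, num_classes=10, reg=0.1)
X = np.random.randn(4, 3, 32, 32)
y = np.random.randint(10, size=4)

loss, grads = model.loss(X, y)
print(loss)                  # roughly 2.3 plus the small L2 term
print(sorted(grads.keys()))  # ['W1', 'W2', 'W3', 'b1', 'b2', 'b3']
print(model.loss(X).shape)   # scores only when y is None: (4, 10)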