import numpy as np

# The layer helpers used below (affine_forward, affine_relu_forward,
# dropout_forward, softmax_loss, ...) are assumed to come from the
# assignment's layers / layer_utils modules.
from cs231n.layers import *
from cs231n.layer_utils import *


class FullyConnectedNet(object):
    """
    A fully-connected neural network with an arbitrary number of hidden
    layers, ReLU nonlinearities, and a softmax loss function. This will
    also implement dropout and batch normalization as options. For a
    network with L layers, the architecture will be

    {affine - [batch norm] - relu - [dropout]} x (L - 1) - affine - softmax

    where batch normalization and dropout are optional, and the {...} block
    is repeated L - 1 times.

    Similar to the TwoLayerNet above, learnable parameters are stored in the
    self.params dictionary and will be learned using the Solver class.
    """
    def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
                 dropout=0, use_batchnorm=False, reg=0.0,
                 weight_scale=1e-2, dtype=np.float32, seed=None):
        """
        Initialize a new FullyConnectedNet.

        Inputs:
        - hidden_dims: A list of integers giving the size of each hidden layer.
        - input_dim: An integer giving the size of the input.
        - num_classes: An integer giving the number of classes to classify.
        - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0
          then the network should not use dropout at all.
        - use_batchnorm: Whether or not the network should use batch normalization.
        - reg: Scalar giving L2 regularization strength.
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - dtype: A numpy datatype object; all computations will be performed
          using this datatype. float32 is faster but less accurate, so you
          should use float64 for numeric gradient checking.
        - seed: If not None, then pass this random seed to the dropout layers.
          This will make the dropout layers deterministic so we can gradient
          check the model.
        """
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}

        # Initialize weights, biases, and (optionally) batchnorm scale/shift
        # parameters for each hidden layer.
        n_i_prev = input_dim
        for i, n_i in enumerate(hidden_dims):
            self.params['W' + str(i+1)] = np.random.randn(n_i_prev, n_i) * weight_scale
            self.params['b' + str(i+1)] = np.zeros((n_i,))
            if self.use_batchnorm:
                self.params['gamma' + str(i+1)] = np.ones((n_i,))
                self.params['beta' + str(i+1)] = np.zeros((n_i,))
            n_i_prev = n_i

        # Parameters for the final affine layer that produces class scores.
        self.params['W' + str(self.num_layers)] = np.random.randn(n_i_prev, num_classes) * weight_scale
        self.params['b' + str(self.num_layers)] = np.zeros((num_classes,))

        # When using dropout, pass a dropout_param dictionary to each dropout
        # layer so it knows the dropout probability and the mode (train/test).
        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed

        # With batch normalization, each batchnorm layer gets its own bn_param
        # dictionary to track running means and variances.
        self.bn_params = []
        if self.use_batchnorm:
            self.bn_params = [{'mode': 'train'} for i in range(self.num_layers - 1)]

        # Cast all parameters to the requested datatype.
        for k, v in self.params.items():
            self.params[k] = v.astype(dtype)
    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.

        Input / output: Same as TwoLayerNet above.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # Set train/test mode for the dropout and batchnorm params, since these
        # layers behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode

        scores = None

        # Forward pass: {affine - [batch norm] - relu - [dropout]} x (L - 1),
        # caching each layer's intermediates for the backward pass.
        A_prev = X
        fc_mix_cache = []
        drop_cache = []
        for i in range(self.num_layers - 1):
            W, b = self.params['W' + str(i+1)], self.params['b' + str(i+1)]
            if self.use_batchnorm:
                gamma = self.params['gamma' + str(i+1)]
                beta = self.params['beta' + str(i+1)]
                A, A_cache = affine_bn_relu_forword(A_prev, W, b, gamma, beta, self.bn_params[i])
            else:
                A, A_cache = affine_relu_forward(A_prev, W, b)
            if self.use_dropout:
                A, drop_ch = dropout_forward(A, self.dropout_param)
                drop_cache.append(drop_ch)
            A_prev = A
            fc_mix_cache.append(A_cache)

        # Final affine layer produces the class scores.
        W, b = self.params['W' + str(self.num_layers)], self.params['b' + str(self.num_layers)]
        ZL, ZL_cache = affine_forward(A_prev, W, b)
        scores = ZL

        # If test mode, return scores early.
        if mode == 'test':
            return scores

        loss, grads = 0.0, {}

        # Softmax loss plus L2 regularization on the last-layer weights.
        loss, dout = softmax_loss(scores, y)
        loss += 0.5 * self.reg * np.sum(self.params['W' + str(self.num_layers)]**2)

        # Backward pass through the final affine layer.
        dA_prev, dwl, dbl = affine_backward(dout, ZL_cache)
        grads['W' + str(self.num_layers)] = dwl + self.reg * self.params['W' + str(self.num_layers)]
        grads['b' + str(self.num_layers)] = dbl

        # Backpropagate through the hidden layers in reverse order, adding the
        # L2 regularization term for each weight matrix as we go.
        for i in range(self.num_layers - 1, 0, -1):
            loss += 0.5 * self.reg * np.sum(self.params['W' + str(i)]**2)
            if self.use_dropout:
                dA_prev = dropout_backward(dA_prev, drop_cache[i-1])
            if self.use_batchnorm:
                dA_prev, dw, db, dgamma, dbeta = affine_bn_relu_backward(dA_prev, fc_mix_cache[i-1])
            else:
                dA_prev, dw, db = affine_relu_backward(dA_prev, fc_mix_cache[i-1])

            grads['W' + str(i)] = dw + self.reg * self.params['W' + str(i)]
            grads['b' + str(i)] = db
            if self.use_batchnorm:
                grads['gamma' + str(i)] = dgamma
                grads['beta' + str(i)] = dbeta

        return loss, grads
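

# The loss() method above calls affine_bn_relu_forword / affine_bn_relu_backward,
# which are not part of the stock layer_utils helpers and are not shown here.
# Below is a minimal sketch of how they could be composed from the standard
# affine, batchnorm, and ReLU layers; the original definitions may differ.
# (The "forword" spelling follows the call site in FullyConnectedNet.loss.)

def affine_bn_relu_forword(x, w, b, gamma, beta, bn_param):
    """Convenience layer sketch: affine -> batch norm -> ReLU."""
    a, fc_cache = affine_forward(x, w, b)
    a_bn, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(a_bn)
    cache = (fc_cache, bn_cache, relu_cache)
    return out, cache


def affine_bn_relu_backward(dout, cache):
    """Backward pass sketch for the affine -> batch norm -> ReLU layer."""
    fc_cache, bn_cache, relu_cache = cache
    da_bn = relu_backward(dout, relu_cache)
    da, dgamma, dbeta = batchnorm_backward(da_bn, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta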
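

# Minimal usage sketch (not part of the original file): build a small net on
# random data and numerically gradient-check it. The data shapes, hyperparameters,
# and the centered-difference check below are illustrative assumptions.

if __name__ == '__main__':
    np.random.seed(231)
    N, D, H1, H2, C = 2, 15, 20, 30, 10
    X = np.random.randn(N, D)
    y = np.random.randint(C, size=(N,))

    # float64 and a fixed dropout seed make the gradient check reliable.
    model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,
                              dropout=0.5, use_batchnorm=True,
                              reg=1e-2, weight_scale=5e-2,
                              dtype=np.float64, seed=123)

    loss, grads = model.loss(X, y)
    print('initial loss:', loss)

    # Compare each analytic gradient against a centered-difference estimate at
    # one randomly chosen coordinate per parameter.
    h = 1e-5
    for name, param in model.params.items():
        idx = tuple(np.random.randint(d) for d in param.shape)
        old = param[idx]
        param[idx] = old + h
        loss_plus, _ = model.loss(X, y)
        param[idx] = old - h
        loss_minus, _ = model.loss(X, y)
        param[idx] = old
        num_grad = (loss_plus - loss_minus) / (2 * h)
        rel_err = abs(num_grad - grads[name][idx]) / max(1e-8, abs(num_grad) + abs(grads[name][idx]))
        print('%s relative error: %.2e' % (name, rel_err))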