```python
# Load the raw CIFAR-10 data.
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

# As a sanity check, we print out the size of the training and test data.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
```
As we can see, each image is $32\times32\times3$; the training set contains 50,000 images and the test set 10,000.
```
Training data shape:  (50000, 32, 32, 3)
Training labels shape:  (50000,)
Test data shape:  (10000, 32, 32, 3)
Test labels shape:  (10000,)
```
To make the computation faster, we use only 5,000 images for training and 500 for testing.
```python
# Subsample the data for more efficient code execution in this exercise
num_training = 5000
mask = list(range(num_training))
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 500
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)
```
Since kNN has no real training phase, we simply store the data first:
```python
from cs231n.classifiers import KNearestNeighbor

# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
```
```python
def compute_distances_two_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a nested loop over both the training data and the
    test data.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
        for j in range(num_train):
            #####################################################################
            # TODO:                                                             #
            # Compute the l2 distance between the ith test point and the jth    #
            # training point, and store the result in dists[i, j]. You should   #
            # not use a loop over dimension.                                    #
            #####################################################################
            dists[i, j] = np.sqrt(np.sum(np.square(X[i, :] - self.X_train[j, :])))
            #####################################################################
            #                       END OF YOUR CODE                            #
            #####################################################################
    return dists
```
```python
def predict_labels(self, dists, k=1):
    """
    Given a matrix of distances between test points and training points,
    predict a label for each test point.

    Inputs:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      gives the distance between the ith test point and the jth training point.

    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i].
    """
    num_test = dists.shape[0]
    y_pred = np.zeros(num_test)
    for i in range(num_test):
        # A list of length k storing the labels of the k nearest neighbors to
        # the ith test point.
        closest_y = []
        #########################################################################
        # TODO:                                                                 #
        # Use the distance matrix to find the k nearest neighbors of the ith    #
        # testing point, and use self.y_train to find the labels of these       #
        # neighbors. Store these labels in closest_y.                           #
        # Hint: Look up the function numpy.argsort.                             #
        #########################################################################
        # For the ith test image, take the k closest of the 5000 training
        # images and collect their labels.
        closest_y = self.y_train[np.argsort(dists[i])[:k]]
        #########################################################################
        # TODO:                                                                 #
        # Now that you have found the labels of the k nearest neighbors, you    #
        # need to find the most common label in the list closest_y of labels.   #
        # Store this label in y_pred[i]. Break ties by choosing the smaller     #
        # label.                                                                #
        #########################################################################
        # Let the k neighbors vote: the label with the most votes wins.
        y_pred[i] = np.argmax(np.bincount(closest_y))
        #########################################################################
        #                           END OF YOUR CODE                            #
        #########################################################################
    return y_pred
```
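One detail worth spelling out: `np.bincount` tallies the votes per label (labels here are small nonnegative ints, 0 through 9), and `np.argmax` returns the first index of the maximum, i.e. the smallest label among ties, which is exactly the tie-breaking rule the TODO asks for. A quick illustration:

```python
import numpy as np

votes = np.array([2, 2, 7, 7, 1])   # labels of the k nearest neighbors
counts = np.bincount(votes)         # counts[label] = number of votes
print(counts)                       # [0 1 2 0 0 0 0 2]
print(np.argmax(counts))            # 2: labels 2 and 7 tie, smaller label wins
```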
Let's make predictions:
```python
# First compute the distance matrix with the two-loop implementation.
dists = classifier.compute_distances_two_loops(X_test)

# Now implement the function predict_labels and run the code below:
# We use k = 1 (which is Nearest Neighbor).
y_test_pred = classifier.predict_labels(dists, k=1)
```
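To sanity-check the k = 1 predictions, one can measure the fraction of correctly predicted test labels; a minimal sketch along the lines of the notebook's check:

```python
# Compute and print the fraction of correctly predicted examples.
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
```

With correctness established, the next step is to speed up the distance computation.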
```python
def compute_distances_one_loop(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a single loop over the test data.

    Input / Output: Same as compute_distances_two_loops
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
        #######################################################################
        # TODO:                                                               #
        # Compute the l2 distance between the ith test point and all training #
        # points, and store the result in dists[i, :].                        #
        #######################################################################
        # Use NumPy broadcasting to compute the distance from one test image
        # to all 5000 training images in a single expression.
        dists[i, :] = np.sqrt(np.sum(np.square(self.X_train - X[i, :]), axis=1))
        #######################################################################
        #                         END OF YOUR CODE                            #
        #######################################################################
    return dists
```
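The key line relies on NumPy broadcasting: subtracting a `(D,)` row vector from a `(num_train, D)` matrix subtracts it from every row at once. A minimal sketch with toy shapes (array names here are illustrative, not from the assignment):

```python
import numpy as np

train = np.arange(15, dtype=float).reshape(5, 3)  # 5 "training" points, D = 3
test_row = np.array([1.0, 2.0, 3.0])              # one "test" point, shape (3,)

# (5, 3) - (3,) broadcasts test_row against every row of train,
# so diff[j] = train[j] - test_row for each training point j.
diff = train - test_row
row_dists = np.sqrt(np.sum(np.square(diff), axis=1))  # shape (5,)
print(row_dists)
```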
Verify that the one-loop result matches the two-loop distance matrix:
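A sketch of the comparison cell, assuming `dists` from the two-loop run above: compute the one-loop matrix and measure the Frobenius norm of the elementwise difference.

```python
dists_one = classifier.compute_distances_one_loop(X_test)

# If the two implementations agree, the norm of the difference is ~0.
difference = np.linalg.norm(dists - dists_one, ord='fro')
print('Difference was: %f' % (difference,))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')
```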
```
Difference was: 0.000000
Good! The distance matrices are the same
```
```python
def compute_distances_no_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using no explicit loops.

    Input / Output: Same as compute_distances_two_loops
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    #########################################################################
    # TODO:                                                                 #
    # Compute the l2 distance between all test points and all training     #
    # points without using any explicit loops, and store the result in     #
    # dists.                                                                #
    #                                                                       #
    # You should implement this function using only basic array operations; #
    # in particular you should not use functions from scipy.                #
    #                                                                       #
    # HINT: Try to formulate the l2 distance using matrix multiplication    #
    #       and two broadcast sums.                                         #
    #########################################################################
    temp_2xy = np.dot(X, self.X_train.T) * (-2)
    temp_x2 = np.sum(np.square(X), axis=1, keepdims=True)
    temp_y2 = np.sum(np.square(self.X_train), axis=1)
    dists = temp_x2 + temp_2xy + temp_y2
    dists = np.sqrt(dists)
    #########################################################################
    #                         END OF YOUR CODE                              #
    #########################################################################
    return dists
```
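The fully vectorized version rests on the identity $\|x-y\|^2 = \|x\|^2 - 2\,x\cdot y + \|y\|^2$: the matrix product $XY^\top$ supplies every cross term at once, and the two squared-norm vectors are added in via broadcasting (a column of shape `(num_test, 1)` plus a row of shape `(num_train,)`). A minimal self-contained check against a direct nested computation (toy shapes, names illustrative):

```python
import numpy as np

X = np.random.randn(4, 6)        # 4 "test" points
X_train = np.random.randn(7, 6)  # 7 "training" points

# ||x||^2 as a (4, 1) column, ||y||^2 as a (7,) row, -2 x.y as (4, 7);
# broadcasting adds them into a full (4, 7) matrix of squared distances.
sq = np.sum(X**2, axis=1, keepdims=True) - 2 * X.dot(X_train.T) + np.sum(X_train**2, axis=1)
vec = np.sqrt(sq)

# Direct nested-loop reference.
ref = np.array([[np.sqrt(np.sum((x - y) ** 2)) for y in X_train] for x in X])
print(np.allclose(vec, ref))  # True
```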
Comparing the running times of the three versions: I am not sure why the two-loop version beats the one-loop version here, since in theory fewer Python loops should mean less time. One plausible reason is that the one-loop version materializes a large `(num_train, D)` temporary array on every iteration of the broadcasted subtraction.
```
Two loop version took 24.510484 seconds
One loop version took 56.412211 seconds
No loop version took 0.183508 seconds
```
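For reference, a minimal harness that could produce timings like these, assuming the `classifier` and `X_test` defined above (the `time_function` helper is a sketch):

```python
import time

def time_function(f, *args):
    """Call f with args and return the elapsed wall-clock time in seconds."""
    tic = time.time()
    f(*args)
    toc = time.time()
    return toc - tic

two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
print('Two loop version took %f seconds' % two_loop_time)

one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
print('One loop version took %f seconds' % one_loop_time)

no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)
print('No loop version took %f seconds' % no_loop_time)
```

Next, use cross-validation to choose the best value of k.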
```python
# Fold count and candidate k values for cross-validation.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
################################################################################
# TODO:                                                                        #
# Split up the training data into folds. After splitting, X_train_folds and   #
# y_train_folds should each be lists of length num_folds, where               #
# y_train_folds[i] is the label vector for the points in X_train_folds[i].    #
# Hint: Look up the numpy array_split function.                               #
################################################################################
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)
################################################################################
#                                END OF YOUR CODE                              #
################################################################################
```
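Note that `np.array_split`, unlike `np.split`, tolerates splits that do not divide the array evenly; with 5000 training points and 5 folds, each fold here holds exactly 1000 points anyway. A quick illustration of the uneven case:

```python
import numpy as np

folds = np.array_split(np.arange(10), 3)    # uneven split is allowed
print([f.tolist() for f in folds])          # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
```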
```python
# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {}

################################################################################
# TODO:                                                                        #
# Perform k-fold cross validation to find the best value of k. For each       #
# possible value of k, run the k-nearest-neighbor algorithm num_folds times,  #
# where in each case you use all but one of the folds as training data and    #
# the last fold as a validation set. Store the accuracies for all fold and    #
# all values of k in the k_to_accuracies dictionary.                          #
################################################################################
classifier = KNearestNeighbor()
for k in k_choices:
    accuracies = []
    for fold in range(num_folds):
        # Hold out the current fold for validation, train on the rest.
        temp_X = X_train_folds[:]
        temp_y = y_train_folds[:]
        X_val_fold = temp_X.pop(fold)
        y_val_fold = temp_y.pop(fold)
        temp_X = np.concatenate(temp_X)
        temp_y = np.concatenate(temp_y)
        classifier.train(temp_X, temp_y)
        y_val_pred = classifier.predict(X_val_fold, k=k)
        num_correct = np.sum(y_val_fold == y_val_pred)
        accuracies.append(num_correct / y_val_fold.shape[0])
    k_to_accuracies[k] = accuracies
################################################################################
#                                END OF YOUR CODE                              #
################################################################################

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
```
Plot the results:
```python
# plot the raw observations
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation
accuracies_mean = np.array([np.mean(v) for k, v in sorted(k_to_accuracies.items())])
accuracies_std = np.array([np.std(v) for k, v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()
```
Finally, retrain on the full training set with the best k from the plot and evaluate on the test set:

```python
# Based on the cross-validation results above, choose the best value for k,
# retrain the classifier using all the training data, and test it on the test
# data. You should be able to get above 28% accuracy on the test data.
best_k = 10

classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=best_k)
num_correct = np.sum(y_test_pred == y_test)
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, float(num_correct) / num_test))
```