正则化以及优化-白红宇

正则化以及优化

阅读量：5122 次

发布时间：2019-06-13

本文共 36497 字，大约阅读时间需要 121 分钟。

权重的初始化：

1. 如果激活函数是 ReLU（He 初始化）：W^[L] = np.random.randn(n^[L], n^[L-1]) * np.sqrt(2 / n^[L-1])，其中 W^[L] 的形状为 (n^[L], n^[L-1])。

2. 如果激活函数是 tanh（Xavier 初始化）：W^[L] = np.random.randn(n^[L], n^[L-1]) * np.sqrt(1 / n^[L-1])，其中 W^[L] 的形状为 (n^[L], n^[L-1])。

def initialize_parameters_he(layers_dims):
    """
    Initialize a fully connected network with He initialization.

    Weights are drawn from a standard normal distribution and scaled by
    sqrt(2 / fan_in), the scaling recommended for ReLU activations; biases
    start at zero.

    Arguments:
    layers_dims -- python array (list) containing the size of each layer.

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
                    b1 -- bias vector of shape (layers_dims[1], 1)
                    ...
                    WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
                    bL -- bias vector of shape (layers_dims[L], 1)
    """

    # Fixed seed so that repeated calls give the same initial weights.
    np.random.seed(3)
    parameters = {}
    L = len(layers_dims) - 1  # integer representing the number of layers

    for l in range(1, L + 1):
        fan_in = layers_dims[l - 1]
        # He scaling uses 2 / fan_in (1 / fan_in would be Xavier, meant for tanh).
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], fan_in) * np.sqrt(2.0 / fan_in)
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))

    return parameters

正则化：

L2 Regularization : The standard way to avoid overfitting

1. Basic function:

import numpy as np
import matplotlib.pyplot as plt
import h5py
import sklearn
import sklearn.datasets
import sklearn.linear_model
import scipy.io


def sigmoid(x):
    """
    Compute the sigmoid of x

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(x)
    """
    s = 1/(1+np.exp(-x))
    return s


def relu(x):
    """
    Compute the relu of x

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- relu(x)
    """
    s = np.maximum(0,x)

    return s


# NOTE: this one-argument version is shadowed by the two-argument
# load_planar_dataset(randomness, seed) defined further down in this file;
# after the module is loaded, only the later definition is reachable.
def load_planar_dataset(seed):
    """
    Generate a two-class "flower" dataset of 400 points.

    Arguments:
    seed -- seed passed to np.random.seed

    Returns:
    X -- data matrix of shape (2, 400), one example per column
    Y -- label vector of shape (1, 400) (0 for red, 1 for blue)
    """

    np.random.seed(seed)

    m = 400 # number of examples
    N = int(m/2) # number of points per class
    D = 2 # dimensionality
    X = np.zeros((m,D)) # data matrix where each row is a single example
    Y = np.zeros((m,1), dtype='uint8') # labels vector (0 for red, 1 for blue)
    a = 4 # maximum ray of the flower

    for j in range(2):
        ix = range(N*j,N*(j+1))
        t = np.linspace(j*3.12,(j+1)*3.12,N) + np.random.randn(N)*0.2 # theta
        r = a*np.sin(4*t) + np.random.randn(N)*0.2 # radius
        X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
        Y[ix] = j

    X = X.T
    Y = Y.T

    return X, Y


def initialize_parameters(layer_dims):
    """
    Initialize weights with N(0, 1) samples scaled by 1/sqrt(fan_in) and biases with zeros.

    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)

    Tips:
    - For example: with layer_dims = [2,2,1], W1's shape is (2,2), b1 is (2,1),
      W2 is (1,2) and b2 is (1,1).
    - In the for loop, use parameters['W' + str(l)] to access Wl, where l is the iterative integer.
    """

    np.random.seed(3)
    parameters = {}
    L = len(layer_dims) # number of layers in the network

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

        # Compare against real shape tuples. The previous form,
        # assert(shape == a, b), asserted a non-empty tuple and could never fail.
        assert parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1])
        assert parameters['b' + str(l)].shape == (layer_dims[l], 1)

    return parameters


def forward_propagation(X, parameters):
    """
    Implements the forward propagation LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"

    Returns:
    A3 -- output of the last (sigmoid) activation
    cache -- tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) used by backward_propagation()
    """

    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

    return A3, cache


def backward_propagation(X, Y, cache):
    """
    Implement the backward propagation of the three-layer network.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector, same shape as the network output A3
    cache -- cache output from forward_propagation()

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    # Sigmoid output combined with cross-entropy loss gives dZ3 = A3 - Y.
    dZ3 = A3 - Y
    dW3 = 1./m * np.dot(dZ3, A2.T)
    db3 = 1./m * np.sum(dZ3, axis=1, keepdims = True)

    dA2 = np.dot(W3.T, dZ3)
    # ReLU derivative: 1 where the activation was positive, 0 elsewhere.
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1./m * np.dot(dZ2, A1.T)
    db2 = 1./m * np.sum(dZ2, axis=1, keepdims = True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1./m * np.dot(dZ1, X.T)
    db1 = 1./m * np.sum(dZ1, axis=1, keepdims = True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
                 "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
                 "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients


def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent

    Arguments:
    parameters -- python dictionary containing your parameters:
                    parameters['W' + str(i)] = Wi
                    parameters['b' + str(i)] = bi
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(i)] = dWi
                    grads['db' + str(i)] = dbi
    learning_rate -- the learning rate, scalar.

    Returns:
    parameters -- python dictionary containing your updated parameters
                  (the input dictionary is updated in place and returned)
    """

    n = len(parameters) // 2 # number of layers in the neural networks

    # Update rule for each parameter
    for k in range(1, n + 1):
        layer = str(k)
        parameters["W" + layer] = parameters["W" + layer] - learning_rate * grads["dW" + layer]
        parameters["b" + layer] = parameters["b" + layer] - learning_rate * grads["db" + layer]

    return parameters


def predict(X, y, parameters):
    """
    This function is used to predict the results of a  n-layer neural network.

    Arguments:
    X -- data set of examples you would like to label
    y -- true labels of shape (1, number of examples), used only to print the accuracy
    parameters -- parameters of the trained model

    Returns:
    p -- predictions for the given dataset X, int array of shape (1, number of examples)
    """

    m = X.shape[1]
    # Builtin int instead of np.int: the alias was removed in NumPy 1.24.
    p = np.zeros((1,m), dtype = int)

    # Forward propagation
    a3, caches = forward_propagation(X, parameters)

    # convert probas to 0/1 predictions (threshold 0.5)
    p[0, :] = a3[0, :] > 0.5

    print("Accuracy: "  + str(np.mean((p[0,:] == y[0,:]))))

    return p


def compute_cost(a3, Y):
    """
    Implement the cost function

    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3

    Returns:
    cost - value of the cost function
    """
    m = Y.shape[1]

    logprobs = np.multiply(-np.log(a3),Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    # nansum skips NaN entries, which appear when a3 is exactly 0 or 1 (0 * inf).
    cost = 1./m * np.nansum(logprobs)

    return cost


def load_dataset():
    """
    Load the cat / non-cat image dataset from the HDF5 files under datasets/.

    Returns:
    train_set_x -- flattened training images scaled by 1/255, one example per column
    train_set_y -- training labels of shape (1, number of training examples)
    test_set_x -- flattened test images scaled by 1/255, one example per column
    test_set_y -- test labels of shape (1, number of test examples)
    classes -- array with the list of classes stored in the test file
    """
    # Context managers make sure both HDF5 files are closed after reading.
    with h5py.File('datasets/train_catvnoncat.h5', "r") as train_dataset:
        train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features
        train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels

    with h5py.File('datasets/test_catvnoncat.h5', "r") as test_dataset:
        test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features
        test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your test set labels
        classes = np.array(test_dataset["list_classes"][:]) # the list of classes

    train_set_y = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))

    train_set_x_orig = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
    test_set_x_orig = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T

    train_set_x = train_set_x_orig/255
    test_set_x = test_set_x_orig/255

    return train_set_x, train_set_y, test_set_x, test_set_y, classes


def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.

    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data, passed unchanged to forward_propagation()

    Returns
    predictions -- boolean array, True where the model output exceeds 0.5 (red: 0 / blue: 1)
    """

    # Predict using forward propagation and a classification threshold of 0.5
    a3, cache = forward_propagation(X, parameters)
    predictions = (a3>0.5)
    return predictions


def load_planar_dataset(randomness, seed):
    """
    Generate a small two-class spiral dataset of 50 points.

    Arguments:
    randomness -- standard deviation of the Gaussian noise added to each radius
    seed -- seed passed to np.random.seed

    Returns:
    X -- data matrix of shape (2, 50), one example per column
    Y -- label vector of shape (1, 50) (0 for red, 1 for blue)
    """

    np.random.seed(seed)

    m = 50
    N = int(m/2) # number of points per class
    D = 2 # dimensionality
    X = np.zeros((m,D)) # data matrix where each row is a single example
    Y = np.zeros((m,1), dtype='uint8') # labels vector (0 for red, 1 for blue)
    a = 2 # maximum ray of the flower

    for j in range(2):

        ix = range(N*j,N*(j+1))
        if j == 0:
            t = np.linspace(j, 4*3.1415*(j+1),N) #+ np.random.randn(N)*randomness # theta
            r = 0.3*np.square(t) + np.random.randn(N)*randomness # radius
        if j == 1:
            t = np.linspace(j, 2*3.1415*(j+1),N) #+ np.random.randn(N)*randomness # theta
            r = 0.2*np.square(t) + np.random.randn(N)*randomness # radius

        X[ix] = np.c_[r*np.cos(t), r*np.sin(t)]
        Y[ix] = j

    X = X.T
    Y = Y.T

    return X, Y


def plot_decision_boundary(model, X, y):
    """
    Plot the decision regions of a 2-D classifier together with the data points.

    Arguments:
    model -- callable taking an array of grid points (one point per row) and returning one prediction per point
    X -- data of shape (2, number of examples)
    y -- labels used to color the scattered points
    """
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=np.squeeze(y), cmap=plt.cm.Spectral)
    plt.show()


def load_2D_dataset():
    """
    Load the 2-D train/validation dataset stored in datasets/data.mat.

    Returns:
    train_X, train_Y, test_X, test_Y -- the 'X', 'y', 'Xval' and 'yval' arrays
    of the .mat file, each transposed
    """
    data = scipy.io.loadmat('datasets/data.mat')
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T

    #plt.scatter(train_X[0, :], train_X[1, :], c=np.squeeze(train_Y), s=40, cmap=plt.cm.Spectral);

    return train_X, train_Y, test_X, test_Y

2. Build Model

def model(X, Y, learning_rate = 0.3, num_iterations = 30000, print_cost = True, lambd = 0, keep_prob = 1):
    """
    Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (output size, number of examples)
    learning_rate -- learning rate of the optimization
    num_iterations -- number of iterations of the optimization loop
    print_cost -- If True, print the cost every 10000 iterations and record it every 1000 iterations for the plot
    lambd -- regularization hyperparameter, scalar
    keep_prob - probability of keeping a neuron active during drop-out, scalar in (0, 1].

    Returns:
    parameters -- parameters learned by the model. They can then be used to predict.

    Raises:
    ValueError -- if keep_prob is not in (0, 1]
    AssertionError -- if both L2 regularization (lambd != 0) and dropout (keep_prob < 1) are requested
    """

    # Reject values for which neither forward-propagation branch would be valid.
    if not 0 < keep_prob <= 1:
        raise ValueError("keep_prob must be in (0, 1], got {}".format(keep_prob))

    # it is possible to use both L2 regularization and dropout,
    # but this assignment will only explore one at a time.
    # The condition is loop-invariant, so it is checked once up front.
    assert(lambd==0 or keep_prob==1)

    grads = {}
    costs = []                            # to keep track of the cost
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialize parameters dictionary.
    parameters = initialize_parameters(layers_dims)

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
        if keep_prob == 1:
            a3, cache = forward_propagation(X, parameters)
        else:
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)

        # Cost function
        if lambd == 0:
            cost = compute_cost(a3, Y)
        else:
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

        # Backward propagation (exactly one of the three variants applies).
        if lambd != 0:
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob < 1:
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)
        else:
            grads = backward_propagation(X, Y, cache)

        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)

        # Print the loss every 10000 iterations
        if print_cost and i % 10000 == 0:
            print("Cost after iteration {}: {}".format(i, cost))
        if print_cost and i % 1000 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

3. Build the model without any regularization

# Load the 2-D dataset used by all the experiments below; the listings in this
# article use train_X / train_Y / test_X / test_Y without ever defining them.
train_X, train_Y, test_X, test_Y = load_2D_dataset()

# Baseline: train the three-layer network with neither L2 regularization nor dropout.
parameters = model(train_X, train_Y)
print ("On the training set:")
predictions_train = predict(train_X, train_Y, parameters)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)

# Plot the learned decision boundary over the training points.
plt.title("Model without regularization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
# plot_decision_boundary passes grid points one per row; predict_dec expects one per column.
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

4. Build the model with L2 regularization:

# L2 regularization compute_cost function:

def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """
    Implement the cost function with L2 regularization:
    cross-entropy cost + lambd / (2 * m) * (sum of the squared entries of every weight matrix).

    Arguments:
    A3 -- post-activation, output of forward propagation, of shape (output size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    parameters -- python dictionary containing parameters of the model ("W1", "b1", ..., "WL", "bL")
    lambd -- regularization hyperparameter, scalar

    Returns:
    cost - value of the regularized loss function
    """
    m = Y.shape[1]

    cross_entropy_cost = compute_cost(A3, Y) # This gives you the cross-entropy part of the cost

    # Penalize every weight matrix W1..WL instead of hard-coding three layers;
    # the layer count is derived the same way update_parameters() does it.
    num_layers = len(parameters) // 2
    squared_weight_sum = sum(np.sum(np.square(parameters["W" + str(l)]))
                             for l in range(1, num_layers + 1))
    L2_regularization_cost = (1.0/(2*m))*lambd*squared_weight_sum

    cost = cross_entropy_cost + L2_regularization_cost

    return cost


# L2 backward_propagation regularization:

def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Implements the backward propagation of our baseline model to which we added an L2 regularization.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    cache -- cache output from forward_propagation()
    lambd -- regularization hyperparameter, scalar

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """

    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y

    # Each weight gradient gets the extra term (lambd / m) * W, the derivative
    # of the L2 penalty; bias gradients are not regularized.
    dW3 = 1./m * np.dot(dZ3, A2.T) + (lambd/m)*W3
    db3 = 1./m * np.sum(dZ3, axis=1, keepdims = True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1./m * np.dot(dZ2, A1.T) + (lambd/m)*W2
    db2 = 1./m * np.sum(dZ2, axis=1, keepdims = True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1./m * np.dot(dZ1, X.T) + (lambd/m)*W1
    db1 = 1./m * np.sum(dZ1, axis=1, keepdims = True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,"dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients


# Build with L2 regularization model:
parameters = model(train_X, train_Y, lambd = 0.7)
print ("On the train set:")
predictions_train = predict(train_X, train_Y, parameters)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)
plt.title("Model with L2-regularization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

Observations:

The value of λ is a hyperparameter that you can tune using a dev set.

L2 regularization makes your decision boundary smoother. If λ is too large, it is also possible to "oversmooth", resulting in a model with high bias.

What is L2-regularization actually doing?:

L2-regularization relies on the assumption that a model with small weights is simpler than a model with large weights. Thus, by penalizing the square values of the weights in the cost function you drive all the weights to smaller values. It becomes too costly for the cost to have large weights! This leads to a smoother model in which the output changes more slowly as the input changes.

**What you should remember** -- the implications of L2-regularization on:
- The cost computation: a regularization term is added to the cost.
- The backpropagation function: there are extra terms in the gradients with respect to the weight matrices.
- The weights ("weight decay"): weights are pushed to smaller values.

5. Build the model with Dropout

Dropout is a widely used regularization technique that is specific to deep learning. It randomly shuts down some neurons in each iteration.

You shut down each neuron of a layer with probability 1 - keep_prob, or keep it with probability keep_prob. The dropped neurons don't contribute to the training in either the forward or the backward propagation of that iteration. When you shut some neurons down, you actually modify your model: at each iteration you train a different model that uses only a subset of your neurons.

5.1 Forward propagation with dropout:

We apply dropout only to the hidden layers; it is not applied to the input layer or the output layer.

def forward_propagation_with_dropout(X, parameters, keep_prob = 0.5):
    """
    Implements the forward propagation: LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.

    Arguments:
    X -- input dataset, of shape (2, number of examples)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                    W1 -- weight matrix of shape (20, 2)
                    b1 -- bias vector of shape (20, 1)
                    W2 -- weight matrix of shape (3, 20)
                    b2 -- bias vector of shape (3, 1)
                    W3 -- weight matrix of shape (1, 3)
                    b3 -- bias vector of shape (1, 1)
    keep_prob - probability of keeping a neuron active during drop-out, scalar

    Returns:
    A3 -- last activation value, output of the forward propagation, of shape (1, number of examples)
    cache -- tuple (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3),
             information stored for computing the backward propagation
    """

    # NOTE: re-seeding on every call makes each call draw the same dropout masks.
    np.random.seed(1)

    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    D1 = np.random.rand(A1.shape[0],A1.shape[1])     # Step 1: initialize matrix D1 = np.random.rand(..., ...)
    D1 = (D1 < keep_prob)                            # Step 2: convert entries of D1 to 0/1 (keep_prob as the threshold)
    A1 = A1*D1                                       # Step 3: shut down some neurons of A1
    A1 = A1/keep_prob                                # Step 4: scale the kept neurons (inverted dropout)

    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    D2 = np.random.rand(A2.shape[0],A2.shape[1])     # Step 1: initialize matrix D2 = np.random.rand(..., ...)
    D2 = (D2 < keep_prob)                            # Step 2: convert entries of D2 to 0/1 (keep_prob as the threshold)
    A2 = A2*D2                                       # Step 3: shut down some neurons of A2
    A2 = A2/keep_prob                                # Step 4: scale the kept neurons (inverted dropout)

    # No dropout on the output layer.
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    # The masks D1 and D2 are cached so the backward pass can shut down the same neurons.
    cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)

    return A3, cache

5.2 Backward propagation with dropout:

def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """
    Backward pass of the three-layer baseline network trained with dropout.

    Arguments:
    X -- input dataset, of shape (2, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    cache -- cache output from forward_propagation_with_dropout()
    keep_prob - probability of keeping a neuron active during drop-out, scalar

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """

    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache
    inv_m = 1./X.shape[1]

    # Output layer: no dropout was applied here.
    dZ3 = A3 - Y
    dW3 = inv_m * np.dot(dZ3, A2.T)
    db3 = inv_m * np.sum(dZ3, axis=1, keepdims = True)

    # Second hidden layer: apply the cached mask D2, then rescale by keep_prob.
    dA2 = (np.dot(W3.T, dZ3) * D2) / keep_prob
    relu_grad_2 = np.int64(A2 > 0)
    dZ2 = np.multiply(dA2, relu_grad_2)
    dW2 = inv_m * np.dot(dZ2, A1.T)
    db2 = inv_m * np.sum(dZ2, axis=1, keepdims = True)

    # First hidden layer: apply the cached mask D1, then rescale by keep_prob.
    dA1 = (np.dot(W2.T, dZ2) * D1) / keep_prob
    relu_grad_1 = np.int64(A1 > 0)
    dZ1 = np.multiply(dA1, relu_grad_1)
    dW1 = inv_m * np.dot(dZ1, X.T)
    db1 = inv_m * np.sum(dZ1, axis=1, keepdims = True)

    gradients = {
        "dZ3": dZ3, "dW3": dW3, "db3": db3,
        "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
        "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1,
    }

    return gradients

5.3 Build model with dropout :

# Train the three-layer network with dropout: every hidden unit is kept with probability 0.86.
parameters = model(train_X, train_Y, learning_rate = 0.3, keep_prob = 0.86)

# predict() prints the accuracy itself; the labels announce which split it refers to.
print ("On the train set:")
predictions_train = predict(train_X, train_Y, parameters)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)

# Title and axis limits are set on the current axes before the boundary is drawn.
plt.title("Model with dropout")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])


def _dropout_boundary(grid_points):
    # The grid arrives with one point per row; predict_dec wants one point per column.
    return predict_dec(parameters, grid_points.T)


plot_decision_boundary(_dropout_boundary, train_X, train_Y)

6. Gradient Checking:

双边误差（中心差分）公式：$f'(\theta) \approx \dfrac{f(\theta+\varepsilon) - f(\theta-\varepsilon)}{2\varepsilon}$，其中 $\varepsilon$ 为趋近于 0 的极小正数。

# forward_propagation function
def forward_propagation(x, theta):
    """
    Implement the linear forward propagation (compute J), J(theta) = theta * x

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well

    Returns:
    J -- the value of function J, computed using the formula J(theta) = theta * x
    """

    J = np.multiply(x,theta)

    return J


# backward_propagation function:

def backward_propagation(x, theta):
    """
    Computes the derivative of J with respect to theta.

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well

    Returns:
    dtheta -- the gradient of the cost with respect to theta
    """

    # d(theta * x) / d(theta) = x
    dtheta = x

    return dtheta


# gradient_check function:
def gradient_check(x, theta, epsilon = 1e-7):
    """
    Compare the analytic gradient from backward_propagation() with a
    two-sided finite-difference approximation and print the verdict.

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well
    epsilon -- tiny shift to the input to compute the approximated gradient

    Returns:
    difference -- relative difference ||grad - gradapprox|| / (||grad|| + ||gradapprox||)
                  between the approximated gradient and the backward propagation gradient
    """

    # Two-sided difference: (J(theta + eps) - J(theta - eps)) / (2 * eps).
    thetaplus = theta+epsilon
    thetaminus = theta-epsilon
    J_plus = forward_propagation(x,thetaplus)
    J_minus = forward_propagation(x,thetaminus)
    gradapprox = (J_plus-J_minus)/(2*epsilon)

    # Analytic gradient to be checked.
    grad = backward_propagation(x,theta)

    numerator = np.linalg.norm(grad-gradapprox)
    denominator = np.linalg.norm(grad)+np.linalg.norm(gradapprox)
    if denominator == 0:
        # Both gradients are exactly zero (e.g. x == 0): they agree, and 0/0
        # would otherwise give NaN and a spurious "wrong" verdict.
        difference = 0.0
    else:
        difference = numerator/denominator

    if difference < 1e-7:
        print ("The gradient is correct!")
    else:
        print ("The gradient is wrong!")

    return difference

N-dimensional gradient checking:

def forward_propagation_n(X, Y, parameters):
    """
    Implements the forward propagation LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    and computes the logistic cost.

    Arguments:
    X -- training set for m examples
    Y -- labels for m examples
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                    W1 -- weight matrix of shape (5, 4)
                    b1 -- bias vector of shape (5, 1)
                    W2 -- weight matrix of shape (3, 5)
                    b2 -- bias vector of shape (3, 1)
                    W3 -- weight matrix of shape (1, 3)
                    b3 -- bias vector of shape (1, 1)

    Returns:
    cost -- the logistic cost averaged over the m examples
    cache -- tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) used by backward_propagation_n()
    """

    # retrieve parameters
    m = X.shape[1]
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    # Cost
    logprobs = np.multiply(-np.log(A3),Y) + np.multiply(-np.log(1 - A3), 1 - Y)
    cost = 1./m * np.sum(logprobs)

    cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

    return cost, cache


def backward_propagation_n(X, Y, cache):
    """
    Implement the backward propagation matching forward_propagation_n().

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label"
    cache -- cache output from forward_propagation_n()

    Returns:
    gradients -- A dictionary with the gradients of the cost with respect to each parameter, activation and pre-activation variables.
    """

    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = 1./m * np.dot(dZ3, A2.T)
    db3 = 1./m * np.sum(dZ3, axis=1, keepdims = True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    # Fixed: a stray "* 2" factor doubled this gradient.
    dW2 = 1./m * np.dot(dZ2, A1.T)
    db2 = 1./m * np.sum(dZ2, axis=1, keepdims = True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1./m * np.dot(dZ1, X.T)
    # Fixed: the averaging factor is 1/m like every other layer (was 4/m).
    db1 = 1./m * np.sum(dZ1, axis=1, keepdims = True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
                 "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
                 "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients


def dictionary_to_vector(parameters):
    """
    Roll all our parameters dictionary into a single vector satisfying our specific required shape.

    Returns:
    theta -- column vector with W1, b1, W2, b2, W3, b3 flattened and stacked in that order
    keys -- list with, for every entry of theta, the name of the parameter it came from
    """
    keys = []
    pieces = []
    for key in ["W1", "b1", "W2", "b2", "W3", "b3"]:

        # flatten parameter
        new_vector = np.reshape(parameters[key], (-1,1))
        keys = keys + [key]*new_vector.shape[0]
        pieces.append(new_vector)

    theta = np.concatenate(pieces, axis=0)

    return theta, keys


def vector_to_dictionary(theta):
    """
    Unroll all our parameters dictionary from a single vector satisfying our specific required shape.

    The slice boundaries are hard-coded for the 4-5-3-1 network used in this
    check (47 parameters in total).
    """
    parameters = {}
    parameters["W1"] = theta[:20].reshape((5,4))
    parameters["b1"] = theta[20:25].reshape((5,1))
    parameters["W2"] = theta[25:40].reshape((3,5))
    parameters["b2"] = theta[40:43].reshape((3,1))
    parameters["W3"] = theta[43:46].reshape((1,3))
    parameters["b3"] = theta[46:47].reshape((1,1))

    return parameters


def gradients_to_vector(gradients):
    """
    Roll all our gradients dictionary into a single vector satisfying our specific required shape.

    Returns:
    theta -- column vector with dW1, db1, dW2, db2, dW3, db3 flattened and stacked in that order
    """

    pieces = []
    for key in ["dW1", "db1", "dW2", "db2", "dW3", "db3"]:
        # flatten parameter
        pieces.append(np.reshape(gradients[key], (-1,1)))

    theta = np.concatenate(pieces, axis=0)

    return theta


def gradient_check_n(parameters, gradients, X, Y, epsilon = 1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters.
    X -- input data
    Y -- true "label"
    epsilon -- tiny shift to the input to compute the approximated gradient

    Returns:
    difference -- relative difference ||grad - gradapprox|| / (||grad|| + ||gradapprox||)
                  between the approximated gradient and the backward propagation gradient
    """

    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox: perturb one parameter at a time
    for i in range(num_parameters):

        # Cost with parameter i shifted by +epsilon.
        # "_" is used because forward_propagation_n returns (cost, cache) and only the cost is needed.
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0]+epsilon
        J_plus[i], _ = forward_propagation_n(X,Y,vector_to_dictionary(thetaplus))

        # Cost with parameter i shifted by -epsilon.
        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0]-epsilon
        J_minus[i], _ = forward_propagation_n(X,Y,vector_to_dictionary(thetaminus))

        # Two-sided difference for parameter i
        gradapprox[i] = (J_plus[i]-J_minus[i])/(2*epsilon)

    # Compare gradapprox to backward propagation gradients by computing difference.
    numerator = np.linalg.norm(grad-gradapprox)
    denominator = np.linalg.norm(grad)+np.linalg.norm(gradapprox)
    # Two all-zero gradient vectors agree exactly; avoid the 0/0 -> NaN result.
    difference = numerator/denominator if denominator != 0 else 0.0

    if difference > 1e-7:
        print ("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print ("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference

转载于:https://www.cnblogs.com/easy-wang/p/10073872.html

你可能感兴趣的文章