Full-matrix approach to backpropagation in an artificial neural network

I have recently been learning about artificial neural networks (ANN) and have written working Python code for mini-batch training. I followed Michael Nielsen's book Neural Networks and Deep Learning, which explains each algorithm step by step for beginners. It also comes with fully working code for handwritten digit recognition, which runs fine for me as well.

However, I am trying to optimize the code a bit by pushing the entire mini-batch through backpropagation together, in matrix form. I have developed working code for this as well, but it runs very slowly. Is there a way to implement a fully matrix-based approach to mini-batch learning of the network with the backpropagation algorithm?
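To make the intended setup concrete: the idea in my code below is to pack the whole mini-batch into block-diagonal matrices (inputs, weights and biases alike), so that a single np.dot per layer pushes every sample through at once. Roughly like this (a toy-sized sketch; the sizes and the names W, b, W_big, b_big are for illustration only):

import numpy as np
from scipy.linalg import block_diag

# Toy mini-batch of 3 inputs, each a (784, 1) column vector as in the book's MNIST code
xs = [np.random.randn(784, 1) for _ in xrange(3)]
features = block_diag(*xs)              # shape (3*784, 3)

# Replicate one layer's weights and biases 3 times along the diagonal to match
W = np.random.randn(30, 784)            # a 784 -> 30 layer
b = np.random.randn(30, 1)
W_big = block_diag(*[W] * 3)            # shape (3*30, 3*784)
b_big = block_diag(*[b] * 3)            # shape (3*30, 3)

# One np.dot now pushes all 3 samples through the layer at once
z = np.dot(W_big, features) + b_big     # shape (3*30, 3)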

import numpy as np 
import pandas as pd 

class Network:

    def __init__(self, sizes):
        self.layers = len(sizes)
        self.sizes = sizes

        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for y, x in zip(sizes[1:], sizes[:-1])]

    def feed_forward(self, a):
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a) + b)
        return a

    # Calculate the cost derivative (gradient of C w.r.t. 'a' - Nabla C(a))
    def cost_derivative(self, output_activation, y):
        return (output_activation - y)


    def update_mini_batch(self, mini_batch, eta):

        from scipy.linalg import block_diag

        n = len(mini_batch)

        # Pack all inputs and targets of the mini-batch into block-diagonal matrices
        xs = [x for x, y in mini_batch]
        features = block_diag(*xs)

        ys = [y for x, y in mini_batch]
        responses = block_diag(*ys)

        # Replicate each layer's weight matrix n times along the diagonal
        ws = [a for a in self.weights for i in xrange(n)]

        new_list = []
        k = 0
        while k < len(ws):
            new_list.append(ws[k: k + n])
            k += n

        weights = [block_diag(*elems) for elems in new_list]

        # Replicate each layer's bias vector n times along the diagonal
        bs = [b for b in self.biases for i in xrange(n)]

        new_list2 = []
        j = 0
        while j < len(bs):
            new_list2.append(bs[j: j + n])
            j += n

        biases = [block_diag(*elems) for elems in new_list2]

        # Only the shapes of these matter: they size the gradient accumulators below
        biases_dim_1 = [np.dot(np.ones((n*b.shape[0], b.shape[0])), b) for b in self.biases]
        biases_dim_2 = [np.dot(b, np.ones((b.shape[1], n*b.shape[1]))) for b in biases_dim_1]
        weights_dim_1 = [np.dot(np.ones((n*w.shape[0], w.shape[0])), w) for w in self.weights]
        weights_dim_2 = [np.dot(w, np.ones((w.shape[1], n*w.shape[1]))) for w in weights_dim_1]

        nabla_b = [np.zeros(b.shape) for b in biases_dim_2]
        nabla_w = [np.zeros(w.shape) for w in weights_dim_2]

        delta_b = [np.zeros(b.shape) for b in self.biases]
        delta_w = [np.zeros(w.shape) for w in self.weights]

        # Forward pass over the whole mini-batch at once
        zs = []
        activation = features
        activations = [features]

        for w, b in zip(weights, biases):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Backward pass
        delta = self.cost_derivative(activations[-1], responses) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        for l in xrange(2, self.layers):
            z = zs[-l]                                                             # the weighted input for that layer
            activation_prime = sigmoid_prime(z)                                    # the derivative of the activation for that layer
            delta = np.dot(weights[-l + 1].transpose(), delta) * activation_prime  # the adjustment term (delta) for that layer
            nabla_b[-l] = delta                                                    # the bias adjustments (eq. BP3)
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())             # the weight adjustments (eq. BP4)

        # Collapse the per-sample blocks back into per-layer gradients and update
        delta_b = [self.split_cases(b, n) for b in nabla_b]
        delta_w = [self.split_cases(w, n) for w in nabla_w]

        self.weights = [w - (eta/n) * nw for w, nw in zip(self.weights, delta_w)]
        self.biases = [b - (eta/n) * nb for b, nb in zip(self.biases, delta_b)]



    # Walk down the block diagonal and sum the per-sample blocks into one (dim1 x dim2) gradient
    def split_cases(self, mat, mini_batch_size):
        i = 0
        j = 0
        dim1 = mat.shape[0]/mini_batch_size
        dim2 = mat.shape[1]/mini_batch_size
        sum_samples = np.zeros((dim1, dim2))
        while i < len(mat):
            sum_samples = sum_samples + mat[i: i + dim1, j: j + dim2]
            i += dim1
            j += dim2

        return sum_samples

    """Stochastic Gradient Descent for training in epochs""" 
    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None): 

     n = len(training_data) 

     if test_data: 
      n_test = len(test_data) 

     for j in xrange(epochs): 
      np.random.shuffle(training_data)                 # for each epochs the mini-batches are selected randomly 
      mini_batches = [training_data[k: k+mini_batch_size] for k in xrange(0, n, mini_batch_size)]  # select equal sizes of mini-batches for the epochs (last mini_batch size might differ however) 

      c = 1 

      for mini_batch in mini_batches: 
       print "Updating mini-batch {0}".format(c) 
       self.update_mini_batch(mini_batch, eta) 
       c += 1 
      if test_data: 
       print "Epoch {0}: {1}/{2}".format(j, self.evaluate(test_data), n_test) 

      else: 
       print "Epoch {0} completed.".format(j) 

    def evaluate(self, test_data): 
     test_results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data] 
     return (sum(int(x == y) for x, y in test_results)) 

    def export_results(self, test_data): 
     results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data] 
     k = pd.DataFrame(results) 
     k.to_csv('net_results.csv') 


# Global functions 

## Activation function (sigmoid) 
@np.vectorize 
def sigmoid(z): 
    return 1.0/(1.0 + np.exp(-z)) 

## Activation derivative (sigmoid_prime) 
@np.vectorize 
def sigmoid_prime(z): 
    return sigmoid(z)*(1 - sigmoid(z)) 

Answer


Here is my code. On my machine, the time for 30 epochs drops from 800+ seconds to 200+ seconds.

Since I am new to Python, I use whatever is readily available. This snippet only needs numpy to run.

Give it a try.

def feedforward2(self, a):
    zs = []
    activations = [a]

    activation = a
    for b, w in zip(self.biases, self.weights):
        z = np.dot(w, activation) + b
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)

    return (zs, activations)

def update_mini_batch2(self, mini_batch, eta):
    batch_size = len(mini_batch)

    # transform to (input x batch_size) matrix
    x = np.asarray([_x.ravel() for _x, _y in mini_batch]).transpose()
    # transform to (output x batch_size) matrix
    y = np.asarray([_y.ravel() for _x, _y in mini_batch]).transpose()

    nabla_b, nabla_w = self.backprop2(x, y)
    self.weights = [w - (eta/batch_size) * nw for w, nw in zip(self.weights, nabla_w)]
    self.biases = [b - (eta/batch_size) * nb for b, nb in zip(self.biases, nabla_b)]

    return

def backprop2(self, x, y):

    nabla_b = [0 for i in self.biases]
    nabla_w = [0 for i in self.weights]

    # feedforward
    zs, activations = self.feedforward2(x)

    # backward pass
    delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
    nabla_b[-1] = delta.sum(1).reshape([len(delta), 1])  # sum over the batch, reshape to (n x 1)
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())

    for l in xrange(2, self.num_layers):
        z = zs[-l]
        sp = sigmoid_prime(z)
        delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
        nabla_b[-l] = delta.sum(1).reshape([len(delta), 1])  # sum over the batch, reshape to (n x 1)
        nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())

    return (nabla_b, nabla_w)
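The key step is the reshaping at the top of update_mini_batch2: the mini-batch of column vectors becomes one matrix with one column per sample, so the whole batch goes through feedforward2 with a single np.dot per layer while the bias vectors broadcast across the columns. A small self-contained check of that transformation (toy shapes, not from the original post; it also assumes your class defines self.num_layers = len(sizes), which the question's class calls self.layers):

import numpy as np

# Toy mini-batch of 4 samples shaped like the book's MNIST data:
# x is a (784, 1) input column, y a (10, 1) one-hot label column.
mini_batch = [(np.random.randn(784, 1), np.random.randn(10, 1)) for _ in xrange(4)]

# Same transformation as in update_mini_batch2:
x = np.asarray([_x.ravel() for _x, _y in mini_batch]).transpose()
y = np.asarray([_y.ravel() for _x, _y in mini_batch]).transpose()

print x.shape   # (784, 4) - one column per sample
print y.shape   # (10, 4)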