Ich lerne künstliche neuronale Netzwerk (ANN) vor kurzem und haben einen Code arbeiten und laufen in Python für die gleiche basierend auf Mini-Batch-Training. Ich folgte dem Buch von Michael Nilson's Neural Networks and Deep Learning, wo es Schritt für Schritt Erklärung jedes Algorithmus für die Anfänger gibt. Es gibt auch einen voll funktionsfähigen Code für die handschriftliche Ziffernerkennung, der auch für mich funktioniert.Full-Matrix-Ansatz zur Backpropagation in künstlichen neuronalen Netzwerk
Ich versuche jedoch, den Code etwas zu optimieren, indem ich die gesamte Minibatch zusammengib, um durch Backpropagation in der Matrixform zu trainieren. Ich habe auch einen Arbeitscode dafür entwickelt, aber der Code läuft sehr langsam, wenn er ausgeführt wird. Gibt es eine Möglichkeit, einen vollständigen Matrix-basierten Ansatz für das Mini-Batch-Lernen des Netzwerks basierend auf dem Back-Propagation-Algorithmus zu implementieren?
import numpy as np
import pandas as pd
class Network:
def __init__(self, sizes):
self.layers = len(sizes)
self.sizes = sizes
self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
self.weights = [np.random.randn(y, x) for y, x in zip(sizes[1:], sizes[:-1])]
def feed_forward(self, a):
for w, b in zip(self.weights, self.biases):
a = sigmoid(np.dot(w,a) + b)
return a
# Calculate the cost derivative (Gradient of C w.r.t. 'a' - Nabla C(a))
def cost_derivative(self, output_activation, y):
return (output_activation - y)
def update_mini_batch(self, mini_batch, eta):
from scipy.linalg import block_diag
n = len(mini_batch)
xs = [x for x, y in mini_batch]
features = block_diag(*xs)
ys = [y for x, y in mini_batch]
responses = block_diag(*ys)
ws = [a for a in self.weights for i in xrange(n)]
new_list = []
k = 0
while (k < len(ws)):
new_list.append(ws[k: k + n])
k += n
weights = [block_diag(*elems) for elems in new_list]
bs = [b for b in self.biases for i in xrange(n)]
new_list2 = []
j = 0
while (j < len(bs)):
new_list2.append(bs[j : j + n])
j += n
biases = [block_diag(*elems) for elems in new_list2]
baises_dim_1 = [np.dot(np.ones((n*b.shape[0], b.shape[0])), b) for b in self.biases]
biases_dim_2 = [np.dot(b, np.ones((b.shape[1], n*b.shape[1]))) for b in baises_dim_1]
weights_dim_1 = [np.dot(np.ones((n*w.shape[0], w.shape[0])), w) for w in self.weights]
weights_dim_2 = [np.dot(w, np.ones((w.shape[1], n*w.shape[1]))) for w in weights_dim_1]
nabla_b = [np.zeros(b.shape) for b in biases_dim_2]
nabla_w = [np.zeros(w.shape) for w in weights_dim_2]
delta_b = [np.zeros(b.shape) for b in self.biases]
delta_w = [np.zeros(w.shape) for w in self.weights]
zs = []
activation = features
activations = [features]
for w, b in zip(weights, biases):
z = np.dot(w, activation) + b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
delta = self.cost_derivative(activations[-1], responses) * sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
for l in xrange(2, self.layers):
z = zs[-l] # the weighted input for that layer
activation_prime = sigmoid_prime(z) # the derivative of activation for the layer
delta = np.dot(weights[-l + 1].transpose(), delta) * activation_prime # calculate the adjustment term (delta) for that layer
nabla_b[-l] = delta # calculate the bias adjustments - by means of using eq-BP3.
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose()) # calculate the weight adjustments - by means of using eq-BP4.
delta_b = [self.split_cases(b, n) for b in nabla_b]
delta_w = [self.split_cases(w, n) for w in nabla_w]
self.weights = [w - (eta/n) * nw for w, nw in zip(self.weights, delta_w)]
self.biases = [b - (eta/ n) * nb for b, nb in zip(self.biases, delta_b)]
def split_cases(self, mat, mini_batch_size):
i = 0
j = 0
dim1 = mat.shape[0]/mini_batch_size
dim2 = mat.shape[1]/mini_batch_size
sum_samples = np.zeros((dim1, dim2))
while i < len(mat):
sum_samples = sum_samples + mat[i: i + dim1, j : j + dim2]
i += dim1
j += dim2
return sum_samples
"""Stochastic Gradient Descent for training in epochs"""
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None):
n = len(training_data)
if test_data:
n_test = len(test_data)
for j in xrange(epochs):
np.random.shuffle(training_data) # for each epochs the mini-batches are selected randomly
mini_batches = [training_data[k: k+mini_batch_size] for k in xrange(0, n, mini_batch_size)] # select equal sizes of mini-batches for the epochs (last mini_batch size might differ however)
c = 1
for mini_batch in mini_batches:
print "Updating mini-batch {0}".format(c)
self.update_mini_batch(mini_batch, eta)
c += 1
if test_data:
print "Epoch {0}: {1}/{2}".format(j, self.evaluate(test_data), n_test)
else:
print "Epoch {0} completed.".format(j)
def evaluate(self, test_data):
test_results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data]
return (sum(int(x == y) for x, y in test_results))
def export_results(self, test_data):
results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data]
k = pd.DataFrame(results)
k.to_csv('net_results.csv')
# Global functions
## Activation function (sigmoid)
@np.vectorize
def sigmoid(z):
return 1.0/(1.0 + np.exp(-z))
## Activation derivative (sigmoid_prime)
@np.vectorize
def sigmoid_prime(z):
return sigmoid(z)*(1 - sigmoid(z))