TensorFlow CNN loss increases rapidly to NaN

I am trying to train a regressor model that can predict 4 scalar float outputs. As it currently stands, the network diverges very quickly, with the loss increasing to NaN. I cannot figure out what is going on.

Below is an example, running with TensorFlow 1.1.0 on Windows 10 with an NVidia GPU.

from __future__ import absolute_import 
from __future__ import division 
from __future__ import print_function 

import numpy 
import tensorflow as tf 

IMAGE_HEIGHT = 320 
IMAGE_WIDTH = 160 
NUM_CHANNELS = 3 

PIXEL_DEPTH = 255 
SEED = 66479 # Set to None for random seed. 
BATCH_SIZE=5 
NUM_OUTPUTS = 4 # the four outputs 

def data_type(): 
    return tf.float32 

# The variables below hold all the trainable weights. They are passed an 
# initial value which will be assigned when we call: 
# {tf.global_variables_initializer().run()} 
conv1_weights = tf.Variable(
    tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32. 
         stddev=0.1, 
         seed=SEED, dtype=data_type())) 
conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) 
conv2_weights = tf.Variable(tf.truncated_normal(
    [5, 5, 32, 64], stddev=0.1, 
    seed=SEED, dtype=data_type())) 
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type())) 
fc1_weights = tf.Variable( # fully connected, depth 512. 
    tf.truncated_normal([IMAGE_HEIGHT // 4 * IMAGE_WIDTH // 4 * 64, 512], 
         stddev=0.1, 
         seed=SEED, 
         dtype=data_type())) 
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type())) 
fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_OUTPUTS], 
              stddev=0.1, 
              seed=SEED, 
              dtype=data_type())) 
fc2_biases = tf.Variable(tf.constant(
    0.1, shape=[NUM_OUTPUTS], dtype=data_type())) 


# We will replicate the model structure for the training subgraph, as well 
# as the evaluation subgraphs, while sharing the trainable parameters. 
def model(data, train=False): 
    """The Model definition.""" 
    # 2D convolution, with 'SAME' padding (i.e. the output feature map has 
    # the same size as the input). Note that {strides} is a 4D array whose 
    # shape matches the data layout: [image index, y, x, depth]. 
    conv = tf.nn.conv2d(data, 
         conv1_weights, 
         strides=[1, 1, 1, 1], 
         padding='SAME') 
    # Bias and rectified linear non-linearity. 
    relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases)) 
    # Max pooling. The kernel size spec {ksize} also follows the layout of 
    # the data. Here we have a pooling window of 2, and a stride of 2. 
    pool = tf.nn.max_pool(relu, 
          ksize=[1, 2, 2, 1], 
          strides=[1, 2, 2, 1], 
          padding='SAME') 
    conv = tf.nn.conv2d(pool, 
         conv2_weights, 
         strides=[1, 1, 1, 1], 
         padding='SAME') 
    relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases)) 
    pool = tf.nn.max_pool(relu, 
          ksize=[1, 2, 2, 1], 
          strides=[1, 2, 2, 1], 
          padding='SAME') 
    # Reshape the feature map cuboid into a 2D matrix to feed it to the 
    # fully connected layers. 
    pool_shape = pool.get_shape().as_list() 
    reshape = tf.reshape(
     pool, 
     [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]]) 
    # Fully connected layer. Note that the '+' operation automatically 
    # broadcasts the biases. 
    hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases) 
    # Add a 50% dropout during training only. Dropout also scales 
    # activations such that no rescaling is needed at evaluation time. 
    if train: 
     hidden = tf.nn.dropout(hidden, 0.5, seed=SEED) 
    return tf.matmul(hidden, fc2_weights) + fc2_biases 

def main(): 

    train_data_batch = tf.placeholder(tf.float32, shape=(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS)) 
    train_label_batch = tf.placeholder(tf.float32, shape=(BATCH_SIZE, NUM_OUTPUTS)) 


    with tf.name_scope('pred'): 
     train_pred = model(train_data_batch, train=True) 

    with tf.name_scope('loss'): 
     loss = tf.reduce_sum(tf.square(train_pred - train_label_batch)) 
     tf.summary.scalar('loss', loss) 


    # L2 regularization for the fully connected parameters. 
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) + 
        tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases)) 
    # Add the regularization term to the loss. 
    loss += 5e-4 * regularizers 

    optimizer = tf.train.GradientDescentOptimizer(0.01) 
    train_op = optimizer.minimize(loss) 

    with tf.Session() as sess: 
     # The op for initializing the variables. 
     init_op = tf.group(tf.global_variables_initializer(), 
          tf.local_variables_initializer()) 

     sess.run(init_op) 

     while True: 
        predictions, l, _ = sess.run(
            [train_pred, loss, train_op],
            feed_dict={
                train_data_batch: numpy.zeros([BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS]) + 0.2,
                train_label_batch: numpy.zeros([BATCH_SIZE, 4])})

        print(l)

if __name__ == "__main__": 
    main() 

Output:

9031.0 
5.6838e+22 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
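
The loss jumps from 9031 to 5.7e+22 in a single step before turning NaN, which points at the updates themselves blowing up rather than at the input data. Below is a minimal diagnostic sketch (not part of the original script; it reuses the optimizer and loss tensors built in main()) that fetches the global gradient norm alongside the loss and fails fast once the loss becomes non-finite:

# Hypothetical diagnostic additions, placed in main() after train_op is built.
grads_and_vars = optimizer.compute_gradients(loss)
grad_norm = tf.global_norm([g for g, v in grads_and_vars if g is not None])
checked_loss = tf.check_numerics(loss, "loss is NaN or Inf")

# In the training loop, fetch the extra tensors alongside train_op, e.g.:
# l, gn, _ = sess.run([checked_loss, grad_norm, train_op], feed_dict={...})
# print(l, gn)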

You might also find [this thread](https://stackoverflow.com/q/33962226/1714410) relevant. It is tagged [caffe], but it may be relevant for other deep learning tools such as [tensorflow]. – Shai

Answer


It seems that my model was diverging. I solved this by switching to an AdamOptimizer:

optimizer = tf.train.AdamOptimizer(0.5) 

This adaptively sets per-parameter step sizes for a momentum-based optimizer, so the update size stays bounded even when the raw gradients become very large.
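
In the question's main(), that amounts to replacing only the optimizer line. A minimal sketch of the modified training setup, reusing train_pred, train_label_batch, and regularizers exactly as defined above and keeping the 0.5 learning rate reported here:

# Loss and regularization are unchanged from the question's main().
loss = tf.reduce_sum(tf.square(train_pred - train_label_batch))
loss += 5e-4 * regularizers

# Only this line changes: Adam maintains per-parameter adaptive step sizes
# (plus momentum terms), so the raw gradient magnitude no longer sets the
# update size directly the way it does for plain gradient descent.
optimizer = tf.train.AdamOptimizer(0.5)
train_op = optimizer.minimize(loss)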
