2017-02-18

TensorFlow XLA with multiple GPUs does not use the GPUs at the same time

I am trying to use XLA on a multi-GPU machine. But when I turn on the XLA JIT, TensorFlow does not use the GPUs at the same time.

When XLA is on, gpu0 and gpu1 are active only alternately.

(screenshots: GPU utilization with XLA on)

When XLA is off, gpu0 and gpu1 are both active at the same time.

(screenshot: GPU utilization with XLA off)

What is happening in my environment?
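For reference, the only thing I change to switch XLA on or off is the global JIT level on the session config (the same line is commented out in the full code below):

import tensorflow as tf

config_proto = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
# With the next line enabled (XLA JIT on), gpu0 and gpu1 alternate;
# with it commented out (XLA off), both GPUs are active at the same time.
config_proto.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
sess = tf.Session(config=config_proto)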

My code:

import tensorflow as tf 
from pathlib import Path 
import time 

INPUT_SIZE = 64 
INPUT_CHANNELS = 1 
MINIBATCH_SIZE = 32 
NUM_ITERATIONS = 200000 
NUM_GPU = 2 

def read_op(filename_queue, reader): 
    _, raw = reader.read(filename_queue) 

    read_image = tf.image.decode_jpeg(
     raw, channels=INPUT_CHANNELS) 
    read_image = tf.to_float(read_image)/255. 
    read_image = tf.image.resize_images(read_image, [INPUT_SIZE, INPUT_SIZE]) 
    return read_image 

def inference(image, log_suffix): 
    # autoencoder model for multi GPU testing 
    # this model has no particular meaning 
    def w_init(initial_weight=1e-3): 
     return tf.truncated_normal_initializer(stddev=initial_weight) 

    def make_conv(x, out_ch, stride=[1,1,1,1]): 
     shape = x.get_shape().as_list() 

     with tf.device('/cpu:0'): 
      conv_w = tf.get_variable(initializer=w_init(), name='weight', 
       shape=[7, 7, shape[3], out_ch]) 

     conv = tf.nn.conv2d(x, conv_w, stride, padding='SAME') 
     mean, var = tf.nn.moments(conv, [0]) 
     conv = tf.nn.batch_normalization(conv, mean, var, None, None, 1e-9) 

     return tf.nn.relu(conv) 

    def make_deconv(x, out_shape, bn=True): 
     shape = x.get_shape().as_list() 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(), name='weight', 
       shape=[7, 7, out_shape[3], shape[3]]) 

     deconv = tf.nn.conv2d_transpose(x, w, out_shape, [1,2,2,1]) 
     mean, var = tf.nn.moments(deconv, [0]) 

     if bn: deconv = tf.nn.batch_normalization(deconv, mean, var, None, None, 1e-9) 

     return tf.nn.relu(deconv) 

    def make_deconv_same(x, out_shape, activate=tf.nn.relu, bn=True, scale=1e-3): 
     shape = x.get_shape().as_list() 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(), name='weight', 
       shape=[7, 7, out_shape[3], shape[3]]) 

     deconv = tf.nn.conv2d_transpose(x, w, out_shape, [1,1,1,1]) 
     mean, var = tf.nn.moments(deconv, [0]) 

     if bn: deconv = tf.nn.batch_normalization(deconv, mean, var, None, None, 1e-9) 

     return activate(deconv) 

    with tf.variable_scope('conv1'): 
     conv1 = make_conv(image, 128) 
    with tf.variable_scope('conv2'): 
     conv2 = make_conv(conv1, 128) 
    with tf.variable_scope('conv3'): 
     conv3 = make_conv(conv2, 160, stride=[1,2,2,1]) 
    with tf.variable_scope('conv4'): 
     conv4 = make_conv(conv3, 160) 
    with tf.variable_scope('conv5'): 
     conv5 = make_conv(conv4, 192, stride=[1,2,2,1]) 
    with tf.variable_scope('conv6'): 
     conv6 = make_conv(conv5, 192) 
    with tf.variable_scope('conv7'): 
     conv7 = make_conv(conv6, 256, stride=[1,2,2,1]) 
    with tf.variable_scope('conv8'): 
     conv8 = make_conv(conv7, 256) 
    with tf.variable_scope('linear1'): 
     feature_length = 300 
     shape = conv8.get_shape().as_list() 
     vec_length = shape[1] * shape[2] * shape[3] 
     in_vec = tf.reshape(conv8,[-1, vec_length]) 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(1e-2), name='weight', 
       shape=[vec_length, feature_length]) 
      b = tf.get_variable(initializer=w_init(1e-2), name='bias', 
       shape=[feature_length]) 

     linear1 = tf.matmul(in_vec, w) + b 
     mean, var = tf.nn.moments(linear1, [0]) 
     linear1 = tf.nn.batch_normalization(linear1, mean, var, None, None, 1e-9) 
     linear1 = tf.nn.sigmoid(linear1) 
    with tf.variable_scope('linear2'): 
     in_shape = linear1.get_shape().as_list() 
     in_length = in_shape[1] 
     out_shape = conv8.get_shape().as_list() 
     out_length = out_shape[1] * out_shape[2] * out_shape[3] 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(1e-2), name='weight', 
       shape=[in_length, out_length]) 
      b = tf.get_variable(initializer=w_init(1e-2), name='bias', 
       shape=[out_length]) 

     linear2 = tf.matmul(linear1, w) + b 
     mean, var = tf.nn.moments(linear2, [0]) 
     linear2 = tf.nn.batch_normalization(linear2, mean, var, None, None, 1e-9) 
     linear2 = tf.nn.sigmoid(linear2) 
     linear2 = tf.reshape(linear2, out_shape) 
    with tf.variable_scope('deconv1'): 
     deconv1 = make_deconv_same(linear2, conv7.get_shape()) 
    with tf.variable_scope('deconv2'): 
     deconv2 = make_deconv  (deconv1, conv6.get_shape()) 
    with tf.variable_scope('deconv3'): 
     deconv3 = make_deconv_same(deconv2, conv5.get_shape()) 
    with tf.variable_scope('deconv4'): 
     deconv4 = make_deconv  (deconv3, conv4.get_shape()) 
    with tf.variable_scope('deconv5'): 
     deconv5 = make_deconv_same(deconv4, conv3.get_shape()) 
    with tf.variable_scope('deconv6'): 
     deconv6 = make_deconv  (deconv5, conv2.get_shape()) 
    with tf.variable_scope('deconv7'): 
     deconv7 = make_deconv_same(deconv6, conv1.get_shape()) 
    with tf.variable_scope('deconv8'): 
     deconv8 = make_deconv_same(deconv7, image.get_shape(), bn=False, scale=1e-1) 

    with tf.device('/cpu:0'): 
     image_log = tf.summary.image('output'+log_suffix, deconv8, collections=['image_log']) 
     image_log = tf.summary.image('input'+log_suffix, image, collections=['image_log']) 

    return deconv8 

def loss(label, out, global_step, log_suffix): 
    with tf.name_scope('loss'): 
     l = tf.squared_difference(label, out) 

     # for tensorboard Logarithmic graph mode 
     lv = tf.reduce_mean(l) * 1e+7 

     with tf.device('/cpu:0'): 
      loss_log = tf.summary.scalar('loss'+log_suffix,lv) 

    return l 

def average_gradients(tower_grads): 
    with tf.name_scope('average_gradients'): 
     average_grads = [] 

     for grad_and_vars in zip(*tower_grads): 
      grads = [] 

      for g, u in grad_and_vars: 
       expanded_g = tf.expand_dims(g,0) 
       grads.append(expanded_g) 

      grad = tf.concat(grads, axis=0) 
      grad = tf.reduce_mean(grad,0) 

      v = grad_and_vars[0][1] 
      grad_and_var = (grad, v) 
      average_grads.append(grad_and_var) 

     for grad,var in average_grads: 
      with tf.device('/cpu:0'): 
       tf.summary.histogram('grads/'+var.name, grad, collections=['grads']) 

    return average_grads 

def main(): 
    global NUM_GPU, MINIBATCH_SIZE 

    # many jpeg images 
    sample_dir = Path('./training_samples') 
    file_list = [p for p in sample_dir.iterdir() if p.suffix == '.jpg'] 
    file_list = list(map(str, file_list)) 

    with tf.Graph().as_default(), tf.device('/cpu:0'): 
     config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False) 
     # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
     # if XLA is turned on, the problem occurs 
     # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
     #config_proto.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 
     sess = tf.Session(config=config_proto) 

     global_step = tf.get_variable(
      'global_step', [], initializer=tf.constant_initializer(0), trainable=False) 

     with tf.variable_scope('optimizer'): 
      opt = tf.train.AdamOptimizer(1e-6) 

     with tf.variable_scope('input'): 
      filename_queue = tf.train.string_input_producer(file_list) 
      reader = tf.WholeFileReader() 
      images_list = [ 
       tf.train.shuffle_batch(
        [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8), 
       tf.train.shuffle_batch(
        [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8)] 

     tower_grads = [] 
     reuse = False 
     for i in range(NUM_GPU): 
      with tf.device('/gpu:{}'.format(i)): 
       with tf.variable_scope('model', reuse=reuse, caching_device='/gpu:{}'.format(i)): 
        infer = inference(images_list[i], '/tower_{}'.format(i)) 
        reuse = True 
        tower_loss = loss(images_list[i], infer, global_step, '/tower_{}'.format(i)) 

       grads = opt.compute_gradients(tower_loss) 
       tower_grads.append(grads) 

     grads = average_gradients(tower_grads) 
     train_op = opt.apply_gradients(grads, global_step=global_step) 

     image_log_op = tf.summary.merge(tf.get_collection('image_log')) 
     loss_log_op = tf.summary.merge_all() 
     grads_log_op = tf.summary.merge(tf.get_collection('grads')) 

     writer = tf.summary.FileWriter('logs') 
     sess.run(tf.global_variables_initializer()) 
     writer.add_graph(tf.get_default_graph()) 
     coordinator = tf.train.Coordinator() 

     threads = tf.train.start_queue_runners(sess=sess, coord=coordinator) 

     for i in range(NUM_ITERATIONS): 
      print('iteration: ',i) 

      start = time.time() 

      if i % 2 == 0: 
       _, loss_log, image_log = sess.run([train_op, loss_log_op, image_log_op]) 
       writer.add_summary(loss_log, i) 
       writer.add_summary(image_log, i) 
       writer.flush() 
      else: 
       _ = sess.run([train_op]) 

      end = time.time() 

      print('time = {}'.format(end - start)) 

     writer.close() 

if __name__ == '__main__': 
    main() 
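To check which device each op is actually assigned to while XLA is on, I can also turn on device placement logging; this only changes the log_device_placement flag that is already in the ConfigProto above:

import tensorflow as tf

# Same session config as above, but with per-op device placement logging enabled,
# so the runtime logs which ops land on /gpu:0 and which on /gpu:1.
config_proto = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
sess = tf.Session(config=config_proto)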

Environment info

Operating system: Ubuntu 16.04
GPU: GTX 1080 x2
Configure option (gcc): -march=native -O3
Configure option (CUDA compute capability): 6.1

Installed versions of CUDA and cuDNN:

/usr/local/cuda/lib64/libcudadevrt.a 
/usr/local/cuda/lib64/libcudart.so -> libcudart.so.8.0 
/usr/local/cuda/lib64/libcudart.so.8.0 -> libcudart.so.8.0.44 
/usr/local/cuda/lib64/libcudart.so.8.0.44 
/usr/local/cuda/lib64/libcudart_static.a 
/usr/local/cuda/lib64/libcudnn.so -> libcudnn.so.5.1.5 
/usr/local/cuda/lib64/libcudnn.so.5 -> libcudnn.so.5.1.5 
/usr/local/cuda/lib64/libcudnn.so.5.1.5 
/usr/local/cuda/lib64/libcudnn_static.a 

The TensorFlow commit hash: c56c873fbaf976d26d487ad57c8efbc87f05331c

The output of bazel version:

....... 
Build label: 0.4.4 
Build target: bazel-out/local-fastbuild/bin/src/main/java/com/google/devtools/build/lib/bazel/BazelServer_deploy.jar 
Build time: Wed Feb 1 18:54:21 2017 (1485975261) 
Build timestamp: 1485975261 
Build timestamp as int: 1485975261 

Answer


At the moment, XLA is single-GPU only.
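A possible workaround (untested here, and assuming the experimental contrib API tf.contrib.compiler.jit.experimental_jit_scope is available in this TF 1.x build) would be to leave global_jit_level off and mark each tower's subgraph for XLA compilation explicitly, so compilation stays local to the device the tower is placed on. A sketch of how the GPU loop in main() above might look:

import tensorflow as tf
from tensorflow.contrib.compiler import jit  # experimental contrib module; API may change

# Sketch only: per-tower JIT scope instead of the global JIT level in the ConfigProto.
# NUM_GPU, images_list, inference, loss, opt, global_step, reuse and tower_grads
# are the same names as in the question's code above.
for i in range(NUM_GPU):
    with tf.device('/gpu:{}'.format(i)):
        with tf.variable_scope('model', reuse=reuse, caching_device='/gpu:{}'.format(i)):
            with jit.experimental_jit_scope():
                infer = inference(images_list[i], '/tower_{}'.format(i))
                tower_loss = loss(images_list[i], infer, global_step, '/tower_{}'.format(i))
            reuse = True

        grads = opt.compute_gradients(tower_loss)
        tower_grads.append(grads)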


OK, I understand. Thank you very much. – Yusuke