Titel: TensorFlow XLA mit mehreren GPUs nutzt die GPUs nicht gleichzeitig
Ich versuche, XLA auf einer Multi-GPU-Maschine zu verwenden. Aber wenn ich XLA JIT einschalte, nutzt TensorFlow die GPUs nicht gleichzeitig.
Wenn XLA eingeschaltet ist, sind gpu0 und gpu1 nur abwechselnd aktiv.
Wenn XLA ausgeschaltet ist, sind gpu0 und gpu1 beide gleichzeitig aktiv.
Was passiert in meiner Umgebung?
Mein Code:
import tensorflow as tf
from pathlib import Path
import time
# Side length (in pixels) that every decoded image is resized to in read_op().
INPUT_SIZE = 64
# Channel count requested from the JPEG decoder in read_op().
INPUT_CHANNELS = 1
# Batch size of each shuffle_batch input pipeline (one pipeline per GPU tower).
MINIBATCH_SIZE = 32
# Number of training steps executed by main().
NUM_ITERATIONS = 200000
# Number of GPU towers that main() builds.
NUM_GPU = 2
def read_op(filename_queue, reader):
    """Build the ops that read, decode and resize one JPEG from the queue.

    Args:
        filename_queue: queue of file names handed to ``reader.read``.
        reader: reader object whose ``read`` returns a ``(key, value)`` pair.

    Returns:
        A float image tensor, divided by 255 and resized to
        ``INPUT_SIZE x INPUT_SIZE``.
    """
    _key, jpeg_bytes = reader.read(filename_queue)
    decoded = tf.image.decode_jpeg(jpeg_bytes, channels=INPUT_CHANNELS)
    # Convert to float and divide by 255 before resizing.
    scaled = tf.to_float(decoded)/255.
    return tf.image.resize_images(scaled, [INPUT_SIZE, INPUT_SIZE])
def inference(image, log_suffix):
    """Build one tower of the convolutional autoencoder.

    The network is eight 7x7 convolutions (three of them with stride 2), two
    fully connected layers, and eight 7x7 transposed convolutions that mirror
    the encoder shapes. Every weight/bias variable is created under
    ``tf.device('/cpu:0')``; the compute ops are left on whatever device the
    caller has selected.

    Args:
        image: input batch tensor. Its static shape is indexed up to
            ``shape[3]``, so a 4-D tensor with a known channel dimension is
            assumed.
        log_suffix: string appended to the ``'output'`` and ``'input'`` image
            summary names.

    Returns:
        The output tensor of the final transposed convolution, built with the
        same static shape as ``image``.
    """
    # autoencoder model for multi GPU testing
    # this model has no particular meaning

    def w_init(initial_weight=1e-3):
        """Return a truncated-normal initializer with the given stddev."""
        return tf.truncated_normal_initializer(stddev=initial_weight)

    def batch_norm(x):
        """Normalise ``x`` with moments taken over axis 0 (no scale/offset)."""
        mean, var = tf.nn.moments(x, [0])
        return tf.nn.batch_normalization(x, mean, var, None, None, 1e-9)

    def make_conv(x, out_ch, stride=(1, 1, 1, 1)):
        """7x7 'SAME' convolution -> batch norm -> ReLU."""
        shape = x.get_shape().as_list()
        with tf.device('/cpu:0'):
            conv_w = tf.get_variable(initializer=w_init(), name='weight',
                                     shape=[7, 7, shape[3], out_ch])
        # The default is an immutable tuple; conv2d is handed a list as before.
        conv = tf.nn.conv2d(x, conv_w, list(stride), padding='SAME')
        return tf.nn.relu(batch_norm(conv))

    def transposed_conv(x, out_shape, strides, activate, bn):
        """Shared body of the two transposed-convolution helpers."""
        shape = x.get_shape().as_list()
        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(), name='weight',
                                shape=[7, 7, out_shape[3], shape[3]])
        deconv = tf.nn.conv2d_transpose(x, w, out_shape, strides)
        # Moments are only built when batch norm is actually applied.
        if bn:
            deconv = batch_norm(deconv)
        return activate(deconv)

    def make_deconv(x, out_shape, bn=True):
        """Stride-2 transposed convolution followed by ReLU."""
        return transposed_conv(x, out_shape, [1, 2, 2, 1], tf.nn.relu, bn)

    def make_deconv_same(x, out_shape, activate=tf.nn.relu, bn=True, scale=1e-3):
        """Stride-1 transposed convolution followed by ``activate``."""
        # NOTE(review): `scale` is accepted but never used; the weights are
        # always initialised with w_init()'s default stddev. Presumably it was
        # meant to be passed to w_init -- confirm before wiring it in, since
        # that would change the initialisation of 'deconv8'.
        return transposed_conv(x, out_shape, [1, 1, 1, 1], activate, bn)

    # ---- encoder ----
    with tf.variable_scope('conv1'):
        conv1 = make_conv(image, 128)
    with tf.variable_scope('conv2'):
        conv2 = make_conv(conv1, 128)
    with tf.variable_scope('conv3'):
        conv3 = make_conv(conv2, 160, stride=[1, 2, 2, 1])
    with tf.variable_scope('conv4'):
        conv4 = make_conv(conv3, 160)
    with tf.variable_scope('conv5'):
        conv5 = make_conv(conv4, 192, stride=[1, 2, 2, 1])
    with tf.variable_scope('conv6'):
        conv6 = make_conv(conv5, 192)
    with tf.variable_scope('conv7'):
        conv7 = make_conv(conv6, 256, stride=[1, 2, 2, 1])
    with tf.variable_scope('conv8'):
        conv8 = make_conv(conv7, 256)

    # ---- bottleneck: flatten -> 300 features -> back to conv8's shape ----
    with tf.variable_scope('linear1'):
        feature_length = 300
        shape = conv8.get_shape().as_list()
        vec_length = shape[1] * shape[2] * shape[3]
        in_vec = tf.reshape(conv8, [-1, vec_length])
        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(1e-2), name='weight',
                                shape=[vec_length, feature_length])
            b = tf.get_variable(initializer=w_init(1e-2), name='bias',
                                shape=[feature_length])
        linear1 = tf.nn.sigmoid(batch_norm(tf.matmul(in_vec, w) + b))
    with tf.variable_scope('linear2'):
        in_length = linear1.get_shape().as_list()[1]
        out_shape = conv8.get_shape().as_list()
        out_length = out_shape[1] * out_shape[2] * out_shape[3]
        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(1e-2), name='weight',
                                shape=[in_length, out_length])
            b = tf.get_variable(initializer=w_init(1e-2), name='bias',
                                shape=[out_length])
        linear2 = tf.nn.sigmoid(batch_norm(tf.matmul(linear1, w) + b))
        linear2 = tf.reshape(linear2, out_shape)

    # ---- decoder: each layer targets the shape of the mirrored encoder layer ----
    with tf.variable_scope('deconv1'):
        deconv1 = make_deconv_same(linear2, conv7.get_shape())
    with tf.variable_scope('deconv2'):
        deconv2 = make_deconv(deconv1, conv6.get_shape())
    with tf.variable_scope('deconv3'):
        deconv3 = make_deconv_same(deconv2, conv5.get_shape())
    with tf.variable_scope('deconv4'):
        deconv4 = make_deconv(deconv3, conv4.get_shape())
    with tf.variable_scope('deconv5'):
        deconv5 = make_deconv_same(deconv4, conv3.get_shape())
    with tf.variable_scope('deconv6'):
        deconv6 = make_deconv(deconv5, conv2.get_shape())
    with tf.variable_scope('deconv7'):
        deconv7 = make_deconv_same(deconv6, conv1.get_shape())
    with tf.variable_scope('deconv8'):
        deconv8 = make_deconv_same(deconv7, image.get_shape(), bn=False, scale=1e-1)

    # Image summaries are registered in the 'image_log' collection; the
    # returned summary tensors themselves are not needed here.
    with tf.device('/cpu:0'):
        tf.summary.image('output'+log_suffix, deconv8, collections=['image_log'])
        tf.summary.image('input'+log_suffix, image, collections=['image_log'])
    return deconv8
def loss(label, out, global_step, log_suffix):
    """Build the squared-error loss and register a scalar summary for it.

    Args:
        label: target tensor.
        out: tensor produced by the model.
        global_step: accepted but not used by this function.
        log_suffix: string appended to the ``'loss'`` summary name.

    Returns:
        The element-wise squared difference between ``label`` and ``out``
        (not reduced to a scalar).
    """
    with tf.name_scope('loss'):
        squared_err = tf.squared_difference(label, out)
        # for tensorboard Logarithmic graph mode
        scaled_mean = tf.reduce_mean(squared_err) * 1e+7
        with tf.device('/cpu:0'):
            tf.summary.scalar('loss'+log_suffix, scaled_mean)
    return squared_err
def average_gradients(tower_grads):
    """Average the per-tower gradients of every variable.

    Args:
        tower_grads: one list of ``(gradient, variable)`` pairs per tower;
            the lists are zipped together, so they are expected to be in the
            same variable order.

    Returns:
        A list of ``(averaged_gradient, variable)`` pairs, where the variable
        is taken from the first tower's pair.
    """
    with tf.name_scope('avarage_gradients'):
        averaged = []
        for per_variable in zip(*tower_grads):
            # Give every tower gradient a leading axis, join them along it
            # and take the mean over that axis.
            stacked = tf.concat(
                [tf.expand_dims(g, 0) for g, _ in per_variable], axis=0)
            mean_grad = tf.reduce_mean(stacked, 0)
            averaged.append((mean_grad, per_variable[0][1]))
        # Histogram summaries go to the 'grads' collection.
        for mean_grad, variable in averaged:
            with tf.device('/cpu:0'):
                tf.summary.histogram('grads/'+variable.name, mean_grad, collections=['grads'])
    return averaged
def main():
    """Build the multi-GPU training graph and run the training loop.

    One input pipeline and one model tower are created per GPU
    (``NUM_GPU`` in total). The tower gradients are averaged and applied with
    a single Adam optimizer. Loss and image summaries are written to
    ``'logs'`` on every even iteration, and the wall-clock time of each step
    is printed.

    The session, the summary writer and the queue-runner threads are released
    even when training is interrupted or raises.
    """
    # many jpeg images
    sample_dir = Path('./training_samples')
    file_list = [str(p) for p in sample_dir.iterdir() if p.suffix == '.jpg']

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        config_proto = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # if XLA is on, problem occurred
        # (uncomment the next line to enable XLA JIT and reproduce it)
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        #config_proto.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config_proto)
        try:
            global_step = tf.get_variable(
                'global_step', [], initializer=tf.constant_initializer(0), trainable=False)
            with tf.variable_scope('optimizer'):
                opt = tf.train.AdamOptimizer(1e-6)
            with tf.variable_scope('input'):
                filename_queue = tf.train.string_input_producer(file_list)
                reader = tf.WholeFileReader()
                # One shuffled batch pipeline per tower, so the list always
                # matches the tower loop below whatever NUM_GPU is.
                images_list = [
                    tf.train.shuffle_batch(
                        [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8)
                    for _ in range(NUM_GPU)]

            tower_grads = []
            for i in range(NUM_GPU):
                with tf.device('/gpu:{}'.format(i)):
                    # Variables are created by the first tower and reused by
                    # the following ones.
                    with tf.variable_scope('model', reuse=(i > 0), caching_device='/gpu:{}'.format(i)):
                        infer = inference(images_list[i], '/tower_{}'.format(i))
                    tower_loss = loss(images_list[i], infer, global_step, '/tower_{}'.format(i))
                    tower_grads.append(opt.compute_gradients(tower_loss))
            grads = average_gradients(tower_grads)
            train_op = opt.apply_gradients(grads, global_step=global_step)

            image_log_op = tf.summary.merge(tf.get_collection('image_log'))
            loss_log_op = tf.summary.merge_all()
            # The histograms in the 'grads' collection are not merged or
            # written; the original built a merge op for them but never ran it.

            writer = tf.summary.FileWriter('logs')
            try:
                sess.run(tf.global_variables_initializer())
                writer.add_graph(tf.get_default_graph())
                _run_training(sess, writer, train_op, loss_log_op, image_log_op)
            finally:
                writer.close()
        finally:
            sess.close()


def _run_training(sess, writer, train_op, loss_log_op, image_log_op):
    """Start the queue runners, run NUM_ITERATIONS steps, then stop the runners."""
    coordinator = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)
    try:
        for i in range(NUM_ITERATIONS):
            print('iteration: ',i)
            start = time.time()
            if i % 2 == 0:
                # Even steps also evaluate and write the summaries.
                _, loss_log, image_log = sess.run([train_op, loss_log_op, image_log_op])
                writer.add_summary(loss_log, i)
                writer.add_summary(image_log, i)
                writer.flush()
            else:
                sess.run([train_op])
            end = time.time()
            print('time = {}'.format(end - start))
    finally:
        # Ask the input threads to stop and wait for them, so they do not
        # outlive the session.
        coordinator.request_stop()
        coordinator.join(threads)
# Run the training only when this file is executed as a script.
if __name__ == '__main__':
    main()
Informationen zur Umgebung
Betriebssystem: Ubuntu 16.04; GPU: GTX 1080 x2; Konfigurationsoption (gcc): -march=native -O3; Konfigurationsoption (CUDA Compute Capability): 6.1
Installierte Version von CUDA und cuDNN:
/usr/local/cuda/lib64/libcudadevrt.a
/usr/local/cuda/lib64/libcudart.so -> libcudart.so.8.0
/usr/local/cuda/lib64/libcudart.so.8.0 -> libcudart.so.8.0.44
/usr/local/cuda/lib64/libcudart.so.8.0.44
/usr/local/cuda/lib64/libcudart_static.a
/usr/local/cuda/lib64/libcudnn.so -> libcudnn.so.5.1.5
/usr/local/cuda/lib64/libcudnn.so.5 -> libcudnn.so.5.1.5
/usr/local/cuda/lib64/libcudnn.so.5.1.5
/usr/local/cuda/lib64/libcudnn_static.a
Der TensorFlow-Commit-Hash: c56c873fbaf976d26d487ad57c8efbc87f05331c
Die Ausgabe von `bazel version`:
.......
Build label: 0.4.4
Build target: bazel-out/local-fastbuild/bin/src/main/java/com/google/devtools/build/lib/bazel/BazelServer_deploy.jar
Build time: Wed Feb 1 18:54:21 2017 (1485975261)
Build timestamp: 1485975261
Build timestamp as int: 1485975261
OK, ich verstehe. Vielen Dank. – Yusuke