
python - TensorFlow implementation is 2x slower than Torch


I am trying to implement the stacked hourglass network in TensorFlow; a Torch implementation already exists here.

I tested it on a Titan X Pascal in the default configuration (batch size = 6), and the average training iteration takes about 343 ms.

I test my TensorFlow implementation with random inputs/outputs:

import tensorflow as tf

class stacked_hourglass():
    def __init__(self, nb_stack, name='stacked_hourglass'):
        self.nb_stack = nb_stack
        self.name = name

    def __call__(self, x):
        with tf.name_scope(self.name) as scope:
            padding = tf.pad(x, [[0,0],[3,3],[3,3],[0,0]], name='padding')
            with tf.name_scope("preprocessing") as sc:
                conv1 = self._conv(padding, 64, 7, 2, 'VALID', 'conv1')
                norm1 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                r1 = self._residual_block(norm1, 128, 'r1')
                pool = tf.contrib.layers.max_pool2d(r1, [2,2], [2,2], 'VALID', scope=scope)
                r2 = self._residual_block(pool, 128, 'r2')
                r3 = self._residual_block(r2, 256, 'r3')
            hg = [None] * self.nb_stack
            ll = [None] * self.nb_stack
            ll_ = [None] * self.nb_stack
            out = [None] * self.nb_stack
            out_ = [None] * self.nb_stack
            sum_ = [None] * self.nb_stack
            with tf.name_scope('_hourglass_0_with_supervision') as sc:
                hg[0] = self._hourglass(r3, 4, 256, '_hourglass')
                ll[0] = self._conv_bn_relu(hg[0], 256, name='conv_1')
                ll_[0] = self._conv(ll[0], 256, 1, 1, 'VALID', 'll')
                out[0] = self._conv(ll[0], 16, 1, 1, 'VALID', 'out')
                out_[0] = self._conv(out[0], 256, 1, 1, 'VALID', 'out_')
                sum_[0] = tf.add_n([ll_[0], out_[0], r3])
            for i in range(1, self.nb_stack - 1):
                with tf.name_scope('_hourglass_' + str(i) + '_with_supervision') as sc:
                    hg[i] = self._hourglass(sum_[i-1], 4, 256, '_hourglass')
                    ll[i] = self._conv_bn_relu(hg[i], 256, name='conv_1')
                    ll_[i] = self._conv(ll[i], 256, 1, 1, 'VALID', 'll')
                    out[i] = self._conv(ll[i], 16, 1, 1, 'VALID', 'out')
                    out_[i] = self._conv(out[i], 256, 1, 1, 'VALID', 'out_')
                    sum_[i] = tf.add_n([ll_[i], out_[i], sum_[i-1]])
            with tf.name_scope('_hourglass_' + str(self.nb_stack - 1) + '_with_supervision') as sc:
                hg[self.nb_stack - 1] = self._hourglass(sum_[self.nb_stack - 2], 4, 256, '_hourglass')
                ll[self.nb_stack - 1] = self._conv_bn_relu(hg[self.nb_stack - 1], 256, name='conv_1')
                out[self.nb_stack - 1] = self._conv(ll[self.nb_stack - 1], 16, 1, 1, 'VALID', 'out')
            return tf.stack(out)

    def _conv(self, inputs, nb_filter, kernel_size=1, strides=1, pad='VALID', name='conv'):
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(tf.contrib.layers.xavier_initializer(uniform=False)(
                [kernel_size, kernel_size, inputs.get_shape().as_list()[3], nb_filter]),
                name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1,strides,strides,1], padding=pad,
                                data_format='NHWC')
            return conv

    def _conv_bn_relu(self, inputs, nb_filter, kernel_size=1, strides=1, name=None):
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(tf.contrib.layers.xavier_initializer(uniform=False)(
                [kernel_size, kernel_size, inputs.get_shape().as_list()[3], nb_filter]),
                name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1,strides,strides,1], padding='SAME',
                                data_format='NHWC')
            norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                                activation_fn=tf.nn.relu, scope=scope)
            return norm

    def _conv_block(self, inputs, nb_filter_out, name='_conv_block'):
        with tf.name_scope(name) as scope:
            with tf.name_scope('norm_conv1') as sc:
                norm1 = tf.contrib.layers.batch_norm(inputs, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv1 = self._conv(norm1, nb_filter_out / 2, 1, 1, 'SAME', name='conv1')
            with tf.name_scope('norm_conv2') as sc:
                norm2 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv2 = self._conv(norm2, nb_filter_out / 2, 3, 1, 'SAME', name='conv2')
            with tf.name_scope('norm_conv3') as sc:
                norm3 = tf.contrib.layers.batch_norm(conv2, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv3 = self._conv(norm3, nb_filter_out, 1, 1, 'SAME', name='conv3')
            return conv3

    def _skip_layer(self, inputs, nb_filter_out, name='_skip_layer'):
        if inputs.get_shape()[3].__eq__(tf.Dimension(nb_filter_out)):
            return inputs
        else:
            with tf.name_scope(name) as scope:
                conv = self._conv(inputs, nb_filter_out, 1, 1, 'SAME', name='conv')
                return conv

    def _residual_block(self, inputs, nb_filter_out, name='_residual_block'):
        with tf.name_scope(name) as scope:
            _conv_block = self._conv_block(inputs, nb_filter_out)
            _skip_layer = self._skip_layer(inputs, nb_filter_out)
            return tf.add(_skip_layer, _conv_block)

    def _hourglass(self, inputs, n, nb_filter_res, name='_hourglass'):
        with tf.name_scope(name) as scope:
            # Upper branch
            up1 = self._residual_block(inputs, nb_filter_res, 'up1')
            # Lower branch
            pool = tf.contrib.layers.max_pool2d(inputs, [2,2], [2,2], 'VALID', scope=scope)
            low1 = self._residual_block(pool, nb_filter_res, 'low1')
            if n > 1:
                low2 = self._hourglass(low1, n-1, nb_filter_res, 'low2')
            else:
                low2 = self._residual_block(low1, nb_filter_res, 'low2')
            low3 = self._residual_block(low2, nb_filter_res, 'low3')
            low4 = tf.image.resize_nearest_neighbor(low3, tf.shape(low3)[1:3] * 2,
                                                    name='upsampling')
            if n < 4:
                return tf.add(up1, low4, name='merge')
            else:
                return self._residual_block(tf.add(up1, low4), nb_filter_res, 'low4')

if __name__ == "__main__":
    import os
    import sys
    import numpy as np
    import time
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    with tf.Graph().as_default():
        DEVICE = '/gpu:0'
        with tf.device(DEVICE):
            print "start build model..."
            _x = tf.placeholder(tf.float32, [None, 256, 256, 3])
            y = tf.placeholder(tf.float32, [8, None, 64, 64, 16])
            output = stacked_hourglass(8, 'stacked_hourglass')(_x)
            loss = tf.reduce_mean(tf.square(output - y))
            rmsprop = tf.train.RMSPropOptimizer(2.5e-4)
        print "build finished..."
        train_step = tf.Variable(0, name='global_step', trainable=False)
        with tf.device(DEVICE):
            train_rmsprop = rmsprop.minimize(loss, train_step)
            init = tf.global_variables_initializer()
        with tf.Session() as sess:
            with tf.device(DEVICE):
                sess.run(init)
            print "test..."
            xarr = np.random.rand(100, 6, 256, 256, 3)
            yarr = np.random.rand(100, 8, 6, 64, 64, 16)
            _time = time.clock()
            with tf.device(DEVICE):
                for u in range(0, 100):
                    sess.run(train_rmsprop, feed_dict={_x: xarr[u], y: yarr[u]})
            print "test:", time.clock() - _time

The output is:

I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcurand.so.8.0 locally
start build model...
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "NegTrain" device_type: "CPU"') for unknown op: NegTrain
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "Skipgram" device_type: "CPU"') for unknown op: Skipgram
build finished...
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties:
name: TITAN X (Pascal)
major: 6 minor: 1 memoryClockRate (GHz) 1.531
pciBusID 0000:05:00.0
Total memory: 11.90GiB
Free memory: 11.75GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: TITAN X (Pascal), pci bus id: 0000:05:00.0)
test...
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=2609 evicted_count=1000 eviction_rate=0.383289 and unsatisfied allocation rate=0.667841
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 100 to 110
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=2013 evicted_count=2000 eviction_rate=0.993542 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=4719 evicted_count=3000 eviction_rate=0.635728 and unsatisfied allocation rate=0.625358
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 193 to 212
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=2025 evicted_count=2000 eviction_rate=0.987654 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1037 evicted_count=1000 eviction_rate=0.96432 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1054 evicted_count=1000 eviction_rate=0.948767 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1079 evicted_count=1000 eviction_rate=0.926784 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=5036 evicted_count=2000 eviction_rate=0.397141 and unsatisfied allocation rate=0.359674
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 1400 to 1540
test: 71.733044

That means an average iteration time of about 717 ms, i.e. twice as slow as the Torch implementation...

I know TensorFlow is expected to be slightly slower, but a lot of work has been done to catch up (and given some benchmarks, the two should now be very close).

Do you know what is making my implementation so slow?

Best Answer

How do the forward step times compare? Historically, TensorFlow has been slower than Torch on the backward pass because automatic differentiation runs on a graph with finer granularity (individual math ops rather than Torch layers) and therefore generates more ops for the backward pass. In some cases this is mitigated by adding fused versions of important ops/gradients.
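
Not part of the original answer, but as a way to check this, one could time a forward-only run against a full training step, reusing the names from the question's script (_x, y, output, train_rmsprop, xarr, yarr, sess); the helper below is a minimal sketch under that assumption:

import time

def avg_run_time(sess, fetches, feed, n=20):
    # One warm-up run so one-off setup cost (e.g. cuDNN autotuning) is excluded.
    sess.run(fetches, feed_dict=feed)
    start = time.time()
    for _ in range(n):
        sess.run(fetches, feed_dict=feed)
    return (time.time() - start) / n

feed = {_x: xarr[0], y: yarr[0]}
forward_ms = avg_run_time(sess, output, feed) * 1000        # forward pass only
full_ms = avg_run_time(sess, train_rmsprop, feed) * 1000    # forward + backward + update
print("forward: %.1f ms, full step: %.1f ms" % (forward_ms, full_ms))

If the forward times of the two frameworks are comparable, the gap is most likely in the backward pass rather than in the layer implementations themselves.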

Some ideas:

  1. Make sure you are using tf.fused_batch_norm under the hood (i.e. the fused=True argument here); a sketch follows this list.

  2. Use queues instead of feed_dict. feed_dict incurs an extra copy from the Python runtime into the TensorFlow runtime, so you actually perform two copies: Python -> TensorFlow CPU, then TensorFlow CPU -> TensorFlow GPU. For an extra step that hides the CPU -> GPU transfer latency, there is this. A queue sketch also follows the list.

  3. Looking at timelines can tell you which part is too slow; see the timeline sketch after this list.
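
The sketches below are not from the original answer; they only illustrate the three ideas under stated assumptions, reusing names from the question's script where noted.

For idea 1, a minimal sketch assuming a TensorFlow version in which tf.contrib.layers.batch_norm exposes a fused argument (the batch-norm call in the question's _conv_bn_relu would change like this):

# Ask contrib batch_norm for the fused (single-kernel) implementation.
norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                    activation_fn=tf.nn.relu,
                                    fused=True, scope=scope)

For idea 2, a rough sketch of a queue-based input pipeline built with tf.FIFOQueue; x_batch, y_batch, feed_queue and enqueue_thread are hypothetical names, and the model/loss would have to be built on the dequeued tensors instead of the placeholders:

import threading

queue = tf.FIFOQueue(capacity=4,
                     dtypes=[tf.float32, tf.float32],
                     shapes=[[6, 256, 256, 3], [8, 6, 64, 64, 16]])
enqueue_op = queue.enqueue([_x, y])    # placeholders are fed from a background thread
x_batch, y_batch = queue.dequeue()     # build the model and loss on these tensors

def feed_queue(sess):
    # Pushes the pre-generated batches into the queue, overlapping with training steps.
    for u in range(100):
        sess.run(enqueue_op, feed_dict={_x: xarr[u], y: yarr[u]})

enqueue_thread = threading.Thread(target=feed_queue, args=(sess,))
enqueue_thread.start()

For idea 3, a minimal sketch that records a timeline for a single training step and writes a Chrome trace (open timeline.json at chrome://tracing to see per-op timings):

from tensorflow.python.client import timeline

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
sess.run(train_rmsprop, feed_dict={_x: xarr[0], y: yarr[0]},
         options=run_options, run_metadata=run_metadata)
trace = timeline.Timeline(run_metadata.step_stats)
with open('timeline.json', 'w') as f:
    f.write(trace.generate_chrome_trace_format())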

Also try tcmalloc and the C protobuf implementation:

sudo apt-get install google-perftools
export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"
pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.0.0-cp27-none-linux_x86_64.whl

Regarding "python - TensorFlow implementation is 2x slower than Torch", the original question can be found on Stack Overflow: https://stackoverflow.com/questions/41211691/
