- html - 出于某种原因,IE8 对我的 Sass 文件中继承的 html5 CSS 不友好?
- JMeter 在响应断言中使用 span 标签的问题
- html - 在 :hover and :active? 上具有不同效果的 CSS 动画
- html - 相对于居中的 html 内容固定的 CSS 重复背景?
我正在尝试在 TensorFlow 上实现 stacked hourglass(堆叠沙漏网络),而 torch 版本的实现已经存在(原文此处为链接)。
我使用 Titan X pascal 在默认配置(批量大小 = 6)下对其进行了测试,平均训练迭代时间约为 343 毫秒。
我使用随机输入/输出来测试我的 tensorflow 实现:
import tensorflow as tf
class stacked_hourglass():
    """Graph builder for a stacked-hourglass network on NHWC image tensors.

    Calling an instance on a 4-D tensor adds the whole network to the
    current TensorFlow graph and returns the per-stack predictions stacked
    into a single tensor.

    Args:
        nb_stack: number of hourglass modules to chain (must be >= 1).
        name: name scope under which all ops are created.

    Raises:
        ValueError: if ``nb_stack`` is smaller than 1.
    """

    def __init__(self, nb_stack, name='stacked_hourglass'):
        # Fail early: with fewer than one stack, graph construction would
        # otherwise die later with an opaque IndexError.
        if nb_stack < 1:
            raise ValueError('nb_stack must be >= 1, got %r' % (nb_stack,))
        self.nb_stack = nb_stack
        self.name = name

    def __call__(self, x):
        """Build the network on ``x`` and return the stacked outputs.

        Pipeline: pad 3 pixels on each spatial side, 7x7 stride-2 conv to
        64 channels, batch-norm + ReLU, residual block (128), 2x2 max-pool,
        residual blocks (128, 256), then ``nb_stack`` hourglass modules.
        Every module emits a 16-channel map; every module except the last
        also feeds the next one through 1x1 convolutions back to 256
        channels, summed with the module's own input.

        Args:
            x: 4-D NHWC tensor (the padding spec and the convolutions
                index it as [batch, height, width, channels]).

        Returns:
            ``tf.stack`` of the ``nb_stack`` 16-channel output tensors.
        """
        with tf.name_scope(self.name) as scope:
            padding = tf.pad(x, [[0, 0], [3, 3], [3, 3], [0, 0]], name='padding')
            # NOTE(review): the extent of this name scope only affects op
            # names, not the computation.
            with tf.name_scope("preprocessing") as sc:
                conv1 = self._conv(padding, 64, 7, 2, 'VALID', 'conv1')
                norm1 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                r1 = self._residual_block(norm1, 128, 'r1')
                pool = tf.contrib.layers.max_pool2d(r1, [2, 2], [2, 2], 'VALID', scope=scope)
                r2 = self._residual_block(pool, 128, 'r2')
                r3 = self._residual_block(r2, 256, 'r3')

            outputs = []
            carry = r3  # input of the next hourglass module
            last = self.nb_stack - 1

            # Intermediate modules: predict, then re-inject both features and
            # prediction into the stream consumed by the following module.
            for i in range(last):
                with tf.name_scope('_hourglass_' + str(i) + '_with_supervision'):
                    hg = self._hourglass(carry, 4, 256, '_hourglass')
                    ll = self._conv_bn_relu(hg, 256, name='conv_1')
                    ll_ = self._conv(ll, 256, 1, 1, 'VALID', 'll')
                    out = self._conv(ll, 16, 1, 1, 'VALID', 'out')
                    out_ = self._conv(out, 256, 1, 1, 'VALID', 'out_')
                    carry = tf.add_n([ll_, out_, carry])
                    outputs.append(out)

            # Final module: nothing follows it, so no re-injection branch.
            # With nb_stack == 1 this is the only module and it reads r3.
            with tf.name_scope('_hourglass_' + str(last) + '_with_supervision'):
                hg = self._hourglass(carry, 4, 256, '_hourglass')
                ll = self._conv_bn_relu(hg, 256, name='conv_1')
                outputs.append(self._conv(ll, 16, 1, 1, 'VALID', 'out'))

            return tf.stack(outputs)

    def _xavier_kernel(self, inputs, nb_filter, kernel_size):
        """Create a square conv kernel variable named 'weights'.

        The kernel shape is [kernel_size, kernel_size, in_channels,
        nb_filter], where in_channels is the static size of axis 3 of
        ``inputs``. Must be called inside the caller's name scope so the
        variable is named accordingly.
        """
        in_channels = inputs.get_shape().as_list()[3]
        initial = tf.contrib.layers.xavier_initializer(uniform=False)(
            [kernel_size, kernel_size, in_channels, nb_filter])
        return tf.Variable(initial, name='weights')

    def _conv(self, inputs, nb_filter, kernel_size=1, strides=1, pad='VALID', name='conv'):
        """Bias-free 2-D convolution (NHWC) with a freshly created kernel."""
        with tf.name_scope(name):
            kernel = self._xavier_kernel(inputs, nb_filter, kernel_size)
            return tf.nn.conv2d(inputs, kernel, [1, strides, strides, 1],
                                padding=pad, data_format='NHWC')

    def _conv_bn_relu(self, inputs, nb_filter, kernel_size=1, strides=1, name=None):
        """'SAME'-padded convolution followed by batch-norm with ReLU."""
        with tf.name_scope(name) as scope:
            kernel = self._xavier_kernel(inputs, nb_filter, kernel_size)
            conv = tf.nn.conv2d(inputs, kernel, [1, strides, strides, 1],
                                padding='SAME', data_format='NHWC')
            return tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                                activation_fn=tf.nn.relu, scope=scope)

    def _conv_block(self, inputs, nb_filter_out, name='_conv_block'):
        """Three (batch-norm + ReLU -> conv) stages: 1x1, 3x3, 1x1.

        The first two convolutions produce ``nb_filter_out // 2`` channels,
        the last one ``nb_filter_out``.
        """
        # Floor division keeps the filter count an int even under true
        # division (Python 3 or ``from __future__ import division``).
        half = nb_filter_out // 2
        with tf.name_scope(name):
            with tf.name_scope('norm_conv1') as sc:
                norm1 = tf.contrib.layers.batch_norm(inputs, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv1 = self._conv(norm1, half, 1, 1, 'SAME', name='conv1')
            with tf.name_scope('norm_conv2') as sc:
                norm2 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv2 = self._conv(norm2, half, 3, 1, 'SAME', name='conv2')
            with tf.name_scope('norm_conv3') as sc:
                norm3 = tf.contrib.layers.batch_norm(conv2, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv3 = self._conv(norm3, nb_filter_out, 1, 1, 'SAME', name='conv3')
            return conv3

    def _skip_layer(self, inputs, nb_filter_out, name='_skip_layer'):
        """Identity when the channel count already matches, else a 1x1 conv."""
        if inputs.get_shape().as_list()[3] == nb_filter_out:
            return inputs
        with tf.name_scope(name):
            return self._conv(inputs, nb_filter_out, 1, 1, 'SAME', name='conv')

    def _residual_block(self, inputs, nb_filter_out, name='_residual_block'):
        """Sum of the skip path and the three-stage conv block."""
        with tf.name_scope(name):
            _conv_block = self._conv_block(inputs, nb_filter_out)
            _skip_layer = self._skip_layer(inputs, nb_filter_out)
            return tf.add(_skip_layer, _conv_block)

    def _hourglass(self, inputs, n, nb_filter_res, name='_hourglass'):
        """Recursive hourglass module of depth ``n``.

        The upper branch is one residual block at the input resolution. The
        lower branch max-pools by 2, applies a residual block, recurses (or
        applies a plain residual block when ``n`` reaches 1), applies another
        residual block and upsamples by 2 with nearest-neighbour resizing.
        The two branches are added.
        """
        with tf.name_scope(name) as scope:
            # Upper branch
            up1 = self._residual_block(inputs, nb_filter_res, 'up1')
            # Lower branch
            pool = tf.contrib.layers.max_pool2d(inputs, [2, 2], [2, 2], 'VALID', scope=scope)
            low1 = self._residual_block(pool, nb_filter_res, 'low1')
            if n > 1:
                low2 = self._hourglass(low1, n - 1, nb_filter_res, 'low2')
            else:
                low2 = self._residual_block(low1, nb_filter_res, 'low2')
            low3 = self._residual_block(low2, nb_filter_res, 'low3')
            low4 = tf.image.resize_nearest_neighbor(low3, tf.shape(low3)[1:3] * 2,
                                                    name='upsampling')
            if n < 4:
                return tf.add(up1, low4, name='merge')
            # Levels with n >= 4 (the depth __call__ passes for the outermost
            # level) get one extra residual block after the merge.
            return self._residual_block(tf.add(up1, low4), nb_filter_res, 'low4')
if __name__ == "__main__":
    # Micro-benchmark: build an 8-stack network, then time 100 RMSProp
    # training steps on random data with batch size 6.
    import os
    import sys
    import numpy as np
    import time

    N_ITER = 100
    BATCH = 6
    NB_STACK = 8

    # Python 2 idiom: reopen stdout unbuffered so progress lines interleave
    # correctly with TensorFlow's own log output.
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

    with tf.Graph().as_default():
        DEVICE = '/gpu:0'
        with tf.device(DEVICE):
            print("start build model...")
            _x = tf.placeholder(tf.float32, [None, 256, 256, 3])
            y = tf.placeholder(tf.float32, [NB_STACK, None, 64, 64, 16])
            output = stacked_hourglass(NB_STACK, 'stacked_hourglass')(_x)
            loss = tf.reduce_mean(tf.square(output - y))
            rmsprop = tf.train.RMSPropOptimizer(2.5e-4)
            print("build finished...")
        train_step = tf.Variable(0, name='global_step', trainable=False)
        with tf.device(DEVICE):
            train_rmsprop = rmsprop.minimize(loss, train_step)
            init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)
            print("test...")
            # Generate the feeds as float32 up front so that no per-step
            # float64 -> float32 conversion is charged to the timed loop.
            xarr = np.random.rand(N_ITER, BATCH, 256, 256, 3).astype(np.float32)
            yarr = np.random.rand(N_ITER, NB_STACK, BATCH, 64, 64, 16).astype(np.float32)
            # Wall-clock time: time.clock() reports CPU time on Unix, which
            # does not reflect time spent waiting on the GPU.
            _time = time.time()
            for u in range(N_ITER):
                sess.run(train_rmsprop, feed_dict={_x: xarr[u], y: yarr[u]})
            print("test: %s" % (time.time() - _time,))
输出为:
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcublas.so
.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcudnn.so.
5 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcufft.so.
8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcuda.so.1
locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcurand.so
.8.0 locally
start build model...
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "NegTrain" device_type: "CPU"')
for unknown op: NegTrain
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "Skipgram" device_type: "CPU"')
for unknown op: Skipgram
build finished...
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties:
name: TITAN X (Pascal)
major: 6 minor: 1 memoryClockRate (GHz) 1.531
pciBusID 0000:05:00.0
Total memory: 11.90GiB
Free memory: 11.75GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -
> (device: 0, name: TITAN X (Pascal), pci bus id: 0000:05:00.0)
test...
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get req
uests, put_count=2609 evicted_count=1000 eviction_rate=0.383289 and unsatisfied allocation ra
te=0.667841
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 100
to 110
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get reques
ts, put_count=2013 evicted_count=2000 eviction_rate=0.993542 and unsatisfied allocation rate=
0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get req
uests, put_count=4719 evicted_count=3000 eviction_rate=0.635728 and unsatisfied allocation ra
te=0.625358
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 193
to 212
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get reques
ts, put_count=2025 evicted_count=2000 eviction_rate=0.987654 and unsatisfied allocation rate=
0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get reques
ts, put_count=1037 evicted_count=1000 eviction_rate=0.96432 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get reques
ts, put_count=1054 evicted_count=1000 eviction_rate=0.948767 and unsatisfied allocation rate=
0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get reques
ts, put_count=1079 evicted_count=1000 eviction_rate=0.926784 and unsatisfied allocation rate=
0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get req
uests, put_count=5036 evicted_count=2000 eviction_rate=0.397141 and unsatisfied allocation ra
te=0.359674
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 140
0 to 1540
test: 71.733044
这意味着平均迭代时间约为 717 毫秒,比 torch 实现慢两倍......
我知道 TensorFlow 应该会稍微慢一些,但人们已经做了很多工作来追赶(根据一些基准测试,现在两者的速度应该非常接近)。
你知道是什么让我的实现如此缓慢吗?
最佳答案
前向传播(forward pass)的耗时相比如何?历史上,TensorFlow 在反向传播上比 Torch 慢,因为自动微分(AD)是在粒度更细的计算图上运行的(针对单个数学运算,而不是 Torch 的层),因此会为反向传递生成更多的运算。在某些情况下,可以通过为重要的运算/梯度添加融合(fused)版本来缓解这一问题。
一些想法
确保底层实际使用的是 tf.fused_batch_norm(即传入 fused=True 参数,见原回答中的链接)
使用队列而不是 feed_dict。feed_dict 会产生一次从 Python 运行时到 TensorFlow 运行时的额外拷贝,因此实际上执行了两次拷贝——Python -> TensorFlow CPU,以及 TensorFlow CPU -> TensorFlow GPU。至于如何进一步隐藏 CPU -> GPU 的传输延迟,原回答此处另附了一个链接(this)。
查看 timelines可以告诉您哪个部分太慢。
tcmalloc 和 c protobuf
sudo apt-get install google-perftools
export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"
pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.0.0-cp27-none-linux_x86_64.whl
关于python - tensorflow 的实现比 torch 的慢 2 倍,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/41211691/
我想将模型及其各自训练的权重从 tensorflow.js 转换为标准 tensorflow,但无法弄清楚如何做到这一点,tensorflow.js 的文档对此没有任何说明 我有一个 manifest
我有一个运行良好的 TF 模型,它是用 Python 和 TFlearn 构建的。有没有办法在另一个系统上运行这个模型而不安装 Tensorflow?它已经经过预训练,所以我只需要通过它运行数据。 我
当执行 tensorflow_model_server 二进制文件时,它需要一个模型名称命令行参数,model_name。 如何在训练期间指定模型名称,以便在运行 tensorflow_model_s
我一直在 R 中使用标准包进行生存分析。我知道如何在 TensorFlow 中处理分类问题,例如逻辑回归,但我很难将其映射到生存分析问题。在某种程度上,您有两个输出向量而不是一个输出向量(time_t
Torch7 has a library for generating Gaussian Kernels在一个固定的支持。 Tensorflow 中有什么可比的吗?我看到 these distribu
在Keras中我们可以简单的添加回调,如下所示: self.model.fit(X_train,y_train,callbacks=[Custom_callback]) 回调在doc中定义,但我找不到
我正在寻找一种在 tensorflow 中有条件打印节点的方法,使用下面的示例代码行,其中每 10 个循环计数,它应该在控制台中打印一些东西。但这对我不起作用。谁能建议? 谢谢,哈米德雷萨, epsi
我想使用 tensorflow object detection API 创建我自己的 .tfrecord 文件,并将它们用于训练。该记录将是原始数据集的子集,因此模型将仅检测特定类别。我不明白也无法
我在 TensorFlow 中训练了一个聊天机器人,想保存模型以便使用 TensorFlow.js 将其部署到 Web。我有以下内容 checkpoint = "./chatbot_weights.c
我最近开始学习 Tensorflow,特别是我想使用卷积神经网络进行图像分类。我一直在看官方仓库中的android demo,特别是这个例子:https://github.com/tensorflow
我目前正在研究单图像超分辨率,并且我设法卡住了现有的检查点文件并将其转换为 tensorflow lite。但是,使用 .tflite 文件执行推理时,对一张图像进行上采样所需的时间至少是使用 .ck
我注意到 tensorflow 的 api 中已经有批量标准化函数。我不明白的一件事是如何更改训练和测试之间的程序? 批量归一化在测试和训练期间的作用不同。具体来说,在训练期间使用固定的均值和方差。
我创建了一个模型,该模型将 Mobilenet V2 应用于 Google colab 中的卷积基础层。然后我使用这个命令转换它: path_to_h5 = working_dir + '/Tenso
代码取自:- http://adventuresinmachinelearning.com/python-tensorflow-tutorial/ import tensorflow as tf fr
好了,所以我准备在Tensorflow中运行 tf.nn.softmax_cross_entropy_with_logits() 函数。 据我了解,“logit”应该是概率的张量,每个对应于某个像素的
tensorflow 服务构建依赖于大型 tensorflow ;但我已经成功构建了 tensorflow。所以我想用它。我做这些事情:我更改了 tensorflow 服务 WORKSPACE(org
Tensoflow 嵌入层 ( https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding ) 易于使用, 并且有大量的文
我正在尝试使用非常大的数据集(比我的内存大得多)训练 Tensorflow 模型。 为了充分利用所有可用的训练数据,我正在考虑将它们分成几个小的“分片”,并一次在一个分片上进行训练。 经过一番研究,我
根据 Sutton 的书 - Reinforcement Learning: An Introduction,网络权重的更新方程为: 其中 et 是资格轨迹。 这类似于带有额外 et 的梯度下降更新。
如何根据条件选择执行图表的一部分? 我的网络有一部分只有在 feed_dict 中提供占位符值时才会执行.如果未提供该值,则采用备用路径。我该如何使用 tensorflow 来实现它? 以下是我的代码
我是一名优秀的程序员,十分优秀!