
python - TensorFlow implementation is 2x slower than Torch


I am trying to implement the stacked hourglass network in TensorFlow; a Torch implementation already exists here.

I tested it on a Titan X Pascal in the default configuration (batch size = 6), and the average training iteration takes about 343 ms.

I test my TensorFlow implementation with random inputs/outputs:

import tensorflow as tf

class stacked_hourglass():
    def __init__(self, nb_stack, name='stacked_hourglass'):
        self.nb_stack = nb_stack
        self.name = name

    def __call__(self, x):
        with tf.name_scope(self.name) as scope:
            padding = tf.pad(x, [[0,0],[3,3],[3,3],[0,0]], name='padding')
            with tf.name_scope("preprocessing") as sc:
                conv1 = self._conv(padding, 64, 7, 2, 'VALID', 'conv1')
                norm1 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                r1 = self._residual_block(norm1, 128, 'r1')
                pool = tf.contrib.layers.max_pool2d(r1, [2,2], [2,2], 'VALID', scope=scope)
                r2 = self._residual_block(pool, 128, 'r2')
                r3 = self._residual_block(r2, 256, 'r3')
            hg = [None] * self.nb_stack
            ll = [None] * self.nb_stack
            ll_ = [None] * self.nb_stack
            out = [None] * self.nb_stack
            out_ = [None] * self.nb_stack
            sum_ = [None] * self.nb_stack
            with tf.name_scope('_hourglass_0_with_supervision') as sc:
                hg[0] = self._hourglass(r3, 4, 256, '_hourglass')
                ll[0] = self._conv_bn_relu(hg[0], 256, name='conv_1')
                ll_[0] = self._conv(ll[0], 256, 1, 1, 'VALID', 'll')
                out[0] = self._conv(ll[0], 16, 1, 1, 'VALID', 'out')
                out_[0] = self._conv(out[0], 256, 1, 1, 'VALID', 'out_')
                sum_[0] = tf.add_n([ll_[0], out_[0], r3])
            for i in range(1, self.nb_stack - 1):
                with tf.name_scope('_hourglass_' + str(i) + '_with_supervision') as sc:
                    hg[i] = self._hourglass(sum_[i-1], 4, 256, '_hourglass')
                    ll[i] = self._conv_bn_relu(hg[i], 256, name='conv_1')
                    ll_[i] = self._conv(ll[i], 256, 1, 1, 'VALID', 'll')
                    out[i] = self._conv(ll[i], 16, 1, 1, 'VALID', 'out')
                    out_[i] = self._conv(out[i], 256, 1, 1, 'VALID', 'out_')
                    sum_[i] = tf.add_n([ll_[i], out_[i], sum_[i-1]])
            with tf.name_scope('_hourglass_' + str(self.nb_stack - 1) + '_with_supervision') as sc:
                hg[self.nb_stack - 1] = self._hourglass(sum_[self.nb_stack - 2], 4, 256, '_hourglass')
                ll[self.nb_stack - 1] = self._conv_bn_relu(hg[self.nb_stack - 1], 256, name='conv_1')
                out[self.nb_stack - 1] = self._conv(ll[self.nb_stack - 1], 16, 1, 1, 'VALID', 'out')
            return tf.stack(out)

    def _conv(self, inputs, nb_filter, kernel_size=1, strides=1, pad='VALID', name='conv'):
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(tf.contrib.layers.xavier_initializer(uniform=False)(
                [kernel_size, kernel_size, inputs.get_shape().as_list()[3], nb_filter]),
                name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1,strides,strides,1], padding=pad,
                                data_format='NHWC')
            return conv

    def _conv_bn_relu(self, inputs, nb_filter, kernel_size=1, strides=1, name=None):
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(tf.contrib.layers.xavier_initializer(uniform=False)(
                [kernel_size, kernel_size, inputs.get_shape().as_list()[3], nb_filter]),
                name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1,strides,strides,1], padding='SAME',
                                data_format='NHWC')
            norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                                activation_fn=tf.nn.relu, scope=scope)
            return norm

    def _conv_block(self, inputs, nb_filter_out, name='_conv_block'):
        with tf.name_scope(name) as scope:
            with tf.name_scope('norm_conv1') as sc:
                norm1 = tf.contrib.layers.batch_norm(inputs, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv1 = self._conv(norm1, nb_filter_out / 2, 1, 1, 'SAME', name='conv1')
            with tf.name_scope('norm_conv2') as sc:
                norm2 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv2 = self._conv(norm2, nb_filter_out / 2, 3, 1, 'SAME', name='conv2')
            with tf.name_scope('norm_conv3') as sc:
                norm3 = tf.contrib.layers.batch_norm(conv2, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv3 = self._conv(norm3, nb_filter_out, 1, 1, 'SAME', name='conv3')
            return conv3

    def _skip_layer(self, inputs, nb_filter_out, name='_skip_layer'):
        if inputs.get_shape()[3].__eq__(tf.Dimension(nb_filter_out)):
            return inputs
        else:
            with tf.name_scope(name) as scope:
                conv = self._conv(inputs, nb_filter_out, 1, 1, 'SAME', name='conv')
                return conv

    def _residual_block(self, inputs, nb_filter_out, name='_residual_block'):
        with tf.name_scope(name) as scope:
            _conv_block = self._conv_block(inputs, nb_filter_out)
            _skip_layer = self._skip_layer(inputs, nb_filter_out)
            return tf.add(_skip_layer, _conv_block)

    def _hourglass(self, inputs, n, nb_filter_res, name='_hourglass'):
        with tf.name_scope(name) as scope:
            # Upper branch
            up1 = self._residual_block(inputs, nb_filter_res, 'up1')
            # Lower branch
            pool = tf.contrib.layers.max_pool2d(inputs, [2,2], [2,2], 'VALID', scope=scope)
            low1 = self._residual_block(pool, nb_filter_res, 'low1')
            if n > 1:
                low2 = self._hourglass(low1, n-1, nb_filter_res, 'low2')
            else:
                low2 = self._residual_block(low1, nb_filter_res, 'low2')
            low3 = self._residual_block(low2, nb_filter_res, 'low3')
            low4 = tf.image.resize_nearest_neighbor(low3, tf.shape(low3)[1:3] * 2,
                                                    name='upsampling')
            if n < 4:
                return tf.add(up1, low4, name='merge')
            else:
                return self._residual_block(tf.add(up1, low4), nb_filter_res, 'low4')

if __name__ == "__main__":
    import os
    import sys
    import numpy as np
    import time
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    with tf.Graph().as_default():
        DEVICE = '/gpu:0'
        with tf.device(DEVICE):
            print "start build model..."
            _x = tf.placeholder(tf.float32, [None, 256, 256, 3])
            y = tf.placeholder(tf.float32, [8, None, 64, 64, 16])
            output = stacked_hourglass(8, 'stacked_hourglass')(_x)
            loss = tf.reduce_mean(tf.square(output - y))
            rmsprop = tf.train.RMSPropOptimizer(2.5e-4)
        print "build finished..."
        train_step = tf.Variable(0, name='global_step', trainable=False)
        with tf.device(DEVICE):
            train_rmsprop = rmsprop.minimize(loss, train_step)
            init = tf.global_variables_initializer()
        with tf.Session() as sess:
            with tf.device(DEVICE):
                sess.run(init)
            print "test..."
            xarr = np.random.rand(100, 6, 256, 256, 3)
            yarr = np.random.rand(100, 8, 6, 64, 64, 16)
            _time = time.clock()
            with tf.device(DEVICE):
                for u in range(0, 100):
                    sess.run(train_rmsprop, feed_dict={_x: xarr[u], y: yarr[u]})
            print "test:", time.clock() - _time

The output is:

I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcurand.so.8.0 locally
start build model...
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "NegTrain" device_type: "CPU"') for unknown op: NegTrain
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "Skipgram" device_type: "CPU"') for unknown op: Skipgram
build finished...
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties:
name: TITAN X (Pascal)
major: 6 minor: 1 memoryClockRate (GHz) 1.531
pciBusID 0000:05:00.0
Total memory: 11.90GiB
Free memory: 11.75GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: TITAN X (Pascal), pci bus id: 0000:05:00.0)
test...
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=2609 evicted_count=1000 eviction_rate=0.383289 and unsatisfied allocation rate=0.667841
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 100 to 110
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=2013 evicted_count=2000 eviction_rate=0.993542 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=4719 evicted_count=3000 eviction_rate=0.635728 and unsatisfied allocation rate=0.625358
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 193 to 212
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=2025 evicted_count=2000 eviction_rate=0.987654 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1037 evicted_count=1000 eviction_rate=0.96432 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1054 evicted_count=1000 eviction_rate=0.948767 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1079 evicted_count=1000 eviction_rate=0.926784 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=5036 evicted_count=2000 eviction_rate=0.397141 and unsatisfied allocation rate=0.359674
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 1400 to 1540
test: 71.733044

That means an average iteration time of about 717 ms, i.e. twice as slow as the Torch implementation...

I know TensorFlow is expected to be slightly slower, but a lot of work has been done to catch up (and given some benchmarks, the two should now be very close).

Do you know what is making my implementation so slow?

Best Answer

How do the forward step times compare? Historically, TensorFlow has been slower than Torch on the backward pass because automatic differentiation runs on a graph with finer granularity (individual math ops rather than Torch layers) and therefore generates more ops for the backward pass. In some cases this is mitigated by adding fused versions of important ops/gradients.
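
Not part of the original answer, but as a way to check this, one could time a forward-only run against a full training step, reusing the names from the question's script (_x, y, output, train_rmsprop, xarr, yarr, sess); the helper below is a minimal sketch under that assumption:

import time

def avg_run_time(sess, fetches, feed, n=20):
    # One warm-up run so one-off setup cost (e.g. cuDNN autotuning) is excluded.
    sess.run(fetches, feed_dict=feed)
    start = time.time()
    for _ in range(n):
        sess.run(fetches, feed_dict=feed)
    return (time.time() - start) / n

feed = {_x: xarr[0], y: yarr[0]}
forward_ms = avg_run_time(sess, output, feed) * 1000        # forward pass only
full_ms = avg_run_time(sess, train_rmsprop, feed) * 1000    # forward + backward + update
print("forward: %.1f ms, full step: %.1f ms" % (forward_ms, full_ms))

If the forward times of the two frameworks are comparable, the gap is most likely in the backward pass rather than in the layer implementations themselves.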

Some ideas:

  1. Make sure you are using tf.fused_batch_norm under the hood (i.e. the fused=True argument here); a sketch follows this list.

  2. Use queues instead of feed_dict. feed_dict incurs an extra copy from the Python runtime into the TensorFlow runtime, so you actually perform two copies: Python -> TensorFlow CPU, then TensorFlow CPU -> TensorFlow GPU. For an extra step that hides the CPU -> GPU transfer latency, there is this. A queue sketch also follows the list.

  3. Looking at timelines can tell you which part is too slow; see the timeline sketch after this list.
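
The sketches below are not from the original answer; they only illustrate the three ideas under stated assumptions, reusing names from the question's script where noted.

For idea 1, a minimal sketch assuming a TensorFlow version in which tf.contrib.layers.batch_norm exposes a fused argument (the batch-norm call in the question's _conv_bn_relu would change like this):

# Ask contrib batch_norm for the fused (single-kernel) implementation.
norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                    activation_fn=tf.nn.relu,
                                    fused=True, scope=scope)

For idea 2, a rough sketch of a queue-based input pipeline built with tf.FIFOQueue; x_batch, y_batch, feed_queue and enqueue_thread are hypothetical names, and the model/loss would have to be built on the dequeued tensors instead of the placeholders:

import threading

queue = tf.FIFOQueue(capacity=4,
                     dtypes=[tf.float32, tf.float32],
                     shapes=[[6, 256, 256, 3], [8, 6, 64, 64, 16]])
enqueue_op = queue.enqueue([_x, y])    # placeholders are fed from a background thread
x_batch, y_batch = queue.dequeue()     # build the model and loss on these tensors

def feed_queue(sess):
    # Pushes the pre-generated batches into the queue, overlapping with training steps.
    for u in range(100):
        sess.run(enqueue_op, feed_dict={_x: xarr[u], y: yarr[u]})

enqueue_thread = threading.Thread(target=feed_queue, args=(sess,))
enqueue_thread.start()

For idea 3, a minimal sketch that records a timeline for a single training step and writes a Chrome trace (open timeline.json at chrome://tracing to see per-op timings):

from tensorflow.python.client import timeline

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
sess.run(train_rmsprop, feed_dict={_x: xarr[0], y: yarr[0]},
         options=run_options, run_metadata=run_metadata)
trace = timeline.Timeline(run_metadata.step_stats)
with open('timeline.json', 'w') as f:
    f.write(trace.generate_chrome_trace_format())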

Also try tcmalloc and the C protobuf implementation:

sudo apt-get install google-perftools
export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"
pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.0.0-cp27-none-linux_x86_64.whl

Regarding "python - TensorFlow implementation is 2x slower than Torch", the original question can be found on Stack Overflow: https://stackoverflow.com/questions/41211691/
