c++ - cuda9 + thrust sort_by_key overlapped with H2D copy (using streams)


I want to overlap a thrust::sort_by_key operation with a host-to-device copy. Despite taking a cudaStream_t as an argument, my experiments seem to show that thrust::sort_by_key is a blocking operation. Below I attach a complete code example in which I first measure the time to copy the data (from pinned memory), then measure the time taken by sort_by_key. Finally, I try to overlap the two operations. I would expect the copy time to be hidden by the sort_by_key operation. Instead, I find that the overlapped operations take longer than the sum of the two independent operations.

Can anyone spot a problem with the code? Or am I misunderstanding thrust's support for CUDA streams?

#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <random>
#include <iostream>
#include <sys/time.h>

int main() {

    // size of arrays
    const int n = 300000000;

    // random number generator
    std::mt19937 rng;

    // key/val on host
    uint32_t * key = new uint32_t[n];
    uint32_t * val = new uint32_t[n];

    // fill key val
    for(int i = 0; i < n; i++) {
        key[i] = rng();
        val[i] = i;
    }

    // key/val on device
    uint32_t * dev_key;
    uint32_t * dev_val;

    // allocate memory on GPU for key/val
    cudaMalloc((void**)&dev_key, n*sizeof(uint32_t));
    cudaMalloc((void**)&dev_val, n*sizeof(uint32_t));

    // copy key/val onto the device
    cudaMemcpy(dev_key, key, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_val, val, n*sizeof(uint32_t), cudaMemcpyHostToDevice);

    // get thrust device pointers to key/val on device
    thrust::device_ptr<uint32_t> dev_key_ptr = thrust::device_pointer_cast(dev_key);
    thrust::device_ptr<uint32_t> dev_val_ptr = thrust::device_pointer_cast(dev_val);

    // data on host
    uint32_t * data;

    // allocate pinned memory for data on host
    cudaMallocHost((void**)&data, n*sizeof(uint32_t));

    // fill data with random numbers
    for(int i = 0; i < n; i++) {
        data[i] = rng();
    }

    // data on device
    uint32_t * dev_data;

    // allocate memory for data on the device
    cudaMalloc((void**)&dev_data, n*sizeof(uint32_t));

    // for timing
    struct timeval t1, t2;

    // two streams
    cudaStream_t stream1;
    cudaStream_t stream2;

    // create streams
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    for(int i = 0; i < 10; i++) {

        // Copy data into dev_data on stream 1 (nothing happening on stream 2 for now)
        gettimeofday(&t1, NULL);
        cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        double t_copy = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;

        // Sort_by_key on stream 2 (nothing happening on stream 1 for now)
        gettimeofday(&t1, NULL);
        thrust::sort_by_key(thrust::cuda::par.on(stream2), dev_key_ptr, dev_key_ptr + n, dev_val_ptr);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        double t_sort = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;

        // Overlap both operations
        gettimeofday(&t1, NULL);
        thrust::sort_by_key(thrust::cuda::par.on(stream2), dev_key_ptr, dev_key_ptr + n, dev_val_ptr);
        cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        double t_both = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;

        std::cout << "t_copy: " << t_copy << ", t_sort: " << t_sort << ", t_both: " << t_both << std::endl;

    }

    // clean up
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaFreeHost(data);
    cudaFree(dev_data);
    cudaFree(dev_key);
    cudaFree(dev_val);
    delete [] key;
    delete [] val;

}

Here are the results obtained when running on a GTX 1080 Ti, compiled with the CUDA toolkit (V9.0.176):

t_copy: 99.972,  t_sort: 215.597, t_both: 393.861
t_copy: 100.769, t_sort: 225.234, t_both: 394.839
t_copy: 100.489, t_sort: 221.44, t_both: 397.042
t_copy: 100.047, t_sort: 214.231, t_both: 403.371
t_copy: 100.167, t_sort: 222.031, t_both: 393.143
t_copy: 100.255, t_sort: 209.191, t_both: 374.633
t_copy: 100.179, t_sort: 208.452, t_both: 374.122
t_copy: 100.038, t_sort: 208.39, t_both: 375.454
t_copy: 100.072, t_sort: 208.468, t_both: 376.02
t_copy: 100.069, t_sort: 208.426, t_both: 377.759

Furthermore, profiling with nvprof shows that the operations are indeed executed in two separate, non-default streams.
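
(The exact profiler invocation is not shown here; a per-kernel GPU trace along the following lines, with the executable name only as a placeholder, is one way to see which stream each copy and sort kernel runs in.)

$ nvprof --print-gpu-trace ./overlap_test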

I would be grateful if anyone could reproduce the issue or suggest a fix.

Best Answer

The thrust sort operations do memory allocation "under the hood". This should be discoverable using nvprof --print-api-trace ...; you should see a cudaMalloc operation associated with each sort. This device memory allocation is synchronizing, and it is probably what prevents the expected overlap. If you want to work around it, you could explore using a thrust custom allocator.
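
The essential change is in the sort call itself. A minimal sketch of the pattern, using the cached_allocator class defined in the full example below:

// Sketch: hand a caching allocator to the execution policy so that thrust's
// temporary storage is served from the cache instead of a synchronizing
// cudaMalloc on every sort.
cached_allocator alloc;
thrust::sort_by_key(thrust::cuda::par(alloc).on(stream2),
                    dev_key_ptr, dev_key_ptr + n, dev_val_ptr);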

Here is a worked example, borrowing heavily from the link above:

$ cat t44.cu
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <random>
#include <iostream>
#include <sys/time.h>
#include <thrust/system/cuda/vector.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/pair.h>
#include <cstdlib>
#include <iostream>
#include <map>
#include <cassert>


// This example demonstrates how to intercept calls to get_temporary_buffer
// and return_temporary_buffer to control how Thrust allocates temporary storage
// during algorithms such as thrust::sort. The idea will be to create a simple
// cache of allocations to search when temporary storage is requested. If a hit
// is found in the cache, we quickly return the cached allocation instead of
// resorting to the more expensive thrust::cuda::malloc.
//
// Note: this implementation cached_allocator is not thread-safe. If multiple
// (host) threads use the same cached_allocator then they should gain exclusive
// access to the allocator before accessing its methods.


// cached_allocator: a simple allocator for caching allocation requests
class cached_allocator
{
public:
    // just allocate bytes
    typedef char value_type;

    cached_allocator() {}

    ~cached_allocator()
    {
        // free all allocations when cached_allocator goes out of scope
        free_all();
    }

    char *allocate(std::ptrdiff_t num_bytes)
    {
        char *result = 0;

        // search the cache for a free block
        free_blocks_type::iterator free_block = free_blocks.find(num_bytes);

        if(free_block != free_blocks.end())
        {
            std::cout << "cached_allocator::allocator(): found a hit" << std::endl;

            // get the pointer
            result = free_block->second;

            // erase from the free_blocks map
            free_blocks.erase(free_block);
        }
        else
        {
            // no allocation of the right size exists
            // create a new one with cuda::malloc
            // throw if cuda::malloc can't satisfy the request
            try
            {
                std::cout << "cached_allocator::allocator(): no free block found; calling cuda::malloc" << std::endl;

                // allocate memory and convert cuda::pointer to raw pointer
                result = thrust::cuda::malloc<char>(num_bytes).get();
            }
            catch(std::runtime_error &e)
            {
                throw;
            }
        }

        // insert the allocated pointer into the allocated_blocks map
        allocated_blocks.insert(std::make_pair(result, num_bytes));

        return result;
    }

    void deallocate(char *ptr, size_t n)
    {
        // erase the allocated block from the allocated blocks map
        allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
        std::ptrdiff_t num_bytes = iter->second;
        allocated_blocks.erase(iter);

        // insert the block into the free blocks map
        free_blocks.insert(std::make_pair(num_bytes, ptr));
    }

private:
    typedef std::multimap<std::ptrdiff_t, char*> free_blocks_type;
    typedef std::map<char *, std::ptrdiff_t> allocated_blocks_type;

    free_blocks_type free_blocks;
    allocated_blocks_type allocated_blocks;

    void free_all()
    {
        std::cout << "cached_allocator::free_all(): cleaning up after ourselves..." << std::endl;

        // deallocate all outstanding blocks in both lists
        for(free_blocks_type::iterator i = free_blocks.begin();
            i != free_blocks.end();
            ++i)
        {
            // transform the pointer to cuda::pointer before calling cuda::free
            thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
        }

        for(allocated_blocks_type::iterator i = allocated_blocks.begin();
            i != allocated_blocks.end();
            ++i)
        {
            // transform the pointer to cuda::pointer before calling cuda::free
            thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
        }
    }

};




int main() {
    cached_allocator alloc;
    // size of arrays
    const int n = 300000000;

    // random number generator
    std::mt19937 rng;

    // key/val on host
    uint32_t * key = new uint32_t[n];
    uint32_t * val = new uint32_t[n];

    // fill key val
    for(int i = 0; i < n; i++) {
        key[i] = rng();
        val[i] = i;
    }

    // key/val on device
    uint32_t * dev_key;
    uint32_t * dev_val;

    // allocate memory on GPU for key/val
    cudaMalloc((void**)&dev_key, n*sizeof(uint32_t));
    cudaMalloc((void**)&dev_val, n*sizeof(uint32_t));

    // copy key/val onto the device
    cudaMemcpy(dev_key, key, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_val, val, n*sizeof(uint32_t), cudaMemcpyHostToDevice);

    // get thrust device pointers to key/val on device
    thrust::device_ptr<uint32_t> dev_key_ptr = thrust::device_pointer_cast(dev_key);
    thrust::device_ptr<uint32_t> dev_val_ptr = thrust::device_pointer_cast(dev_val);

    // data on host
    uint32_t * data;

    // allocate pinned memory for data on host
    cudaMallocHost((void**)&data, n*sizeof(uint32_t));

    // fill data with random numbers
    for(int i = 0; i < n; i++) {
        data[i] = rng();
    }

    // data on device
    uint32_t * dev_data;

    // allocate memory for data on the device
    cudaMalloc((void**)&dev_data, n*sizeof(uint32_t));

    // for timing
    struct timeval t1, t2;

    // two streams
    cudaStream_t stream1;
    cudaStream_t stream2;

    // create streams
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    for(int i = 0; i < 10; i++) {

        // Copy data into dev_data on stream 1 (nothing happening on stream 2 for now)
        gettimeofday(&t1, NULL);
        cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        double t_copy = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;

        // Sort_by_key on stream 2 (nothing happening on stream 1 for now)
        gettimeofday(&t1, NULL);
        thrust::sort_by_key(thrust::cuda::par(alloc).on(stream2), dev_key_ptr, dev_key_ptr + n, dev_val_ptr);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        double t_sort = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;

        // Overlap both operations
        gettimeofday(&t1, NULL);
        thrust::sort_by_key(thrust::cuda::par(alloc).on(stream2), dev_key_ptr, dev_key_ptr + n, dev_val_ptr);
        cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        double t_both = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;

        std::cout << "t_copy: " << t_copy << ", t_sort: " << t_sort << ", t_both1: " << t_both << std::endl;

    }

    // clean up
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaFreeHost(data);
    cudaFree(dev_data);
    cudaFree(dev_key);
    cudaFree(dev_val);
    delete [] key;
    delete [] val;

}
$ nvcc -arch=sm_60 -std=c++11 -o t44 t44.cu
$ ./t44
cached_allocator::allocator(): no free block found; calling cuda::malloc
cached_allocator::allocator(): found a hit
t_copy: 100.329, t_sort: 110.122, t_both1: 109.585
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.441, t_sort: 106.454, t_both1: 109.692
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.595, t_sort: 106.507, t_both1: 109.436
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.35, t_sort: 106.463, t_both1: 109.517
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.486, t_sort: 106.473, t_both1: 109.6
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.324, t_sort: 106.385, t_both1: 109.551
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.4, t_sort: 106.549, t_both1: 109.692
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.521, t_sort: 106.445, t_both1: 109.719
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.362, t_sort: 106.413, t_both1: 109.762
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.349, t_sort: 106.37, t_both1: 109.52
cached_allocator::free_all(): cleaning up after ourselves...
$

CentOS 7.4, CUDA 9.1, Tesla P100

Regarding c++ - cuda9 + thrust sort_by_key overlapped with H2D copy (using streams), we found a similar question on Stack Overflow: https://stackoverflow.com/questions/48670284/
