c++ - 如何使用 thrust::lower_bound() 克服 VBO 索引计算期间的内存限制-6ren

c++ - 如何使用 thrust::lower_bound() 克服 VBO 索引计算期间的内存限制

转载作者：行者123 更新时间：2023-11-28 02:24:16

我使用 CUDA 在 GPU 上通过 Marching Cubes 算法生成网格。网格非常精细，原始(未去重)的顶点列表存储在 GPU 上的 VBO 中，该 VBO 映射到 CUDA 数组 float *d_vertexData。数据按顶点位置和法线交错排列，如下所示。

{v0x, v0y, v0z, n0x, n0y, n0z, v1x, v1y, v1z, n1x, n1y, n1z, ...}

网格的大小通常在 34MB(500K 三角形)~1400MB(20M 三角形)之间，并存储在 GPU 上。

然后我使用 thrust::sort() 和 thrust::unique() 去除重复的顶点，并使用 thrust::lower_bound() 计算索引。此步骤后，网格大小减少了 70% 或更多。下面的代码演示了这一步。

// Six example vertices, interleaved as {position xyz, normal xyz}.
// Vertices 0/3 and 1/4 are duplicates, so four unique vertices remain.
float exampleVerts[36]=
{ 1, 2, 3, 0, 1, 0, 4, 5, 6, 0, 1, 0, 7, 8, 9, 0, 1, 0, 1, 2, 3, 0, 1, 0,
4, 5, 6, 0, 1, 0, 10, 11, 12, 0, 1, 0};

unsigned int numVertices = 6;             // number of vertices, not floats
unsigned int data_size = numVertices * 6; //6 floats per vertex

// Allocate and fill the device buffer with exactly data_size floats so that
// every later range [vertsPtr, vertsPtr + data_size) stays inside the allocation.
cudaMalloc(reinterpret_cast<void**>(&d_vertexData), data_size*sizeof(float));
cudaMemcpy(d_vertexData, exampleVerts, data_size*sizeof(float), cudaMemcpyHostToDevice);

thrust::device_ptr<float> vertsPtr = thrust::device_pointer_cast(d_vertexData);

// Full copy of the unsorted vertices; this is the extra [VBO size] allocation
// that lower_bound() needs as its search keys.
thrust::device_vector<float> vertsCopy(vertsPtr, vertsPtr + data_size);
thrust::device_vector<unsigned int> indices(numVertices);

auto zip_vert_first = zip(...); // using vertsPtr and strided_range
auto zip_vert_last = zip(...); // using vertsPtr and strided_range

// sort the vertices in place, then drop adjacent duplicates
thrust::sort(zip_vert_first, zip_vert_last);
auto new_vert_last = thrust::unique(zip_vert_first, zip_vert_last);

auto zip_vertcopy_first = zip(...); //using vertsCopy.data() and strided_range
auto zip_vertcopy_last = zip(...); //using vertsCopy.data() and strided_range

//find index of each input vertex in the list of unique vertices
thrust::lower_bound(zip_vert_first, new_vert_last,
    zip_vertcopy_first, zip_vertcopy_last,
    indices.begin());

它可以工作，但需要相当大的内存。此行thrust::device_vector<float> vertsCopy(vertsPtr, vertsPtr + data_size);需要 [VBO 大小] 内存来存储用于 thrust::lower_bound() 的顶点拷贝.

在我的应用程序中，原始顶点列表对应的网格通常非常大，超过 1.5GB。此方法有以下限制。

It requires an additional 117% of the VBO size (100% for the copy of all vertices, 17% for the indices).

由于此限制，此方法无法在具有 2GB 或更低 VRAM 的 GPU 上运行。我正在使用具有 4GB VRAM 的 GPU，即使这样我的应用程序也很容易达到这个限制。

有没有其他方法可以在 GPU 上计算索引而不需要这么大的内存？否则我唯一的选择是返回 CPU(主机)，我认为这会非常慢。

最佳答案

如果您对索引而不是顶点数据本身进行操作，则可以避免顶点的拷贝。

以下示例(基于我的 answer to your previous question 和我的回答 here )执行以下步骤:

一次对顶点和索引进行排序
查找重复顶点的起始索引
根据这些起始索引删除重复的顶点
计算新索引

最终索引存储在 d_indices_2 中。

输出

d_vertices:     1   2   3   4   5   6   7   8   9   4   5   6   7   8   9   0   1   2   
d_indices:      0   1   2   3   4   5   
d_vertices:     0   1   2   1   2   3   4   5   6   4   5   6   7   8   9   7   8   9   
d_indices:      5   0   1   3   2   4   
d_indices_2:    0   1   2   0   4   0   
d_vertices:     0   1   2   1   2   3   4   5   6   7   8   9   
d_indices_3:    0   1   2   2   3   3   
d_indices_2:    1   2   3   2   3   0

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/copy.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <iostream>
#include <thrust/tuple.h>
#include <thrust/execution_policy.h>
#include <thrust/scatter.h>
#include <thrust/unique.h>
#include <thrust/remove.h>
#include <stdint.h>

// Convenience wrapper: packs any number of iterators into a tuple and returns
// the zip_iterator built from that tuple.
template<typename... Iterators>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
    auto bundled = thrust::make_tuple(its...);
    return thrust::make_zip_iterator(bundled);
}

template <typename Iterator, typename thrust::iterator_difference<Iterator>::type stride>
class strided_range
{
public:
    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    //template <difference_type stride>
    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        __host__ __device__
        difference_type operator()(const difference_type& i) const
        { 
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                           CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>             PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last)
        : first(first), last(last) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor()));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

protected:
    Iterator first;
    Iterator last;
};


// append_to_type_seq<T, Seq<Ts...>>::type is Seq<Ts..., T>.
// The primary template is intentionally empty; only the partial
// specialization for a variadic sequence template provides `type`.
template<typename, typename>
struct append_to_type_seq { };

template<typename Elem, typename... Existing, template<typename...> class Seq>
struct append_to_type_seq<Elem, Seq<Existing...>>
{
    typedef Seq<Existing..., Elem> type;
};

// repeat<T, N, TT>::type is TT<T, T, ..., T> with N copies of T.
// Recursive step: take the (N-1)-element sequence and append one more T;
// `type` is inherited from append_to_type_seq.
template<typename T, unsigned int N, template<typename...> class TT>
struct repeat
    : append_to_type_seq<T, typename repeat<T, N-1, TT>::type>
{
};

// Base case: zero repetitions give the empty sequence TT<>.
template<typename T, template<typename...> class TT>
struct repeat<T, 0, TT>
{
    typedef TT<> type;
};

// Maps std::tuple<T...> to thrust::tuple<T...>. Only declared for other
// types, so any non-std::tuple argument is a compile error.
template<typename Tuple>
struct std_to_thrust_tuple;

template<typename... Elements>
struct std_to_thrust_tuple<std::tuple<Elements...>>
{
    typedef thrust::tuple<Elements...> type;
};

template<typename IteratorType, std::size_t stride>
class zipped_strided_range
{
public:

    typedef typename strided_range<IteratorType, stride>::iterator SingleIterator;
    typedef typename repeat<SingleIterator, stride, std::tuple>::type StdIteratorTuple;
    typedef typename std_to_thrust_tuple<StdIteratorTuple>::type IteratorTuple;
    typedef decltype(thrust::make_zip_iterator(IteratorTuple())) ZipIterator;

    zipped_strided_range(IteratorType first, IteratorType last) : first(first), last(last)
    {
        assign<0>();
    }

    ZipIterator begin() const
    {
        return thrust::make_zip_iterator(begin_tuple);
    }

    ZipIterator end() const
    {
        return thrust::make_zip_iterator(end_tuple);
    }

protected:

    template <std::size_t index>
    void assign(typename std::enable_if< (index < stride) >::type* = 0)
    {
        strided_range<IteratorType,stride> strided_range_iterator(first+index, last-(stride-1)+index);

        thrust::get<index>(begin_tuple) = strided_range_iterator.begin();
        thrust::get<index>(end_tuple) = strided_range_iterator.end();
        assign<index+1>();
    }

    template <std::size_t index>
    void assign(typename std::enable_if< (index == stride) >::type* = 0)
    {
        // end recursion
    }

    IteratorType first;
    IteratorType last;

    IteratorTuple begin_tuple;
    IteratorTuple end_tuple;
};




// PRINTER(x) prints container x, labelled with its own variable name.
#define PRINTER(name) print(#name, (name))

// Writes "<name>:" followed by the tab-separated elements of `v` and a
// newline to std::cout. Elements are copied out via thrust::copy.
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::ostream_iterator<T> sink(std::cout, "\t");

    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), sink);
    std::cout << std::endl;
}


// Functor that marks duplicates in a sequence addressed through `first`.
//
// For index i it returns:
//   - 0 if i == 0, or if element i compares equal to element i-1
//     (i.e. it repeats its predecessor);
//   - i otherwise.
// Index 0 therefore yields 0 even though element 0 has no predecessor, so a
// caller that treats 0 as "duplicate" must handle element 0 separately.
//
// operator() is const: it does not modify the functor, matching the other
// functors in this file and allowing calls through a const object.
template <typename IteratorType, typename IndexType = uint32_t>
struct my_scatter : public thrust::unary_function<IndexType,IndexType>
{
    // `first` is the iterator to element 0 of the sequence being inspected.
    my_scatter(IteratorType first) : first(first)
    {
    }

   __host__ __device__
   IndexType operator()(const IndexType& i) const
   {
      const IndexType zero = static_cast<IndexType>(0);
      const IndexType one = static_cast<IndexType>(1);

      // Only indices > 0 have a predecessor to compare with.
      if (i > zero && *(first+i) == *(first+i-one))
      {
          return zero;
      }
      return i;
   }

   IteratorType first;
};

// Factory that deduces IteratorType, so callers can write
// make_my_scatter(it) instead of spelling out my_scatter<...>.
template <typename IteratorType>
my_scatter<IteratorType> make_my_scatter(IteratorType first)
{
  my_scatter<IteratorType> functor(first);
  return functor;
}

// Collapses a value to a truth flag: x is converted to bool and that bool is
// converted back to T on return.
template <typename T>
struct my_transformer : public thrust::unary_function<T,T>
{
  __host__ __device__
  T operator()(const T& x) const
  {
    const bool is_nonzero = static_cast<bool>(x);
    return is_nonzero;
  }
};


// Demo: removes duplicate vertices from a flat float array in place and builds
// an index buffer for them, without making a second copy of the vertex data.
//
// Steps:
//   1. sort the vertices together with their original positions,
//   2. flag every vertex that repeats its predecessor in sorted order,
//   3. compact the vertex array by removing the flagged vertices,
//   4. scatter the new (compacted) vertex numbers back to the original
//      positions; the result ends up in d_indices_2.
int main()
{
    using namespace thrust::placeholders;

    const int stride = 3;   // floats per vertex
    const int num = 6;      // number of input vertices

    const int size = stride * num;

    float values[size] = {1,2,3,
                          4,5,6,
                          7,8,9,
                          4,5,6,
                          7,8,9,
                          0,1,2
    };


    typedef uint32_t Integer;

    thrust::host_vector<float> h_vertices (values, values+size);
    thrust::device_vector<float> d_vertices = h_vertices;
    float* dev_ptr = thrust::raw_pointer_cast(d_vertices.data());

    // View the flat float array as a range of `num` 3-tuples.
    zipped_strided_range<float*, stride> zipped(dev_ptr, dev_ptr+size);

    // d_indices[i] starts out as i: the original position of each vertex.
    thrust::device_vector<Integer> d_indices(num);
    thrust::sequence(d_indices.begin(), d_indices.end());

    PRINTER(d_vertices);
    PRINTER(d_indices);

    // 1. sort vertices and their original positions together
    auto zip_begin = zip(zipped.begin(),d_indices.begin());
    auto zip_end   = zip(zipped.end(),d_indices.end());
    thrust::sort(thrust::device, zip_begin, zip_end);
    PRINTER(d_vertices);
    PRINTER(d_indices);

    thrust::device_vector<Integer> d_indices_2(num);

    // 2. find start indices of duplicate vertices:
    //    d_indices_2[i] == 0 for i == 0 and for every vertex equal to its
    //    predecessor, otherwise d_indices_2[i] == i
    auto my_scatter_op =  make_my_scatter(zipped.begin());
    thrust::transform(thrust::make_counting_iterator(static_cast<Integer>(0)),
                      thrust::make_counting_iterator(static_cast<Integer>(num)),
                      d_indices_2.begin(),
                      my_scatter_op);
    PRINTER(d_indices_2);

    // 3. remove duplicate vertices
    /*
    // unique could be used, but we already know which vertices we want
    auto new_end = thrust::unique(thrust::device, zipped.begin(), zipped.end());
    */
    // Vertex 0 is always kept, so compaction starts at vertex 1 with the stencil
    // shifted accordingly. The range has to stop at zipped.end(): going one
    // element further would read past the end of both d_vertices and d_indices_2.
    auto new_end = thrust::remove_if(thrust::device, zipped.begin()+1, zipped.end(), d_indices_2.begin()+1, !_1);
    int new_size = (new_end - zipped.begin());
    d_vertices.resize(stride*new_size);
    PRINTER(d_vertices);

    // d_indices_3[i] = position of sorted vertex i in the compacted array:
    // a running count of the "kept" flags, with d_indices_3[0] left at 0.
    thrust::device_vector<Integer> d_indices_3(num);
    auto transform_op = my_transformer<Integer>();
    auto t_b = thrust::make_transform_iterator(d_indices_2.begin()+1, transform_op);
    auto t_e = thrust::make_transform_iterator(d_indices_2.end(), transform_op);
    thrust::inclusive_scan(t_b, t_e, d_indices_3.begin()+1);
    PRINTER(d_indices_3);

    // 4. calculate final indices: d_indices_2[d_indices[i]] = d_indices_3[i],
    //    i.e. route each compacted index back to the vertex's original position
    thrust::scatter(d_indices_3.begin(), d_indices_3.end(), d_indices.begin(), d_indices_2.begin());
    PRINTER(d_indices_2);


    return 0;
}

关于c++ - 如何使用 thrust::lower_bound() 克服 VBO 索引计算期间的内存限制，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/31252441/

文章推荐： c++ - 使用声明的模板化参数实例化嵌套类模板

文章推荐： javascript - 在 iframe url 中传递变量

文章推荐： c++ - Mac 上 Qt Creator 中的 openCV

mysql - 计算另一个给定日期间隔(期间)内的日期间隔(期间)的天数
我有一个问题，但由于 this question 部分正在解决，但我想知道如何计算给定间隔之间的天数。这是一个计算员工休假天数的查询。所以给定(或不给定)一个日期范围，我想计算给定间隔之间有多少假期
java - 期间.仅减去天数
变量dateSubtract结果是 16，但我想找到这 2 天之间的总天数，应该是 165。没有 JODA TIME 我该如何做到这一点？ String date = "06/17/2014"; Da
java - 在java中将一个月计算为 "between"期间
我想选择创建日期介于给定月份的第一天和最后一天之间的记录。我通过以下方式计算开始日期和结束日期的月份: 日期“月份”只是时间范围内的随机日期 Calendar cal = Calendar.getIn
R:期间(月)的左侧移动平均线
我有一个对你们大多数人来说可能微不足道的问题。我尝试了很多，没有找到解决方案，所以如果有人能给我提示，我会很高兴。起点是每周 xts -时间序列。月周值(value)目标 2011 年 12 月 W
php - 获取一周前/期间/一周后的生日
我有一个 Facebook 应用程序，它将用户生日作为 varchar 存储在 mysql 数据库中。我正在尝试获取所有用户的生日 1周后推出，如果是在本周如果生日是上周。在我的 php 中，我获取
java - 年月日的 Joda 期间
我正在使用以下代码来获取年、月、日中的两个日期之间的差异 tenAppDTO.getTAP_PROPOSED_START_DATE()=2009-11-01 tenAppDTO.getTAP_PRO
C++ 时间(而不是日期)期间
我想检查当前时间(在 C++ 中)是否在一个时间范围内。我想从元组 ("12:00", "17:30") 构造时间范围，即 (string, string) 并检查时间 now() 是否介于两者之间
gitlab - 在推送到 GitLab 期间，如何在提交消息中要求票号？
gitlab 有一个功能，如果我在提交消息中放入票号，那么提交将与 gitlab.com 上的票相关联。这在进行代码审查时非常方便。不幸的是，开发人员有时会忘记这样做。我想指定 git hooks
java - execSQL() 期间 SQLite 数据库错误
我正在尝试制作使用SQLite数据库的简单注册/登录应用程序，到目前为止我得到了这段代码。这是我的“注册” Activity ，我猜它应该在按下注册按钮后将用户名和 pin(密码)实现到数据库，遗憾的
文件 I/O 期间 Java 扫描仪出现未知来源异常
我正在尝试打开、关闭和写入文件。每当我尝试打开一个文件时，如果我提供的路径中不存在该文件，程序就会告诉我。如果存在，程序将读取其中的内容并显示它。如果用户不想查找文件，可以选择创建文件并用数据填充它。
jquery - react 期间 meteor 触发动画
我想要我的至slideToggle每当发生 react 性变化时，但到目前为止我还无法使其发生。我尝试在 rendered 中使用 JQuery和created模板的事件，但它没有触发。触发此操作的
.net - Commit 期间 MySQL 超时
我们的 MySQL 遇到了神秘的网络问题。简单的更新查询(使用索引更新单行)通常会立即运行，然后有时(假设 1000 次中有 1 次)因超时而失败。与简单的插入查询相同。数据库没有过载。我们怀疑网络问
java - ConfigChanges 期间 ActionBar 的导航模式变化
我正在使用 actionbarsherlock 的 ActionBar，第一次以横向或水平方向运行应用程序时，选项卡以 Tabs Mode 显示。将方向更改为纵向后，导航模式仍在 Tabs 中。第二次
Mysql 服务器已消失 - 在 mysqldump 期间
每天晚上(太平洋标准时间晚上 8 点)我都会对生产数据库(innoDB 引擎)进行全局备份。这是 mysqldump 命令: mysqldump -u$MYSQLUSER -p$MYSQLPWD -
android - AsyncTask 期间 UI 卡住
当我的应用程序第一次启动时，它应该显示用户协议(protocol)，这是一个 59kb 的 txt 文件。由于读取文件并将其附加到 TextView 需要一些时间，因此我决定在异步任务中执行此操作并在
javascript - KeyPress 期间 Javascript 中仅允许一个 "."
如何只允许一个“.”在按键期间的javascript中？我这里有一个代码: function allowOneDot(txt) { if ((txt.value.split(".")
jquery - 在调整大小和选项卡 View 期间，图像点击功能不起作用
我已经创建了像主页和用户这样的标题图标。在桌面 View 中，如果我单击用户图像，它会显示相应的重定向页面。如果我在选项卡或移动 View 中将其最小化, 它什么都不显示。此问题仅发生在用户图像上，而
c++ - push_back() 期间 vector 迭代器不可取消引用错误
下面的代码在 Release模式下工作，并且仅在 Debug模式下在 g_ItemList.push_back() 引发错误，我浏览了一些 SO 帖子和论坛。有人提到 "You can't itera
c - mmap 期间 memcpy 重叠的段错误
我遇到了一个我似乎无法解决的 mmap 问题。下面是设置:我使用 malloc 将一个巨大的多维数组分配到内存中，用我的值填充它，然后我想将它保存在一个文件中。该数组包含 3200000000 个字节
c++ - dlopen 期间 undefined symbol
尝试加载共享库: handle = dlopen( "libaaa.so.2.5", RTLD_NOW ); if ( !handle ) { printf("Failed t

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c++ - 如何使用 thrust::lower_bound() 克服 VBO 索引计算期间的内存限制