OpenCL 遍历内核——进一步优化-6ren

OpenCL 遍历内核——进一步优化

转载作者：行者123 更新时间：2023-12-01 05:22:01

目前，我有一个 OpenCL 内核，用于如下遍历。如果有人对这个相当大的内核的优化有一些看法，我会很高兴。

问题是，我正在使用 SAH BVH 运行此代码，并且我想获得与 Timo Aila 在他的论文(Understanding the Efficiency of Ray Traversal on GPUs)中的遍历相似的性能，当然他的代码使用 SplitBVH(其中我可能会考虑使用 SAH BVH 来代替，但在我看来，它的构建时间真的很慢)。但我问的是遍历，而不是 BVH(而且我到目前为止只处理场景，其中 SplitBVH 不会给你带来比 SAH BVH 太多的优势)。

首先，这是我到目前为止所拥有的(标准的 while-while 遍历内核)。

__constant sampler_t sampler = CLK_FILTER_NEAREST;

// Inline definition of horizontal max
inline float max4(float a, float b, float c, float d)
{
    return max(max(max(a, b), c), d);
}

// Inline definition of horizontal min
inline float min4(float a, float b, float c, float d)
{
    return min(min(min(a, b), c), d);
}

// Traversal kernel
__kernel void traverse( __read_only image2d_t nodes,
                        __global const float4* triangles,
                        __global const float4* rays,
                        __global float4* result,
                        const int num,
                        const int w,
                        const int h)
{
    // Ray index
    int idx = get_global_id(0);

    if(idx < num)
    {
        // Stack
        int todo[32];
        int todoOffset = 0;

        // Current node
        int nodeNum = 0;

        float tmin = 0.0f;
        float depth = 2e30f;

        // Fetch ray origin, direction and compute invdirection
        float4 origin = rays[2 * idx + 0];
        float4 direction = rays[2 * idx + 1];
        float4 invdir = native_recip(direction);

        float4 temp = (float4)(0.0f, 0.0f, 0.0f, 1.0f);

        // Traversal loop
        while(true)
        {
            // Fetch node information
            int2 nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2) / w);
            int4 specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0));

            // While node isn't leaf
            while(specs.z == 0)
            {
                // Fetch child bounding boxes
                float4 n0xy = read_imagef(nodes, sampler, nodeCoord);
                float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0));
                float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0));

                // Test ray against child bounding boxes
                float oodx = origin.x * invdir.x;
                float oody = origin.y * invdir.y;
                float oodz = origin.z * invdir.z;
                float c0lox = n0xy.x * invdir.x - oodx;
                float c0hix = n0xy.y * invdir.x - oodx;
                float c0loy = n0xy.z * invdir.y - oody;
                float c0hiy = n0xy.w * invdir.y - oody;
                float c0loz = nz.x * invdir.z - oodz;
                float c0hiz = nz.y * invdir.z - oodz;
                float c1loz = nz.z * invdir.z - oodz;
                float c1hiz = nz.w * invdir.z - oodz;
                float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), tmin);
                float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), depth);
                float c1lox = n1xy.x * invdir.x - oodx;
                float c1hix = n1xy.y * invdir.x - oodx;
                float c1loy = n1xy.z * invdir.y - oody;
                float c1hiy = n1xy.w * invdir.y - oody;
                float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), tmin);
                float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), depth);

                bool traverseChild0 = (c0max >= c0min);
                bool traverseChild1 = (c1max >= c1min);

                nodeNum = specs.x;
                int nodeAbove = specs.y;

                // We hit just one out of 2 childs
                if(traverseChild0 != traverseChild1)
                {
                    if(traverseChild1)
                    {
                        nodeNum = nodeAbove;
                    }
                }
                // We hit either both or none
                else
                {
                    // If we hit none, pop node from stack (or exit traversal, if stack is empty)
                    if (!traverseChild0)
                    {
                        if(todoOffset == 0)
                        {
                            break;
                        }
                        nodeNum = todo[--todoOffset];
                    }
                    // If we hit both
                    else
                    {
                        // Sort them (so nearest goes 1st, further 2nd)
                        if(c1min < c0min)
                        {
                            unsigned int tmp = nodeNum;
                            nodeNum = nodeAbove;
                            nodeAbove = tmp;
                        }

                        // Push further on stack
                        todo[todoOffset++] = nodeAbove;
                    }
                }

                // Fetch next node information
                nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2) / w);
                specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0));
            }

            // If node is leaf & has some primitives
            if(specs.z > 0)
            {
                // Loop through primitives & perform intersection with them (Woop triangles)
                for(int i = specs.x; i < specs.y; i++)
                {
                    // Fetch first point from global memory
                    float4 v0 = triangles[i * 4 + 0];

                    float o_z = v0.w - origin.x * v0.x - origin.y * v0.y - origin.z * v0.z;
                    float i_z = 1.0f / (direction.x * v0.x + direction.y * v0.y + direction.z * v0.z);
                    float t = o_z * i_z;

                    if(t > 0.0f && t < depth)
                    {
                        // Fetch second point from global memory
                        float4 v1 = triangles[i * 4 + 1];

                        float o_x = v1.w + origin.x * v1.x + origin.y * v1.y + origin.z * v1.z;
                        float d_x = direction.x * v1.x + direction.y * v1.y + direction.z * v1.z;
                        float u = o_x + t * d_x;

                        if(u >= 0.0f && u <= 1.0f)
                        {
                            // Fetch third point from global memory
                            float4 v2 = triangles[i * 4 + 2];

                            float o_y = v2.w + origin.x * v2.x + origin.y * v2.y + origin.z * v2.z;
                            float d_y = direction.x * v2.x + direction.y * v2.y + direction.z * v2.z;
                            float v = o_y + t * d_y;

                            if(v >= 0.0f && u + v <= 1.0f)
                            {
                                // We got successful hit, store the information
                                depth = t;
                                temp.x = u;
                                temp.y = v;
                                temp.z = t;
                                temp.w = as_float(i);
                            }
                        }
                    }
                }
            }

            // Pop node from stack (if empty, finish traversal)
            if(todoOffset == 0)
            {
                break;
            }

            nodeNum = todo[--todoOffset];
        }

        // Store the ray traversal result in global memory
        result[idx] = temp;
    }
}

今天的第一个问题是，如何在 OpenCL 中编写他的 Persistent while-while 和 Speculative while-while 内核？

广告持续一段时间 ，我说得对吗，我实际上只是以与本地工作大小相等的全局工作大小启动内核，并且这两个数字都应该等于 GPU 的扭曲/波前大小？
我知道使用 CUDA 持久线程实现如下所示:

  do
  {
        volatile int& jobIndexBase = nextJobArray[threadIndex.y];

        if(threadIndex.x == 0)
        {
              jobIndexBase = atomicAdd(&warpCounter, WARP_SIZE);
        }

        index = jobIndexBase + threadIndex.x;

        if(index >= totalJobs)
              return;

        /* Perform work for task numbered 'index' */
  }
  while(true);

OpenCL 中的等价物如何看起来像，我知道我必须在那里做一些障碍，我也知道一个应该在我原子地将 WARP_SIZE 添加到 warpCounter 的分数之后。

广告投机遍历 - 好吧，我可能没有任何想法应该如何在 OpenCL 中实现，因此欢迎提供任何提示。我也不知道把障碍放在哪里(因为把它们放在模拟的 __any 周围会导致驱动程序崩溃)。

如果您在这里成功，感谢您的阅读，欢迎您提供任何提示、答案等!

最佳答案

您可以做的优化是使用向量变量和融合乘加函数来加速设置数学。至于内核的其余部分，它很慢，因为它是分支的。如果您可以对信号数据进行假设，则可以通过减少代码分支来减少执行时间。我还没有检查 float4 swizles(浮点 4 变量之后的 .xxyy 和 .x .y .z .w)，所以只需检查一下。

            float4 n0xy = read_imagef(nodes, sampler, nodeCoord);
            float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0));
            float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0));

            float4 oodf4 = -origin * invdir;

            float4 c0xyf4 = fma(n0xy,invdir.xxyy,oodf4);

            float4 c0zc1z = fma(nz,(float4)(invdir.z),oodf4);

            float c0min = max4(min(c0xyf4.x, c0xyf4.y), min(c0xyf4.z, c0xyf4.w), min(c0zc1z.z, c0zc1z.w), tmin);
            float c0max = min4(max(c0xyf4.x, c0xyf4.y), max(c0xyf4.z, c0xyf4.w), max(c0zc1z.z, c0zc1z.w), depth);

            float4 c1xy = fma(n1xy,invdir.xxyy,oodf4);

            float c1min = max4(min(c1xy.x, c1xy.y), min(c1xy.z, c1xy.w), min(c0zc1z.z, c0zc1z.w), tmin);
            float c1max = min4(max(c1xy.x, c1xy.y), max(c1xy.z, c1xy.w), max(c0zc1z.z, c0zc1z.w), depth);

关于OpenCL 遍历内核——进一步优化，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/15933974/

文章推荐： linux - dup2 真的可以返回 EINTR 吗？

文章推荐： Java:标记/标记文件

文章推荐： java - 在另一个类中使用对象特定变量

文章推荐： javascript - 附加到 DOM 时 XML 属性变为小写

javascript - 进一步/递归对象解构
我有这个: const {ops} = getOplogStreamInterpreter(strm); ops.del.subscribe(v => { console.log('delete
go - 使用 golang 时是否建议(进一步)限制表单的大小？
我四处搜索，据我所知，POST 表单请求已被限制为 10MB (http://golang.org/src/net/http/request.go#L721)。如果我要在我的 ServeHTTP 方

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

OpenCL 遍历内核——进一步优化