gpt4 book ai didi

search - ArrayFire帧搜索算法崩溃

转载 作者:行者123 更新时间:2023-12-02 08:21:03 24 4
gpt4 key购买 nike

我对 ArrayFire 和 CUDA 开发总体来说是个新手,在使用 Thrust 惨遭失败后,我几天前才开始使用 ArrayFire。我正在构建一个基于 ArrayFire 的算法,该算法应该在存储在设备内存中的数十万个 32x32 帧的数据库中搜索单个 32x32 像素帧。首先,我初始化一个矩阵,该矩阵具有 1024 + 1 像素作为行(我需要一个额外的像素来保留帧组 ID)和预定义数量(本例为 1000)的帧,按列索引。

下面是执行搜索的函数。如果我取消注释“pixels_uint32 = device_frame_ptr[pixel_group_idx];”这一行,程序就会崩溃。该指针似乎是有效的,所以我不明白为什么会发生这种情况。也许以这种方式访问设备内存有一些我不了解的限制?

#include <iostream>
#include <stdio.h>
#include <sys/types.h>
#include <arrayfire.h>

#include "utils.h"

using namespace af;
using namespace std;

/////////////////////////// CUDA settings ////////////////////////////////
// Debug switch: when true, main() fills the database with a known all-green
// pattern (instead of random data) and the arrays are printed.
#define TEST_DEBUG false
// Number of frames (columns) stored in the database matrix.
#define MAX_NUMBER_OF_FRAMES 1000 // maximum (2499999 frames) X (1024 + 1 pixels per frame) x (2 bytes per pixel) = 5.124.997.950 bytes (~ 5GB)
// Number of pixels in one frame fingerprint.
#define BLOB_FINGERPRINT_SIZE 1024 //32x32

// Absolute number of pixels (out of BLOB_FINGERPRINT_SIZE) that a frame's match
// counter must exceed (strict '>') for the frame to count as found.
// 768 corresponds to 75% of the 1024 pixels.
#define MACROBLOCK_COMPARISON_OVERALL_THRESHOLD 768 //1024 * 0.75
//////////////////////// End of CUDA settings ////////////////////////////

// Compares every frame (column) of the database against a hard-coded searched
// pixel colour and reports the outcome per frame.
//
// d_db_vec: database matrix; main() builds it with BLOB_FINGERPRINT_SIZE / 2
//           u32 rows of packed pixel pairs plus one extra row holding the blob
//           id, and MAX_NUMBER_OF_FRAMES columns.
// returns:  a 1 x MAX_NUMBER_OF_FRAMES u32 array with one entry per frame.
// throws:   any af::exception is printed to stderr and rethrown.
//
// NOTE: as written, the per-pixel comparison runs on the constant 0x7E007E0,
// not on data read from the frame; the read from the frame is commented out.
array search_frame(array d_db_vec)
{
try {
// each u32 element packs two 16-bit pixels, so a frame needs half as many elements as pixels
uint number_of_uint32_for_frame = BLOB_FINGERPRINT_SIZE / 2;

// result array: a single row with one u32 entry per frame, set to 0
array frame_found(1,MAX_NUMBER_OF_FRAMES, u32);
frame_found = 0;

gfor (array frame_idx, MAX_NUMBER_OF_FRAMES) {

// get the blob id: it is stored in the last row (index number_of_uint32_for_frame) of each frame column
array blob_id = d_db_vec(number_of_uint32_for_frame, frame_idx); // addressing with (pixel_idx, frame_idx)

// define some hardcoded pixel to search for
uint8_t searched_r = 0x0;
uint8_t searched_g = 0x3F;
uint8_t searched_b = 0x0;

// components of the pixel held in the upper 16 bits of the current u32
uint8_t b1 = 0;
uint8_t g1 = 0;
uint8_t r1 = 0;

// components of the pixel held in the lower 16 bits of the current u32
uint8_t b2 = 0;
uint8_t g2 = 0;
uint8_t r2 = 0;

// summed absolute per-channel differences for the first and second pixel
uint32_t sum1 = 0;
uint32_t sum2 = 0;

uint32_t *device_frame_ptr = NULL;
uint32_t pixels_uint32 = 0;

// number of pixels of this frame considered a match
uint pixel_match_counter = 0;

// select the whole column of the current frame and request a raw pointer to it
// NOTE(review): device<T>() presumably returns an address in GPU memory, which
// host code must not dereference, and it is called inside gfor - confirm
// against the ArrayFire documentation.
array frame = d_db_vec(span, frame_idx);
device_frame_ptr = frame.device<uint32_t>();

for (uint pixel_group_idx = 0; pixel_group_idx < number_of_uint32_for_frame; pixel_group_idx++) {
// test to see if the whole matrix is traversed
// d_db_vec(pixel_group_idx, frame_idx) = 0;

/////////////////////////////// PROBLEMATIC CODE ///////////////////////////////////
// placeholder value: two pixels whose middle 6-bit field is 0x3F and whose other fields are 0
pixels_uint32 = 0x7E007E0;
//pixels_uint32 = device_frame_ptr[pixel_group_idx]; //why does this crash the program?
// if I uncomment the above line the program tries to copy the u32 frame into the pixels_uint32 variable
// something goes wrong, since the pointer device_frame_ptr is not NULL and the elements should be there judging by the lines above
////////////////////////////////////////////////////////////////////////////////////

// splitting the first pixel (upper 16 bits) into its 5/6/5-bit components
b1 = (pixels_uint32 & 0xF8000000) >> 27; //(input & 11111000000000000000000000000000)
g1 = (pixels_uint32 & 0x07E00000) >> 21; //(input & 00000111111000000000000000000000)
r1 = (pixels_uint32 & 0x001F0000) >> 16; //(input & 00000000000111110000000000000000)

// splitting the second pixel (lower 16 bits) into its 5/6/5-bit components
b2 = (pixels_uint32 & 0xF800) >> 11; //(input & 00000000000000001111100000000000)
g2 = (pixels_uint32 & 0x07E0) >> 5; //(input & 00000000000000000000011111100000)
r2 = (pixels_uint32 & 0x001F); //(input & 00000000000000000000000000011111)

// distance of each pixel to the searched colour: sum of absolute per-channel differences
sum1 = abs(searched_r - r1) + abs(searched_g - g1) + abs(searched_b - b1);
sum2 = abs(searched_r - r2) + abs(searched_g - g2) + abs(searched_b - b2);

// a pixel counts as a match when its distance is at most 16; count both pixels of the pair
pixel_match_counter = (sum1 <= 16) ? pixel_match_counter + 1 : pixel_match_counter;
pixel_match_counter = (sum2 <= 16) ? pixel_match_counter + 1 : pixel_match_counter;
}

// the frame is considered found when strictly more than the threshold number of pixels matched
bool is_found = pixel_match_counter > MACROBLOCK_COMPARISON_OVERALL_THRESHOLD;
// write down the result for this frame: the existing entry (initialised to 0) is kept
// when is_found is true, otherwise the blob id is stored.
// NOTE(review): this looks inverted if the result is meant to list the ids of matching frames - confirm.
frame_found(0,frame_idx) = is_found ? frame_found(0,frame_idx) : blob_id;
}

// test to see if the whole matrix is traversed - this has to print zeroes
if (TEST_DEBUG)
print(d_db_vec);

// return the per-frame result array
return frame_found;

} catch (af::exception& e) {
// log the ArrayFire error and let the caller handle it
fprintf(stderr, "%s\n", e.what());
throw;
}
}

// make 2 green pixels
// Builds one packed 32-bit word containing two pure-green test pixels.
// Each 16-bit half is laid out as 5 bits / 6 bits (green) / 5 bits; only the
// 6-bit green field of each half is set, every other bit stays zero.
uint32_t make_test_pixel_group() {
    const uint32_t green_field_max = 0x3F;                  // all six green bits set
    const uint32_t upper_pixel_green = green_field_max << 21; // 0x07E00000
    const uint32_t lower_pixel_green = green_field_max << 5;  // 0x000007E0

    return upper_pixel_green | lower_pixel_green;
}

int main(int argc, char ** argv)
{
info();

/////////////////////////////////////// CREATE THE DATABASE ///////////////////////////////////////
uint number_of_uint32_for_frame = BLOB_FINGERPRINT_SIZE / 2;

array d_db_vec(number_of_uint32_for_frame + 1, // fingerprint size + 1 extra u32 for blob id
MAX_NUMBER_OF_FRAMES, // number of frames
u32); // type of elements is 32-bit unsigned integer (unsigned) with the configuration RGBRGB (565565)

if (TEST_DEBUG == true) {
for (uint frame_idx = 0; frame_idx < MAX_NUMBER_OF_FRAMES; frame_idx++) {
for (uint pix_idx = 0; pix_idx < number_of_uint32_for_frame; pix_idx++) {
d_db_vec(pix_idx, frame_idx) = make_test_pixel_group(); // fill everything with green :D
}
}
} else {
d_db_vec = rand(number_of_uint32_for_frame + 1, MAX_NUMBER_OF_FRAMES);
}

cout << "Setting blob ids. \n\n";
for (uint frame_idx = 0; frame_idx < MAX_NUMBER_OF_FRAMES; frame_idx++) {
// set the blob id to 123456
d_db_vec(number_of_uint32_for_frame, frame_idx) = 123456; // blob_id = 123456
}

if (TEST_DEBUG)
print(d_db_vec);

cout << "Done setting blob ids. \n\n";

//////////////////////////////////// CREATE THE SEARCHED FRAME ///////////////////////////////////

// to be done, for now we use the hardcoded values at line 37-39 to simulate the searched pixel:
//37 uint8_t searched_r = 0x0;
//38 uint8_t searched_g = 0x3F;
//39 uint8_t searched_b = 0x0;

///////////////////////////////////////////// SEARCH /////////////////////////////////////////////
clock_t timer = startTimer();
for (int i = 0; i< 1000; i++) {
array frame_found = search_frame(d_db_vec);

if (TEST_DEBUG)
print(frame_found);
}
stopTimer(timer);

return 0;
}

这是控制台输出,其中注释了该行:

arrayfire/examples/helloworld$ ./helloworld

ArrayFire v1.9.1(64位Linux,版本9af23ea)

许可证:服务器 (27000@server.acceleyes.com)

CUDA工具包5.0,驱动程序304.54

GPU0 Tesla C2075,5376 MB,计算 2.0

内存使用量:5312 MB 可用空间(总共 5376 MB)

设置 blob ID。

已完成 Blob ID 设置。

时间:0.03秒。

<hr/>

这是控制台输出,其中该行未注释:

arrayfire/examples/helloworld$ ./helloworld

ArrayFire v1.9.1(64位Linux,版本9af23ea)

许可证:服务器 (27000@server.acceleyes.com)

CUDA工具包5.0,驱动程序304.54

GPU0 Tesla C2075,5376 MB,计算 2.0

内存使用量:5312 MB 可用空间(总共 5376 MB)

设置 blob ID。

已完成 Blob ID 设置。

段错误

<hr/>

预先感谢您对此问题的任何帮助。我确实尝试了一切,但没有成功。

最佳答案

免责声明:我是 ArrayFire 的首席开发人员。我看到您也在 AccelerEyes 论坛上发了同样的帖子,但我在这里回答是为了指出您代码中的一些常见问题。

  1. 不要在 gfor 循环内使用 .device()、.host()、.scalar()。这将导致 GFOR 循环内部出现分歧,而 GFOR 并不是为此设计的。

  2. 您不能在主机端对设备指针进行索引。该指针指向 GPU 上的内存位置;当您执行 device_frame_ptr[pixel_group_idx]; 时,系统会在 CPU 内存中查找相同的地址。这就是您遇到段错误的原因。

  3. 使用矢量化代码。例如,gfor 内部的 for 循环是不需要的。与其在 for 循环中执行 b1 = (pixels_uint32 & 0xF8000000) >> 27; ,不如直接执行 array B1 = (frame & 0xF8000000) >> 27; 。也就是说,不要把数据传回 CPU 再用 for 循环处理,而是在 GPU 内完成整个操作。

  4. 不要在 GFOR 中使用 if-else 或三元运算符,它们同样会导致分歧。例如,可以改写为 pixel_match_counter = sum(sum1 <= 16) + sum(sum2 <= 16); 以及 found(0, found_idx) = is_found * found(0, found_idx) + (1 - is_found) * blob_id; 。

我已经回答了您面临的具体问题。如果您有任何后续问题,请关注我们的论坛和/或我们的支持电子邮件。 Stackoverflow 适合提出特定问题,但不适用于调试整个程序。

关于search - ArrayFire帧搜索算法崩溃,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/17217822/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com