gpt4 book ai didi

c++ - 我可以从 CUDA 内核函数调用 __device__ 函数吗?

转载 作者:行者123 更新时间:2023-11-28 02:40:53 25 4
gpt4 key购买 nike

我正在尝试从 CUDA 内核函数调用两个设备函数:

编辑:为了避免混淆函数定义与内核定义在不同的文件中,我提供了完整的代码:

Complete code:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include<iostream>
#include<fstream>
#include<string>
#include<iterator>
using namespace std;


#define POLYNOMIAL 0x04C11DB7L //Standard CRC-32 polynomial
#define M 62352 //Number of bits in the bloom filter
#define K 4 //Number of bits set per mapping in filter

typedef unsigned short int word16;
typedef unsigned int word32;

__device__ static word32 CrcTable[256]; //Table of 8-bit CRC32 remainders
__device__ char BFilter[M / 8]; //Bloom filter array of M/8 bytes
word32 NumBytes; //Number of bytes in Bloom filter

void gen_crc_table(void);
__device__ word32 update_crc(word32 crc_accum, const char *data_ptr, word32 data_size);
__device__ void mapBloom(word32 hash);
__device__ word32 crc32;
__device__ int retCode;

__global__ void mapBloomKernel(const char* d_wordList, int* sizeOfWords)
{
//access thread id
const unsigned int bid = blockIdx.x;
const unsigned int tid = threadIdx.x;
const unsigned int index = bid * blockDim.x + tid;

const char *current_word = &(*(d_wordList+(index*30)));
for(int i=0; i<K; i++)
{
crc32 = update_crc(i, d_wordList+(index*30), sizeOfWords[index]);
mapBloom(crc32);
}

}

/*
Main Function
*/

int main()
{
FILE *fp1;
FILE *fp2;
word32 i;

cout<<"-----------------------------------------------"<<endl;
cout<<"-- Program to implement a general Bloom filter --\n";
cout<<"-----------------------------------------------"<<endl;

//Determine number of bytes in Bloom Filter
NumBytes = M/8;
if((M%8)!=0)
{
cout<<"*** ERROR - M value must be dibisible by 8 \n";
exit(1);
}

//Initialize the CRC32 table
gen_crc_table();

//Clear the Bloom filter
for(i = 0; i<NumBytes; i++)
{
BFilter[i] = 0x00;
}

fp1 = fopen("word_list_10000.txt","r");
if(fp1 == NULL)
{
cout<<"ERROR in opening input file #1 ***\n";
exit(1);
}

fp2 = fopen("bloom_query.txt","r");
if(fp2 == NULL)
{
cout<<"ERROR in opening input file #2 ***\n";
exit(1);
}

//determine the number of words in list:

std::ifstream f("word_list_10000.txt");
std::istream_iterator<std::string> beg(f), end;
int number_of_words = distance(beg,end);

cout<<"Number of words in file: "<<number_of_words<<endl;
cout<<"size of char: "<<sizeof(char)<<endl;

cout<<"Reading to array!: "<<endl;
ifstream file("word_list_10000.txt");

const int text_length = 30;

char *wordList = new char[10000 * text_length];
int *sizeOfWords = new int[10000];

for(int i=0; i<number_of_words; i++)
{
file>>wordList + (i*text_length);
sizeOfWords[i] = strlen(wordList + (i*text_length));
cout<<wordList + (i*text_length)<<endl;
}

char *dev_wordList;
char *dev_sizeOfWords;

cudaMalloc((void**)&dev_wordList, 30*number_of_words*sizeof(char));
cudaMalloc((void**)&dev_sizeOfWords, number_of_words * sizeof(char));
cudaMemcpy(dev_wordList, wordList, 30 * number_of_words * sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_sizeOfWords, sizeOfWords, number_of_words * sizeof(char), cudaMemcpyHostToDevice);


unsigned int crc_size = sizeof(word32) * 256;
unsigned int bfilter_size = sizeof(char) * M/8;

static word32* d_CrcTable;
char* d_BFilter;

cudaMalloc((void**)&d_CrcTable, crc_size);
cudaMalloc((void**)&d_BFilter, bfilter_size);

//copy host arrays CrcTable & BFilter to device memory

cudaMemcpy(d_CrcTable, CrcTable, crc_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_BFilter, BFilter, bfilter_size, cudaMemcpyHostToDevice);

//Setup execution parameters
int n_blocks = (number_of_words + 255)/256;
int threads_per_block = 256;

dim3 grid(n_blocks, 1, 1);
dim3 threads(threads_per_block, 1, 1);

mapBloomKernel<<<grid, threads>>>(dev_wordList, sizeOfWords);

fclose(fp1);

//Output results header
cout<<"----------------------------------------------------------\n";
cout<<"Matching strings are... \n";

/*


...
...
...

*/

fclose(fp2);
}



/*
* Function to initialize CRC32 table
*/

void gen_crc_table(void)
{
register word32 crc_accum;
register word16 i, j;
//Initialize the CRC32 8-bit look-up table
for(i=0; i<256; i++)
{
crc_accum = ((word32) i<<24);
for(j=0; j<8; j++)
{
if(crc_accum & 0x80000000L)
crc_accum = (crc_accum << 1) ^POLYNOMIAL;
else
crc_accum = (crc_accum << 1);
}
CrcTable[i] = crc_accum;
//cout<<CrcTable[i]<<endl;
}
}

/*
* Function to generate CRC32
*/

__device__ word32 update_crc(word32 crc_accum, char *data_blk_ptr, word32 data_blk_size)
{
register word32 i, j;
for(j=0; j<data_blk_size; j++)
{
i = ((int) (crc_accum >>24) ^ *data_blk_ptr++) & 0xFF;
crc_accum = (crc_accum << 8) ^ CrcTable[i];
}
crc_accum = ~crc_accum;

return crc_accum;
}

/*
* Function to map hash into Bloom filter
*/

__device__ void mapBloom(word32 hash)
{
int tempInt;
int bitNum;
int byteNum;
unsigned char mapBit;
tempInt = hash % M;
byteNum = tempInt / 8;
bitNum = tempInt % 8;

mapBit = 0x80;
mapBit = mapBit >> bitNum;

//Map the bit into Bloom filter
BFilter[byteNum] = BFilter[byteNum] | mapBit;
}

/*
* Function to test for a Bloom filter match
*/

__device__ int testBloom(word32 hash)
{
int tempInt;
int bitNum;
int byteNum;
unsigned char testBit;
int retCode;
tempInt = hash % M;
byteNum = tempInt / 8;
bitNum = tempInt % 8;

testBit = 0x80;
testBit = testBit >> bitNum;
if (BFilter[byteNum] & testBit)
retCode = 1;
else
retCode = 0;
return retCode;
}

用于编译的命令行:

/OUT:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.exe" /INCREMENTAL:NO
/NOLOGO /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\Win32" "cudart.lib"
"kernel32.lib" "user32.lib" "gdi32.lib" "winspool.lib" "comdlg32.lib" "advapi32.lib" "shell32.lib"
"ole32.lib" "oleaut32.lib" "uuid.lib" "odbc32.lib" "odbccp32.lib" /MANIFEST
/ManifestFile:"Debug\CUDA_Bloom_filter_v0.exe.intermediate.manifest" /ALLOWISOLATION
/MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG
/PDB:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.pdb"
/SUBSYSTEM:CONSOLE
/PGD:"E:\Dropbox\Dropbox\Work\CUDA_Bloom_filter_v0.2\Debug\CUDA_Bloom_filter_v0.pgd" /TLBID:1
/DYNAMICBASE /NXCOMPAT /MACHINE:X86 /ERRORREPORT:QUEUE

完整输出:

7   IntelliSense: expected an expression e:\...\kernel.cu   145 18  CUDA_Bloom_filter_v0
Error 6 error MSB3721: The command ""C:\Program Files\NVIDIA GPU Computing
Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --
cl-version 2010 -ccbin "F:\Installed\Microsoft Visual Studio 2010\VC\bin" -I"C:\Program Files\NVIDIA
GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing
Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart
static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd "
-o Debug\kernel.cu.obj
"E:\...\kernel.cu"" exited with code 2. C:\...\CUDA 5.5.targets 592 10 CUDA_Bloom_filter_v0
Error 5 error : **External calls are not supported** (found non-inlined call to _Z10update_crcjPKcj) E:\...\kernel.cu 40 1 CUDA_Bloom_filter_v0

最佳答案

编译器感到困惑,因为你的函数原型(prototype)(前向声明)看起来像这样:

__device__ word32 update_crc(word32 crc_accum, const char *data_ptr, word32 data_size);

但是你的定义是这样的:

__device__ word32 update_crc(word32 crc_accum, char *data_blk_ptr, word32 data_blk_size)
{

您的函数定义要求第二个参数的类型为 char *。但是您传递的是 const char * 参数(并且您的前向声明是 const char * 类型)。

这是一个基本的 C/C++ 编码错误。

您的前向声明应符合您的定义。由于它没有,编译器会在别处寻找匹配的函数,但找不到。

此问题的解决方法是使您的函数定义匹配:

                                               add const here
v
__device__ word32 update_crc(word32 crc_accum, const char *data_blk_ptr, word32 data_blk_size)
{

请注意,当我使用此修复程序编译您的代码时,仍然有一些非常重要的警告:

t573.cu(73): warning: a __device__ variable "BFilter" cannot be directly written in a host function

t573.cu(185): warning: a __device__ variable "CrcTable" cannot be directly written in a host function

这些不应该被忽视。例如,第一个警告,你有这个变量:

__device__ char BFilter[M / 8];         //Bloom filter array of M/8 bytes

你不能在你的主机代码中直接写这个变量(在 main 中):

//Clear the Bloom filter
for(i = 0; i<NumBytes; i++)
{
BFilter[i] = 0x00;
}

而是使用类似 cudaMemcpyToSymbol() 的函数

关于c++ - 我可以从 CUDA 内核函数调用 __device__ 函数吗?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/26038651/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com