gpt4 book ai didi

c - 使用 cuda Throw::max_element 查找数组中的最大元素有时返回不正确

转载 作者:行者123 更新时间:2023-11-30 19:18:02 25 4
gpt4 key购买 nike

我有一个 2^20 元素数组正在设备上填充;这些数字每次都应该相同。然后,我将该数组移至主机,然后搜索数组中的最大元素,此技术适用于 2^10 元素数组,但一旦我开始获得大于该值的值,我开始得到随机答案,不确定推力是否为弄乱或设备计算。

max_element 应该返回的答案是 0.094479,通常程序第一次运行时代码会输出正确的答案,然后答案会每隔几次随机显示

GPU是运行5.0的tesla k20,也在780GTX上进行了测试;两次都是同样的问题

//Host Code
int main( void ) {
float h_c[TOTAL];
float *d_c;

cudaMalloc((void**)&d_c, sizeof(float)*TOTAL);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);

//Number of threads
kernel<<<BLOCKS,THREADS>>>(d_c);

cudaEventRecord(stop);
cudaEventSynchronize(stop);
float mil = 0;
cudaEventElapsedTime(&mil, start, stop);

cudaMemcpy(h_c, d_c, sizeof(float)*TOTAL, cudaMemcpyDeviceToHost);

for(int y = 0; y < TOTAL; y++){
printf(" %d: Host C: %f \n",y, h_c[y]);
}
float *result = thrust::max_element(h_c, h_c + TOTAL);
printf("Max is: %f \n", *result);
printf("Time: %f \n", mil/1000);
printf("THREADS: %d \n", THREADS);
printf("BLOCKS: %d \n", BLOCKS);
printf("TOTAL: %d \n", TOTAL);
cudaFree(d_c);
cudaDeviceReset() ;
return 0;
}

设备代码

#include <thrust/extrema.h>
#include <math.h>
#include <stdio.h>

#define ARRAYSIZE 15
#define THREADS 1024
#define BLOCKS 32
#define TOTAL THREADS * BLOCKS

__global__ void kernel(float *cc){

//Get thread for summing all elements
int threadId = threadIdx.x + blockDim.x * blockIdx.x;

int decimalNumber,quotient;
//Size of the array
//const int size = 10;
//Holds the binary number in an array
int binaryNumber[ARRAYSIZE];
int i = 0;


int a[ARRAYSIZE] = {1192, 1315, 1462, 1484, 1476, 1443, 1508, 1489, 1470, 1573, 1633, 1539, 1600, 1707, 1701};//, 1682, 1688, 1681, 1694, 1728};
int b[ARRAYSIZE] = {1162, 1337, 1282, 1491, 1508, 1517, 1488, 1513, 1539, 1576, 1626 ,1634, 1573, 1786, 1741};//, 1782, 1755, 1669, 1700, 1826};

//Holds Product from the dot product
int c[ARRAYSIZE];
//Arrays to hold integers to be summed
int aSumArr[ARRAYSIZE];
int bSumArr[ARRAYSIZE];

for(int i = 0; i < ARRAYSIZE; i++){
c[i] = 0;
aSumArr[i] = 0;
bSumArr[i] = 0;
}

//Holds the value for the dot product
int dotSum = 0;
//Holds sum of valid array positions for array a
int aSum = 0;
//Holds sum of valid array positions for array b
int bSum = 0;

//Holds the Value of the arcCos of the dot product / sqrt(array a) * sqrt(array b)
float finalValue = 0;
//printf("ThreadID: %d \n", threadId);
//ALL 1's 1048575 = Threads
decimalNumber = threadId;
//printf("decimal number: %d \n", decimalNumber);

quotient = decimalNumber;
//Loop to convert decimal into binary and store in array
while(quotient!=0){

binaryNumber[i++]= quotient % 2;

quotient = quotient / 2;

}

//Test if conversion from decimal to binary is complete and correct
//printf("Equivalent binary value of decimal number %d: \n",decimalNumber);

//for(int in = size-1; in >= 0;in--){
//printf("Index: %d | binary number: %d ---- a:%d || b: %d\n",in,binaryNumber[in],a[in],b[in]);
//}
//printf(" \n ");

//Loop through binaryNumber array
for(int x = ARRAYSIZE-1 ; x >= 0; x--){
//If index is == 1 Perform calculation
if(binaryNumber[x] == 1){
//Multiply numbers at good index
c[x] = a[x] * b[x];
//Fill sum arrays at correct index
aSumArr[x] = a[x];
bSumArr[x] = b[x];

//Checks if the loop is executing correctly
//sumArray[x] = 1;
//printf("Multiplied - %d * %d = %f\n", a[x], b[x], c[x]);
//printf("--This should not be run --\n");
}else{
// printf("SKIPPED - %d * %d = %f\n", a[x], b[x], c[x]);
}


}

//Sums up the product array to complete dot product
for(int j = 0; j < ARRAYSIZE; ++j){
dotSum += c[j];
//printf("aSumArr %d \n", aSumArr[j]);
//printf("bSumArr %d \n", bSumArr[j]);
aSum += powf( aSumArr[j], 2 );
bSum += powf( bSumArr[j], 2 );
// printf("aSum: %d + aSumArr %d \n", aSum, aSumArr[j]);
// printf("bSum: %d + bSumArr %d \n", bSum, bSumArr[j]);
}


//printf("\n");
//Print out the dot prudct
//printf("Dot product is: %d \n", dotSum);
//printf("aSum is: %d \n", aSum);
//printf("bSum is: %d \n", bSum);

float sqSum1 = sqrtf(aSum);
float sqSum2 = sqrtf(bSum);
// printf("sqSum1: %f \n", sqSum1);
// printf("sqSum2: %f \n", sqSum2);
float sqSum = sqSum1 * sqSum2;
// printf("sqSum %f \n", sqSum);
float div = dotSum / sqSum ;
// printf("div: %f \n", div);
finalValue = acosf( div ) ;

//Stores the threads final value in array cc, in the respected index
if(finalValue == finalValue){
cc[threadId] = finalValue;
}else{
cc[threadId] = -2;
}
//printf("final value is: %f for number %d \n", finalValue, threadId);
}

最佳答案

这似乎是使用了不正确初始化/未初始化变量的情况。

添加以下行后:

for(int i = 0; i < ARRAYSIZE; i++){
c[i] = 0;
aSumArr[i] = 0;
bSumArr[i] = 0;
binaryNumber[i] = 0; // add this line
}

我无法再重现该问题。

关于c - 使用 cuda Throw::max_element 查找数组中的最大元素有时返回不正确,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/27238084/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com