
c++ - Why does the timing change drastically with the number of zeros in the input data?


I ran into this strange problem while debugging.

In my code, I can initialize the host array srcArr_h[totArrElm] in two ways:

1)

for(int ic=0; ic<totArrElm; ic++)
{
    srcArr_h[ic] = (float)(rand() % 256);
}

2) (half of the array elements will be set to zero at run time)

for(int ic=0; ic<totArrElm; ic++)
{
    int randV = (rand() % 256);
    srcArr_h[ic] = randV % 2;
}

If I use these arrays as input to a kernel function, I get drastically different timings. In particular, for totArrElm = ARRDIM*ARRDIM with ARRDIM = 8192, I get

Time 1) 64599.3 ms

Time 2) 9764.1 ms

What is the trick? Of course, I did verify that the host-side initialization itself does not account for the large time difference. It sounds strange to me, but could it be due to some optimization at run time?

Here is my code:

#include <string>
#include <stdint.h>
#include <stdlib.h> // for rand(), malloc()
#include <iostream>
#include <stdio.h>
using namespace std;

#define ARRDIM 8192

__global__ void gpuKernel
(
    float *sa, float *aux,
    size_t memPitchAux, int w,
    float *c_glob
)
{
    float c_loc[256];
    float sc_loc[256];

    float g0 = 0.0f;

    int tidx = blockIdx.x * blockDim.x + threadIdx.x; // x-coordinate of pixel = column in device memory
    int tidy = blockIdx.y * blockDim.y + threadIdx.y; // y-coordinate of pixel = row in device memory
    int idx  = tidy * memPitchAux/4 + tidx;

    for(int ic=0; ic<256; ic++)
    {
        c_loc[ic] = 0.0f;
    }

    for(int ic=0; ic<255; ic++)
    {
        sc_loc[ic] = 0.0f;
    }

    for(int is=0; is<255; is++)
    {
        int ic = fabs(sa[tidy*w + tidx]);
        c_loc[ic] += 1.0f;
    }

    for(int ic=0; ic<255; ic++)
    {
        g0 += c_loc[ic];
    }
    aux[idx] = g0;
}

int main(int argc, char* argv[])
{
    float time, loop_time;
    cudaEvent_t start, stop;
    cudaEvent_t start_loop, stop_loop;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    /*
     * array src host and device
     */
    int heightSrc = ARRDIM;
    int widthSrc  = ARRDIM;
    cudaSetDevice(0);

    float *srcArr_h, *srcArr_d;
    size_t nBytesSrcArr = sizeof(float)*heightSrc*widthSrc;

    srcArr_h = (float *)malloc(nBytesSrcArr);      // Allocate array on host
    cudaMalloc((void **) &srcArr_d, nBytesSrcArr); // Allocate array on device
    cudaMemset((void*)srcArr_d, 0, nBytesSrcArr);  // set to zero

    int totArrElm = heightSrc*widthSrc;

    cudaEventCreate(&start_loop);
    cudaEventCreate(&stop_loop);
    cudaEventRecord(start_loop, 0);

    for(int ic=0; ic<totArrElm; ic++)
    {
        srcArr_h[ic] = (float)(rand() % 256); // case 1)
        // int randV = (rand() % 256);        // case 2)
        // srcArr_h[ic] = randV % 2;
    }

    cudaEventRecord(stop_loop, 0);
    cudaEventSynchronize(stop_loop);
    cudaEventElapsedTime(&loop_time, start_loop, stop_loop);
    printf("Timing LOOP: %3.1f ms\n", loop_time);

    cudaMemcpy(srcArr_d, srcArr_h, nBytesSrcArr, cudaMemcpyHostToDevice);

    /*
     * auxiliary buffer auxD to save final results
     */
    float *auxD;
    size_t auxDPitch;
    cudaMallocPitch((void**)&auxD, &auxDPitch, widthSrc*sizeof(float), heightSrc);
    cudaMemset2D(auxD, auxDPitch, 0, widthSrc*sizeof(float), heightSrc);

    /*
     * auxiliary buffer auxH allocation + initialization on host
     */
    size_t auxHPitch;
    auxHPitch = widthSrc*sizeof(float);
    float *auxH = (float *) malloc(heightSrc*auxHPitch);

    /*
     * kernel launch specs
     */
    int thpb_x = 16;
    int thpb_y = 16;

    int blpg_x = (int) widthSrc/thpb_x + 1;
    int blpg_y = (int) heightSrc/thpb_y + 1;
    int num_threads = blpg_x * thpb_x + blpg_y * thpb_y;

    /* c_glob array */
    int cglob_w = 256;
    int cglob_h = num_threads;

    float *c_glob_d;
    size_t c_globDPitch;
    cudaMallocPitch((void**)&c_glob_d, &c_globDPitch, cglob_w*sizeof(float), cglob_h);
    cudaMemset2D(c_glob_d, c_globDPitch, 0, cglob_w*sizeof(float), cglob_h);

    /*
     * kernel launch
     */
    dim3 dimBlock(thpb_x, thpb_y, 1);
    dim3 dimGrid(blpg_x, blpg_y, 1);

    gpuKernel<<<dimGrid,dimBlock>>>(srcArr_d, auxD, auxDPitch, widthSrc, c_glob_d);

    cudaThreadSynchronize();

    cudaMemcpy2D(auxH, auxHPitch,      // to CPU (host)
                 auxD, auxDPitch,      // from GPU (device)
                 auxHPitch, heightSrc, // size of data (image)
                 cudaMemcpyDeviceToHost);
    cudaThreadSynchronize();

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Timing: %3.1f ms\n", time);

    cudaFree(srcArr_d);
    cudaFree(auxD);
    cudaFree(c_glob_d);
}

My Makefile:

# OS Name (Linux or Darwin)
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])

# Flags to detect 32-bit or 64-bit OS platform
OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/")
OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/")

# These flags will override any settings
ifeq ($(i386),1)
OS_SIZE = 32
OS_ARCH = i686
endif

ifeq ($(x86_64),1)
OS_SIZE = 64
OS_ARCH = x86_64
endif

# Flags to detect either a Linux system (linux) or Mac OSX (darwin)
DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))

# Location of the CUDA Toolkit binaries and libraries
CUDA_PATH ?= /usr/local/cuda-5.0
CUDA_INC_PATH ?= $(CUDA_PATH)/include
CUDA_BIN_PATH ?= $(CUDA_PATH)/bin
ifneq ($(DARWIN),)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
else
ifeq ($(OS_SIZE),32)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
else
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib64
endif
endif

# Common binaries
NVCC ?= $(CUDA_BIN_PATH)/nvcc
GCC ?= g++

# Extra user flags
EXTRA_NVCCFLAGS ?=
EXTRA_LDFLAGS ?=
EXTRA_CCFLAGS ?=

# CUDA code generation flags
# GENCODE_SM10 := -gencode arch=compute_10,code=sm_10
# GENCODE_SM20 := -gencode arch=compute_20,code=sm_20
# GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
GENCODE_SM10 := -gencode arch=compute_10,code=sm_10
GENCODE_SM20 := -gencode arch=compute_20,code=sm_20
GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
#GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM10)

GENCODE_FLAGS := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30)

# OS-specific build flags
ifneq ($(DARWIN),)
LDFLAGS := -Xlinker -rpath $(CUDA_LIB_PATH) -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -arch $(OS_ARCH)
else
ifeq ($(OS_SIZE),32)
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m32
else
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m64
endif
endif

# OS-architecture specific flags
ifeq ($(OS_SIZE),32)
NVCCFLAGS := -m32
else
NVCCFLAGS := -m64
endif

# OpenGL specific libraries
ifneq ($(DARWIN),)
# Mac OSX specific libraries and paths to include
LIBPATH_OPENGL := -L../../common/lib/darwin -L/System/Library/Frameworks/OpenGL.framework/Libraries -framework GLUT -lGL -lGLU ../../common/lib/darwin/libGLEW.a
else
# Linux specific libraries and paths to include
LIBPATH_OPENGL := -L../../common/lib/linux/$(OS_ARCH) -L/usr/X11R6/lib -lGL -lGLU -lX11 -lXi -lXmu -lglut -lGLEW -lrt
endif

# Debug build flags
ifeq ($(dbg),1)
CCFLAGS += -g
NVCCFLAGS += -g -G
TARGET := debug
else
TARGET := release

endif


# Common includes and paths for CUDA
INCLUDES := -I$(CUDA_INC_PATH) -I. -I.. -I../../common/inc
LDFLAGS += $(LIBPATH_OPENGL)

# Target rules
all: build

build: stackOverflow

stackOverflow.o: stackOverflow.cu
	$(NVCC) $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<

stackOverflow: stackOverflow.o
	$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS)
	mkdir -p ./bin/$(OSLOWER)/$(TARGET)
	cp $@ ./bin/$(OSLOWER)/$(TARGET)

run: build
	./stackOverflow

clean:
	rm -f stackOverflow.o stackOverflow *.pgm
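
Side note: none of the gencode pairs above target the C1060's native compute capability 1.3, so the program runs through the sm_10 binary (cubins are forward-compatible within a major compute version). Assuming the CUDA 5.0 toolchain, which still supports sm_13, a minimal hypothetical addition after the GENCODE_FLAGS line would be:

# Hypothetical addition for the Tesla C1060: build native sm_13 code
GENCODE_SM13 := -gencode arch=compute_13,code=sm_13
GENCODE_FLAGS += $(GENCODE_SM13)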

Tesla C1060, CUDA 5.0 on Ubuntu 12.04.

Best Answer

The Tesla C1060 GPU has compute capability 1.3, which means each thread has 128 32-bit registers. That is obviously not enough to hold all of your local variables (two float arrays of 256 elements each, plus several more variables). Since the accesses to local memory in the line

c_loc[ic] += 1.0f;

are, in case (1), spread over the whole 0...255 range, you are probably observing register spilling, which means your data gets placed in local memory. Local memory actually resides in global memory and therefore has the same throughput. The accesses can be cached, but given the randomness of your algorithm, I bet the caching is not efficient. (EDIT: for compute capability 1.3 it is not even cached; it is simply non-coalesced memory access.) A good introduction to local memory in CUDA and register spilling can be found here, together with some guidance on how to detect and solve register-spilling problems.
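
As a concrete way to test this hypothesis (my suggestion, not part of the original answer), you can ask ptxas for per-kernel resource usage at compile time; a non-zero "lmem" figure confirms that the per-thread arrays spilled to off-chip local memory:

# Hypothetical check: -Xptxas -v prints per-kernel register and
# local-memory ("lmem") usage for each target architecture.
nvcc -Xptxas -v -gencode arch=compute_13,code=sm_13 -c stackOverflow.cu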

Consider reducing the amount of local data used per thread, or using shared memory, which sits on chip, to get a speed-up.
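
A minimal sketch of that idea follows. It is not numerically equivalent to the posted kernel (whose inner loop re-reads the same pixel 255 times); the height parameter h and the bounds check are my additions (the launch configuration rounds the grid up by one block per dimension). It only illustrates keeping one 256-bin histogram per block in on-chip shared memory:

// Sketch only: one shared 256-bin histogram per block instead of two
// 256-element arrays per thread. Assumes 16x16 blocks (256 threads) as in
// the question, and compute capability >= 1.2 for shared-memory int
// atomics (the C1060 is 1.3).
__global__ void gpuKernelShared(float *sa, float *aux,
                                size_t memPitchAux, int w, int h) // h: hypothetical extra argument
{
    __shared__ int hist[256];                        // on chip, shared by the whole block

    int t = threadIdx.y * blockDim.x + threadIdx.x;  // 0..255 in a 16x16 block
    hist[t] = 0;                                     // each thread clears one bin
    __syncthreads();

    int tidx = blockIdx.x * blockDim.x + threadIdx.x;
    int tidy = blockIdx.y * blockDim.y + threadIdx.y;
    bool inside = (tidx < w) && (tidy < h);          // guard the rounded-up grid

    if (inside)
        atomicAdd(&hist[(int)fabsf(sa[tidy*w + tidx])], 1); // bin index is 0..255 here

    __syncthreads();                                 // reached by all threads of the block

    if (inside)
    {
        float g0 = 0.0f;
        for (int ic = 0; ic < 256; ic++)             // reduce this block's bins
            g0 += (float)hist[ic];
        aux[tidy * memPitchAux/4 + tidx] = g0;
    }
}

Because the bins now live in shared memory, the scattered per-bin updates stay on chip no matter how the input values are distributed, so the timing gap between case (1) and case (2) should largely disappear.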

The original question, "c++ - Why does the timing change drastically with the number of zeros in the input data?", can be found on Stack Overflow: https://stackoverflow.com/questions/21280276/
