gpt4 book ai didi

c - AVX2 1GB长阵列

转载 作者:行者123 更新时间:2023-11-30 16:39:17 25 4
gpt4 key购买 nike

我有一个 1 GB 大小的数组,其中的 float 数据来自一个 .bin 文件。读入之后,如何用 AVX2 指令对所有元素求和并打印结果?

我根据 Jake 'Alquimista' LEE 的答案修改了我的代码,但得到的结果比实际的总和小得多。另外还有一个问题:如何给从 .bin 文件读取的每个数字都加上一个常量?

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>

/*
 * Sum `len` floats starting at `pSrc` using AVX.
 *
 * pSrc  pointer to the first float; no particular alignment is required.
 * len   number of floats (NOT bytes) to add up.
 *
 * Returns the total, rounded to float once at the very end.
 *
 * Eight floats are loaded per iteration, widened to double and added to two
 * 4-lane double accumulators.  Accumulating in double matters for large
 * inputs: a float accumulator stops growing once it is about 2^24 times
 * larger than the values being added, which makes the result come out far
 * too small.  The final len % 8 floats are added one by one.
 */
#if defined(__GNUC__) && !defined(__AVX__)
/* Let GCC/Clang generate AVX code for this one function even when the
 * translation unit itself is not built with -mavx/-mavx2. */
__attribute__((target("avx")))
#endif
static inline float sumf(const float *pSrc, uint32_t len)
{
    __m256d acc_lo = _mm256_setzero_pd();
    __m256d acc_hi = _mm256_setzero_pd();
    uint32_t nvec = len >> 3;   /* number of complete 8-float vectors */
    uint32_t tail = len & 7;    /* floats left over after the vectors */
    double lanes[4];
    double total;
    uint32_t i;

    for (i = 0; i < nvec; i++)
    {
        __m256 in = _mm256_loadu_ps(pSrc);

        /* Lower and upper 128-bit halves are widened separately. */
        acc_lo = _mm256_add_pd(acc_lo,
                               _mm256_cvtps_pd(_mm256_castps256_ps128(in)));
        acc_hi = _mm256_add_pd(acc_hi,
                               _mm256_cvtps_pd(_mm256_extractf128_ps(in, 1)));
        pSrc += 8;              /* advance by one whole vector */
    }

    /* Horizontal reduction through memory: no type punning needed. */
    _mm256_storeu_pd(lanes, _mm256_add_pd(acc_lo, acc_hi));
    total = (lanes[0] + lanes[1]) + (lanes[2] + lanes[3]);

    for (i = 0; i < tail; i++)
    {
        total += pSrc[i];
    }

    return (float)total;
}


/*
 * Load example.bin as an array of floats, print the file size and the first
 * (up to) ten values, then print the sum of all values computed by sumf().
 *
 * Returns 0 on success and a non-zero status on any I/O or memory error.
 */
int main(void)
{
    FILE *file = fopen("example.bin", "rb");
    if (file == NULL)
    {
        printf("Error! opening file");
        exit(1);
    }

    /* Determine the file size by seeking to the end and back. */
    long fileLen = -1;
    if (fseek(file, 0, SEEK_END) == 0)
    {
        fileLen = ftell(file);
    }
    if (fileLen < 0 || fseek(file, 0, SEEK_SET) != 0)
    {
        fprintf(stderr, "Could not determine file size!");
        fclose(file);
        return EXIT_FAILURE;
    }

    /* sumf() expects a number of floats, not a number of bytes. */
    size_t count = (size_t)fileLen / sizeof(float);
    if (count > UINT32_MAX)
    {
        fprintf(stderr, "File too large!");
        fclose(file);
        return EXIT_FAILURE;
    }

    /* Ask for at least one element so an empty file is not reported as an
     * allocation failure (malloc(0) may legitimately return NULL). */
    float *buffer2 = malloc((count ? count : 1) * sizeof *buffer2);
    if (!buffer2)
    {
        fprintf(stderr, "Memory error!");
        fclose(file);
        return EXIT_FAILURE;
    }

    if (fread(buffer2, sizeof *buffer2, count, file) != count)
    {
        fprintf(stderr, "Read error!");
        free(buffer2);
        fclose(file);
        return EXIT_FAILURE;
    }
    fclose(file);

    printf("File size : %ld bytes \n", fileLen);
    for (size_t i = 0; i < count && i < 10; i++)
    {
        printf("%f \n", buffer2[i]);
    }

    float sum = sumf(buffer2, (uint32_t)count);
    printf("%f\n", sum);

    free(buffer2);
    return 0;
}

最佳答案

将 1GB 文件读入内存是很大的内存和 I/O 开销。虽然我对AVX2不是很熟悉,我阅读了互联网上的文章,我可以提出以下解决方案,该解决方案经过实际测试并证明是有效的。

我的解决方案是按 512 字节的块(每块 128 个 float)读取文件,然后把每块中的向量两两相加(每块共 16 个向量)并累加,最后得到一个 __m256 向量;将其转换为 float* 后,再把它的 8 个分量相加,即可得到最终结果。

文件大小不是 128 个 float 的整数倍的情况,由最后的 for 循环处理:它对剩余的 float 逐个求和。

代码已注释,但如果您有任何建议为答案添加更多解释,请随意这样做。

#include <immintrin.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

/* Writes a file made of a given number of 128-float blocks (defined below). */
int make_floatf(char *, int);
/* Sums the floats stored in a file using AVX vectors (defined below). */
float avx_sfadd(char*);

/* Scratch buffer that PERROR() fills with the strerror_r() text for errno. */
char error_buf[1024];

/*
 * Print the message for the current errno to stdout, close the stream `fp`
 * if it was opened, and return -1 from the calling function.
 *
 * The macro expects a `FILE *fp` variable in the caller's scope and writes
 * the message text into the global error_buf.  `fp` is checked against NULL
 * because the macro is also used right after a failed fopen(), and calling
 * fclose() on a null pointer is undefined behaviour.
 */
#define PERROR() \
    do { \
        strerror_r(errno, error_buf, sizeof error_buf); \
        printf("Error: %s\n", error_buf); \
        if (fp) \
            fclose(fp); \
        return -1; \
    } while(0)

/*
 * Create (or truncate) the binary file `filename` and fill it with
 * `nblocks` blocks of 128 floats, every float being 1.0.
 *
 * filename  path of the file to write.
 * nblocks   number of 512-byte blocks to write; a value <= 0 leaves the
 *           file empty.
 *
 * Returns 0 on success.  On failure prints "Error: <reason>" to stdout and
 * returns -1; the stream is closed on every path.
 */
int make_floatf(char *filename, int nblocks)
{
    enum { BLOCK_FLOATS = 128 };
    float block[BLOCK_FLOATS];  /* 512 bytes: small enough for the stack */

    FILE *fp = fopen(filename, "wb+");
    if (!fp) {
        /* Nothing to close here: fclose(NULL) would be undefined. */
        printf("Error: %s\n", strerror(errno));
        return -1;
    }

    /* The block content never changes, so fill it once up front. */
    for (int i = 0; i < BLOCK_FLOATS; i++)
        block[i] = 1.0f;

    for (int j = 0; j < nblocks; j++) {
        size_t written = fwrite(block, sizeof block[0], BLOCK_FLOATS, fp);
        if (written != (size_t)BLOCK_FLOATS) {
            printf("Error: %s\n", strerror(errno));
            fclose(fp);
            return -1;
        }
    }

    /* fclose() flushes buffered data, so a write error can surface here. */
    if (fclose(fp) != 0) {
        printf("Error: %s\n", strerror(errno));
        return -1;
    }

    return 0;
}

/*
 * Sum all floats stored in the binary file `filename`.
 *
 * The file is read in chunks of 512 bytes (128 floats).  Each chunk is
 * processed as 16 AVX vectors of 8 floats; every vector is widened to double
 * and added to two 4-lane double accumulators.  Floats left over when the
 * file size is not a multiple of 512 bytes are read separately and added one
 * by one.  Accumulating in double keeps the result correct for large files:
 * a float accumulator stops growing once it is about 2^24 times larger than
 * the values being added.
 *
 * Before summing, the file size, block count and remainder count are printed
 * to stdout.
 *
 * Returns the sum rounded to float.  On failure prints "Error: <reason>" to
 * stdout and returns -1 (note that -1 is therefore indistinguishable from a
 * genuine sum of -1).
 */
#if defined(__GNUC__) && !defined(__AVX__)
/* Let GCC/Clang generate AVX code for this one function even when the
 * translation unit itself is not built with -mavx/-mavx2. */
__attribute__((target("avx")))
#endif
float avx_sfadd(char *filename)
{
    enum { BLOCK_FLOATS = 128, VEC_FLOATS = 8 };

    float block[BLOCK_FLOATS];  /* one 512-byte chunk of the file */
    double lanes[4];
    double total = 0.0;
    struct stat stat_buf;
    size_t fsize, nblocks, rem_floats, i;
    __m256d acc_lo = _mm256_setzero_pd();
    __m256d acc_hi = _mm256_setzero_pd();

    FILE *fp = fopen(filename, "rb");
    if (!fp) {
        /* Nothing to close here: fclose(NULL) would be undefined. */
        printf("Error: %s\n", strerror(errno));
        return -1;
    }

    if (stat(filename, &stat_buf) != 0) {
        printf("Error: %s\n", strerror(errno));
        fclose(fp);
        return -1;
    }

    fsize = (size_t)stat_buf.st_size;
    nblocks = fsize / sizeof block;
    /* Trailing bytes that do not form a whole float are ignored. */
    rem_floats = (fsize % sizeof block) / sizeof block[0];

    printf("File size: %zu\nnblocks:%zu\nnremfloats: %zu\n",
           fsize, nblocks, rem_floats);

    for (i = 0; i < nblocks; i++) {
        if (fread(block, sizeof block[0], BLOCK_FLOATS, fp)
                != (size_t)BLOCK_FLOATS)
            goto read_error;

        for (int j = 0; j < BLOCK_FLOATS; j += VEC_FLOATS) {
            __m256 v = _mm256_loadu_ps(block + j);

            /* Lower and upper 128-bit halves are widened separately. */
            acc_lo = _mm256_add_pd(acc_lo,
                                   _mm256_cvtps_pd(_mm256_castps256_ps128(v)));
            acc_hi = _mm256_add_pd(acc_hi,
                                   _mm256_cvtps_pd(_mm256_extractf128_ps(v, 1)));
        }
    }

    /* Last, incomplete block: counted in floats on both sides of the check. */
    if (rem_floats > 0) {
        if (fread(block, sizeof block[0], rem_floats, fp) != rem_floats)
            goto read_error;

        for (i = 0; i < rem_floats; i++)
            total += block[i];
    }

    /* Fold the vector accumulators into the scalar total. */
    _mm256_storeu_pd(lanes, _mm256_add_pd(acc_lo, acc_hi));
    total += (lanes[0] + lanes[1]) + (lanes[2] + lanes[3]);

    fclose(fp);
    return (float)total;

read_error:
    /* A short read without a stream error means the file ended early. */
    printf("Error: %s\n",
           ferror(fp) ? strerror(errno) : "unexpected end of file");
    fclose(fp);
    return -1;
}


/*
 * Command-line entry point.
 *
 *   ./main filename nblocks   create `filename` holding nblocks blocks of floats
 *   ./main filename           print the sum of the floats stored in `filename`
 *
 * With no arguments the usage line is printed.  Returns 0, except when the
 * file could not be created.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        puts("./main filename [nblocks]");
        return 0;
    }

    if (argc == 3) {
        /* ./main filename number_of_blocks_to_create (eg. ./main floats.bin 1024) */
        if (make_floatf(argv[1], atoi(argv[2])) != 0)
            return EXIT_FAILURE;    /* make_floatf already printed the reason */

        puts("File has been created sucessfully\n");
    } else {
        /* ./main filename (eg. ./main floats.bin) to calculate the sum */
        printf("avx_sum = %f\n", avx_sfadd(argv[1]));
    }

    return 0;
}

关于c - AVX2 1GB长阵列,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/47105601/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com