gpt4 book ai didi

c++ - 优化一维卷积

转载 作者:行者123 更新时间:2023-11-30 00:59:42 26 4
gpt4 key购买 nike

有没有办法加速这个一维卷积?我试图使 dy 缓存高效但是用 g++ 和 -O3 编译性能更差。

我正在与 [-1. , 0., 1] 在两个方向上。不是作业。

#include<iostream>
#include<cstdlib>
#include<sys/time.h>

void print_matrix( int height, int width, float *matrix){
for (int j=0; j < height; j++){
for (int i=0; i < width; i++){
std::cout << matrix[j * width + i] << ",";
}
std::cout << std::endl;
}
}

void fill_matrix( int height, int width, float *matrix){
for (int j=0; j < height; j++){
for (int i=0; i < width; i++){
matrix[j * width + i] = ((float)rand() / (float)RAND_MAX) ;
}
}
}

#define RESTRICT __restrict__

void dx_matrix( int height, int width, float * RESTRICT in_matrix, float * RESTRICT out_matrix, float *min, float *max){
//init min,max
*min = *max = -1.F * in_matrix[0] + in_matrix[1];

for (int j=0; j < height; j++){
float* row = in_matrix + j * width;
for (int i=1; i < width-1; i++){
float res = -1.F * row[i-1] + row[i+1]; /* -1.F * value + 0.F * value + 1.F * value; */
if (res > *max ) *max = res;
if (res < *min ) *min = res;
out_matrix[j * width + i] = res;
}
}
}

void dy_matrix( int height, int width, float * RESTRICT in_matrix, float * RESTRICT out_matrix, float *min, float *max){
//init min,max
*min = *max = -1.F * in_matrix[0] + in_matrix[ width + 1];

for (int j=1; j < height-1; j++){
for (int i=0; i < width; i++){
float res = -1.F * in_matrix[ (j-1) * width + i] + in_matrix[ (j+1) * width + i] ;
if (res > *max ) *max = res;
if (res < *min ) *min = res;
out_matrix[j * width + i] = res;
}
}
}

double now (void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}


int main(int argc, char **argv){

int width, height;
float *in_matrix;
float *out_matrix;

if(argc < 3){
std::cout << argv[0] << "usage: width height " << std::endl;
return -1;
}

srand(123);

width = atoi(argv[1]);
height = atoi(argv[2]);

std::cout << "Width:"<< width << " Height:" << height << std::endl;

if (width < 3){
std::cout << "Width too short " << std::endl;
return -1;
}
if (height < 3){
std::cout << "Height too short " << std::endl;
return -1;
}

in_matrix = (float *) malloc( height * width * sizeof(float));
out_matrix = (float *) malloc( height * width * sizeof(float));

fill_matrix(height, width, in_matrix);
//print_matrix(height, width, in_matrix);

float min, max;

double a = now();
dx_matrix(height, width, in_matrix, out_matrix, &min, &max);
std::cout << "dx min:" << min << " max:" << max << std::endl;

dy_matrix(height, width, in_matrix, out_matrix, &min, &max);
double b = now();
std::cout << "dy min:" << min << " max:" << max << std::endl;
std::cout << "time: " << b-a << " sec" << std::endl;


return 0;
}

最佳答案

使用局部变量计算最小值和最大值。每次你这样做:

if (res > *max ) *max = res;
if (res < *min ) *min = res;

max 和 min 必须写入内存。在指针上添加 restrict 会有所帮助(表明写入是独立的),但更好的方法是类似

//Setup
float tempMin = ...
float tempMax = ...
...
// Inner loop
tempMin = (res < tempMin) ? res : tempMin;
tempMax = (res > tempMax) ? res : tempMax;
...
// End
*min = tempMin;
*max = tempMax;

关于c++ - 优化一维卷积,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/3886836/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com