gpt4 book ai didi

c - 结构中的指针是否会减慢我的代码速度?

转载 作者:行者123 更新时间:2023-12-04 11:56:33 24 4
gpt4 key购买 nike

我正在寻找一些帮助或提示来加速我的代码。
我已经实现了一个例程,用于从一组球谐系数 C_{n,m} 和 S_{n,m} 计算空间中某个点 (r,phi,lambda) 的引力势。等式显示在下面的链接中:
Equation that is being implemented
并且包括与纬度 (phi) 相关的关联勒让德多项式 P_{n,m} 的递归计算,从前两个值 P{0,0} 和 P_{1,1} 开始。
起初,我将其作为 MATLAB C-MEX 代码实现,只有我代码的核心部分是用 C 语言编写的。我想做一个纯C程序,但发现代码运行速度慢了3-5倍,这让我想知道为什么。这可能是我定义结构并使用指向中央代码中的指针的方式吗?
看起来它是需要额外时间的核心计算部分,但该部分没有改变,尽管在我将指针直接传递给变量之前,现在我在结构内部使用指针。
任何帮助表示赞赏!
在下文中,我将尝试解释我的代码并展示一些摘录:
在程序开始时,我定义了三个结构。一个保存球谐系数,C_{n,m} 和 S_{n,m}, (ggm_struct),一个保存计算坐标 (comp_struct),一个保存结果 (func_struct):

// Define constant variables
const double deg2rad = M_PI/180.0; // degrees to radians conversion factor
const double sfac = 1.0000E-280; // scaling factor
const double sqrt2 = 1.414213562373095; // sqrt(2)
const double sqrt3 = 1.732050807568877; // sqrt(3)

// Define structure to hold geopotential model
struct ggm_struct {
char product_type[100], modelname[100], errors[100], norm[100], tide_system[100];
double GM, R, *C, *S;
int max_degree, ncoef;
};
// Define structure to hold computation coordinates
struct comp_struct {
double *lat, *lon, *h;
double *r, *phi;
int nlat, nlon;
};
/* Define structure to hold results */
struct func_struct {
double *rval;
int npoints;
};
然后我有一个(子)函数,它首先分配空间,然后从 ascii 文件加载系数作为
int read_gfc(char mfile[100], int *nmax, int *mmax, struct ggm_struct *ggm)
{

// Set file identifier
FILE *fid;

// Declare variables
char str[200], var[100];
int n, m, nid, l00 = 0, l10 = 0, l11 = 0;
double c, s;

// Determine number of coefficients
ggm->ncoef = (*nmax+2)*(*nmax+1)/2;

// Allocate memory for coefficients
ggm->C = (double*) malloc(ggm->ncoef*sizeof(double));
if (ggm->C == NULL){
printf("Error: Memory for C not allocated!");
return -ENOMEM;
}
ggm->S = (double*) malloc(ggm->ncoef*sizeof(double));
if (ggm->S == NULL){
printf("Error: Memory for S not allocated!");
return -ENOMEM;
}

// Open file
fid = fopen(mfile,"r");

// Check that file was opened correctly
if (fid == NULL){
printf("Error: opening file %s!",mfile);
return -ENOMEM;
}

// Read file header
while (fgets(str,200,fid) != NULL && strncmp(str,"end_of_head",11) != 0){

// Extract model parameters
if (strncmp(str,"product_type",12) == 0){ sscanf(str,"%s %s",var,ggm->product_type); }
if (strncmp(str,"modelname",9) == 0){ sscanf(str,"%s %s",var,ggm->modelname); }
if (strncmp(str,"earth_gravity_constant",22) == 0){ sscanf(str,"%s %lf",var,&ggm->GM); }
if (strncmp(str,"radius",6) == 0){ sscanf(str,"%s %lf",var,&ggm->R); }
if (strncmp(str,"max_degree",10) == 0){ sscanf(str,"%s %d",var,&ggm->max_degree); }
if (strncmp(str,"errors",6) == 0){ sscanf(str,"%s %s",var,ggm->errors); }
if (strncmp(str,"norm",4) == 0){ sscanf(str,"%s %s",var,ggm->norm); }
if (strncmp(str,"tide_system",11) == 0){ sscanf(str,"%s %s",var,ggm->tide_system); }

}

// Read coefficients
while (fgets(str,200,fid) != NULL){

// Extract parameters
sscanf(str,"%s %d %d %lf %lf",var,&n,&m,&c,&s);

// Store parameters
if (n <= *nmax && m <= *mmax) {

// Determine index
nid = (n+1)*n/2 + m;

// Store values
*(ggm->C+nid) = c;
*(ggm->S+nid) = s;

}

}

// Close fil
fclose(fid);

// Return from function
return 0;

}
之后,计算网格由七个组件的数组定义。例如,数组 [-90 90 -180 180 1 1 0] 定义了从 -90 到 90 度纬度以 1 度增量和从 -180 到 180 度经度以 1 度增量的网格。高度为零。从这个数组,在(子)函数中生成一个计算网格:
int make_grid(double *grid, struct comp_struct *inp)
{

// Declare variables
int n;

/* Echo routine */
printf("Creating grid of coordinates\n");
printf(" [lat1,lat2,dlat] = [%f,%f,%f]\n", *grid, *(grid+1), *(grid+4) );
printf(" [lon1,lon2,dlon] = [%f,%f,%f]\n", *(grid+2), *(grid+3), *(grid+5) );
printf(" h = %f\n", *(grid+6));

/* Latitude ------------------------------------------------------------- */

// Determine number of increments
inp->nlat = ceil( ( *(grid+1) - *grid + *(grid+4) ) / *(grid+4) );

// Allocate memory
inp->lat = (double*) malloc(inp->nlat*sizeof(double));
if (inp->lat== NULL){
printf("Error: Memory for LATITUDE (inp.lat) points not allocated!");
return -ENOMEM;
}

// Fill in values
*(inp->lat) = *(grid+1);
for (n = 1; n < inp->nlat-1; n++) {
*(inp->lat+n) = *(inp->lat+n-1) - *(grid+4);
}
*(inp->lat+inp->nlat-1) = *grid;

/* Longitude ------------------------------------------------------------ */

// Determine number of increments
inp->nlon = ceil( ( *(grid+3) - *(grid+2) + *(grid+5) ) / *(grid+5) );

// Allocate memory
inp->lon = (double*) malloc(inp->nlon*sizeof(double));
if (inp->lon== NULL){
printf("Error: Memory for LONGITUDE (inp.lon) points not allocated!");
return -ENOMEM;
}

// Fill in values
*(inp->lon) = *(grid+2);
for (n = 1; n < inp->nlon-1; n++) {
*(inp->lon+n) = *(inp->lon+n-1) + *(grid+5);
}
*(inp->lon+inp->nlon-1) = *(grid+3);

/* Height --------------------------------------------------------------- */

// Allocate memory
inp->h = (double*) malloc(inp->nlat*sizeof(double));
if (inp->h== NULL){
printf("Error: Memory for HEIGHT (inp.h) points not allocated!");
return -ENOMEM;
}

// Fill in values
for (n = 0; n < inp->nlat; n++) {
*(inp->h+n) = *(inp->h+n-1) + *(grid+6);
}

// Return from function
return 0;

}
然后将这些地理坐标转换为球面坐标以使用另一个(子)例程进行计算
int geo2sph(struct comp_struct *inp, int *lgrid)
{

// Declare variables
double a = 6378137.0, e2 = 6.69437999014E-3; /* WGS84 parameters */
double x, y, z, sinlat, coslat, sinlon, coslon, R_E;
int i, j, nid;

/* Allocate space ------------------------------------------------------- */

// radius
inp->r = (double*) malloc(inp->nlat*sizeof(double));
if (inp->r== NULL){
printf("Error: Memory for SPHERICAL DISTANCE (inp.r) points not allocated!");
return -ENOMEM;
}

// phi
inp->phi = (double*) malloc(inp->nlat*sizeof(double));
if (inp->phi== NULL){
printf("Error: Memory for SPHERICAL LATITUDE (inp.phi) points not allocated!");
return -ENOMEM;
}

/* Loop over latitude =================================================== */
for (i = 0; i < inp->nlat; i++) {

// Compute sine and cosine of latitude
sinlat = sin(*(inp->lat+i));
coslat = cos(*(inp->lat+i));

// Compute radius of curvature
R_E = a / sqrt( 1.0 - e2*sinlat*sinlat );

// Compute sine and cosine of longitude
sinlon = sin(*(inp->lon));
coslon = cos(*(inp->lon));

// Compute rectangular coordinates
x = ( R_E + *(inp->h+i) ) * coslat * coslon;
y = ( R_E + *(inp->h+i) ) * coslat * sinlon;
z = ( R_E*(1.0-e2) + *(inp->h+i) ) * sinlat;

// Compute sqrt( x^2 + y^2 )
R_E = sqrt( x*x + y*y );

// Derive radial distance
*(inp->r+i) = sqrt( R_E * R_E + z*z );

// Derive spherical latitude
if (R_E < 1) {
if (z > 0) { *(inp->phi+i) = M_PI/2.0; }
else { *(inp->phi+i) = -M_PI/2.0; }
}
else {
*(inp->phi+i) = asin( z / *(inp->r+i) );
}

}

// Return from function
return 0;

}
最后,引力势是在自己的(子)函数内计算的。这是代码的核心部分,与 MATLAB C-MEX 函数或多或少相同。唯一的区别似乎是之前(在 MATLAB MEX 中)一切都被定义为(简单的)双变量 - 现在变量位于包含指针的结构内。
int gravpot(struct comp_struct *inp, struct ggm_struct *ggm, int *nmax,
int *mmax, int *lgrid, struct func_struct *out)
{

// Declare variables
double GMr, ar, t, u, u2, arn, gnm, hnm, P, Pp1, Pp2, msum;
double Pmm[*nmax+1], CPnm[*mmax+1], SPnm[*mmax+1];
int i, j, n, m, id;

// Allocate memory
out->rval = (double*) malloc(inp->nlat*inp->nlon*sizeof(double));
if (out->rval== NULL){
printf("Error: Memory for OUTPUT (out.rval) not allocated!");
return -ENOMEM;
}

/* Compute sectorial values of associated Legendre polynomials ========== */

// Define seed values ( divided by u^m )
Pmm[0] = sfac;
Pmm[1] = sqrt3 * sfac;

// Compute sectorial values, [1] eq. 13 and 28 ( divided by u^m )
for (m = 2; m <= *nmax; m++) {
Pmm[m] = sqrt( (2.0*m+1.0) / (2.0*m) ) * Pmm[m-1];
}

/* ====================================================================== */

/* Loop over latitude =================================================== */
for (i = 0; i < inp->nlat; i++) {

// Compute ratios to be used in summation
GMr = ggm->GM / *(inp->r+i);
ar = ggm->R / *(inp->r+i);

/* ---------------------------------------------------------------------
* Compute product of Legendre values and spherical harmonic coefficients.
* Products of similar degree are summed together, resulting in mmax
* values. The degree terms are latitude dependent, such that these mmax
* sums are valid for every point with the same latitude.
* The values of the associated Legendre polynomials, Pnm, are scaled by
* sfac = 10^(-280) and divided by u^m in order to prevent underflow and
* overflow of the coefficients.
* ------------------------------------------------------------------ */

// Form coefficients for Legendre recursive algorithm
t = sin(*(inp->phi+i));
u = cos(*(inp->phi+i));
u2 = u * u;
arn = ar;

/* Degree n = 0 terms ----------------------------------------------- */

// Compute order m = 0 term (S term is zero)
CPnm[0] = Pmm[0] * *(ggm->C);

/* Degree n = 1 terms ----------------------------------------------- */

// Compute (1,1) terms, [1] eq. 3
CPnm[1] = ar * Pmm[1] * *(ggm->C+2);
SPnm[1] = ar * Pmm[1] * *(ggm->S+2);

// Compute (1,0) Legendre value, [1] eq. 18 and 27
P = t * Pmm[1];

// Add (1,0) terms to sum (S term is zero), [1] eq. 3
CPnm[0] = CPnm[0] + ar * P * *(ggm->C+1);

/* Degree n = [2,n_max] --------------------------------------------- */
for (n = 2; n <= *nmax; n++) {

// Compute power term
arn = arn * ar;

/* Compute sectorial (m=n) terms ++++++++++++++++++++++++++++++++ */

// Extract associated Legendre value
Pp1 = Pmm[n];

// Compute product terms, [1] eq. 3
if (n <= *mmax) {
id = (n+1)*n/2 + n;
CPnm[n] = arn * Pp1 * *(ggm->C+id);
SPnm[n] = arn * Pp1 * *(ggm->S+id);
}

/* Compute first non-sectorial terms (m=n-1) ++++++++++++++++++++ */

// Compute associated Legendre value, [1] eq. 18 and 27
gnm = sqrt( 2.0*n );
P = gnm * t * Pp1;

// Add terms to summation, eq. 3 in [1]
if (n-1 <= *mmax) {
id = (n+1)*n/2 + n - 1;
CPnm[n-1] = CPnm[n-1] + arn * P * *(ggm->C+id);
SPnm[n-1] = SPnm[n-1] + arn * P * *(ggm->S+id);
}

/* Compute terms of order m = [n-2,1] +++++++++++++++++++++++++++ */
for (m = n-2; m > 0; m--) {

// Set previous values
Pp2 = Pp1;
Pp1 = P;

// Compute associated Legendre value, [1] eq. 18, 19 and 27
gnm = 2.0*(m+1.0) / sqrt( (n-m)*(n+m+1.0) );
hnm = sqrt( (n+m+2.0)*(n-m-1.0)/(n-m)/(n+m+1.0) );
P = gnm * t * Pp1 - hnm * u2 * Pp2;

// Add product terms to summation, eq. 3 in [1]
if (m <= *mmax) {
id = (n+1)*n/2 + m;
CPnm[m] = CPnm[m] + arn * P * *(ggm->C+id);
SPnm[m] = SPnm[m] + arn * P * *(ggm->S+id);
}

}

/* Compute zonal terms (m=0) ++++++++++++++++++++++++++++++++++++ */

// Compute associated Legendre value, [1] eq. 18, 19 and 27
gnm = 2.0 / sqrt( n*(n+1.0) );
hnm = sqrt( (n+2.0)*(n-1.0)/n/(n+1.0) );
P = ( gnm * t * P - hnm * u2 * Pp1 ) / sqrt2;

// Add terms to summation (S term is zero), [1] eq. 3
id = (n+1)*n/2;
CPnm[0] = CPnm[0] + arn * P * *(ggm->C+id);

} /* ---------------------------------------------------------------- */

/* Loop over longitude ============================================== */
for (j = 0; j < inp->nlon; j++) {

/* -----------------------------------------------------------------
* All associated Legendre polynomials (latitude dependent) are now
* computed and multiplied by the corresponding spherical harmonic
* coefficient. These products are scaled by u^m, meaning that
* Horner's scheme is used in the following summation.
* -------------------------------------------------------------- */

// Initialise order-dependent sum
msum = 0.0;

// Derive longitude id
id = j + i * *lgrid;

// Loop over order (m > 0)
for (m = *mmax; m > 0; m--) {

// Add to order-dependent sum using Horner's scheme, [1] eq. 2, 3 and 31
msum = ( msum + cos( m * *(inp->lon+id) ) * CPnm[m]
+ sin( m * *(inp->lon+id) ) * SPnm[m] ) * u;

}

// Add zero order term to sum
msum = msum + CPnm[0];

// Rescale value into gravitational potential, [1] eq. 1
*(out->rval+i+j*inp->nlat) = GMr * msum / sfac;

} /* ================================================================ */

} /* ==================================================================== */

// Return from function
return 0;

}
再次,非常感谢任何帮助,如果相关,可以提供其他信息,但这已经成为一个很长的帖子。我很难接受纯 c 代码的运行速度比 MATLAB C-MEX 代码慢。

最佳答案

简而言之,是的,指针可以防止某些编译器优化,从而导致潜在的减速。至少,对于 ICC 和 GCC 来说显然是这种情况。程序的性能受到 的强烈影响指针别名和矢量化 .
实际上,编译器无法轻易知道所提供的指针是否彼此别名或与所提供数据结构的某些字段的地址别名。结果,编译器趋于保守,并假设指向的值可以随时更改并经常重新加载它们。这可以防止优化,例如在 gravpot 中拆分某些循环。 (使用 GCC——见 this modified code 的第 119 行)。此外,间接和别名往往会阻止热循环的矢量化(即使用目标处理器提供的 SIMD 指令)。矢量化可以强烈影响代码的性能。
举个例子,heregeo2sph的初始代码和 here是一个稍微修改的实现。在第一种情况下,ICC 生成缓慢的标量实现,而在第二种情况下,ICC 生成明显更快的矢量化实现。两种实现的唯一区别是使用了 restrict 关键词。这个关键字告诉编译器,在指针的生命周期内,只有指针本身或直接派生自它的值(如指针+1)将用于访问它指向的对象(更多信息请参见 here信息)。注意使用restrict关键字是危险的,使用它时应该非常小心,因为如果限制提示错误(很难调试),编译器可能会生成错误的代码。或者,您可以使用 帮助编译器生成矢量化代码。 OpenMP SIMD 指令 #pragma omp simd (结果见 here)。请注意,您应该确保目标代码可以安全地向量化(例如, 的迭代必须是独立的)。

关于c - 结构中的指针是否会减慢我的代码速度?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/68648181/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com