I am trying to write a hybrid MPI/OpenACC code in which the program has to perform 8 different jobs (in this case, 8 different sweeps). The 8 jobs are divided among [1-8] processes/nodes with MPI, and the computation required inside each job is parallelized with OpenACC.
After each process finishes its computation, I reduce the solutions and pass the minimum to process 0, which holds the final solution.
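In outline, the pattern I intend is the following (just an illustrative sketch, not the real code; do_sweep, local_dist and global_dist are placeholders for the OpenACC-parallelized work and the distance arrays):
// Sketch only: each rank takes a strided subset of the 8 sweeps, then rank 0
// receives the elementwise minimum of all distance fields.
for (int sn = my_rank + 1; sn <= 8; sn += npes)
    do_sweep(sn);                                  // OpenACC-parallelized work
MPI_Reduce(local_dist, global_dist, totalNodes,
           MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);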
Below is an MCVE of the complete code (test.c), which produces a .txt output file.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include "mpi.h"
#define min(a,b) (a > b) ? b : a
#define max(a,b) (a < b) ? b : a
#define NPES 8 // max number of PEs allowed
#define DEFAULT_BORDER_LOCATION -1
#define DEFAULT_BORDER_DISTANCE INFINITY
#define DEFAULT_INTERIOR_DISTANCE 90000
typedef struct {
int order;
int firstLevel, lastLevel, level;
int xDim, yDim, zDim;
int xSweepOff, ySweepOff, zSweepOff;
double dx, dy, dz;
} SweepInfo;
typedef struct {
double dx, dy, dz;
int * location;
double * distance;
} Phi;
typedef struct {
int x, y, z;
} Grid3D;
void calc_dist_field( Phi * p, int totalNodes );
void write_to_file(double * dist);
static SweepInfo make_sweepInfo( Phi * p, int my_rank );
static void fast_sweep( Phi * p, SweepInfo * s );
static double solveEikonal(Phi * p, int index, int max_x, int max_y);
static void update_distance(Phi * p, int totalNodes);
static void set_distance_negative_inside(Phi * p, int totalNodes);
static void adjust_boundary( Phi * p );
// public method declarations
Grid3D make_grid3D(int x, int y, int z);
void vti_get_dimensions(FILE *vti, double *d);
void vti_get_data(FILE *vti, int *l, int b_l, double *d, double b_d, Grid3D g);
// private method declarations
static void move_file_pointer(FILE *file_ptr, int lineNumber, int r);
static void get_location(FILE *vti, int *l, int b_l, Grid3D g);
static void get_distance(FILE *vti, double *d, double b_d, Grid3D g);
static int npes; // Number of PEs
static int my_rank; // Rank of the PE
static char * fileName;
static char * outfileName;
static int NX, NY, NZ, totalNodes;
int main(int argc, char *argv[]) {
// MPI startup routine
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &npes);
fileName = argv[1];
outfileName = argv[2];
FILE *f = fopen(fileName, "r");
double dims[6];
vti_get_dimensions(f, dims);
NX = dims[0] + 3;
NY = dims[1] + 3;
NZ = dims[2] + 3;
totalNodes = NX * NY * NZ;
Phi *p = (Phi *) malloc(sizeof(Phi));
p->location = (int *) malloc(sizeof(int) * totalNodes);
p->distance = (double *) malloc(sizeof(double) * totalNodes);
p->dx = dims[3]; p->dy = dims[4]; p->dz = dims[5];
vti_get_data( f, p->location, DEFAULT_BORDER_LOCATION,
p->distance, DEFAULT_BORDER_DISTANCE,
make_grid3D(NX, NY, NZ));
update_distance(p, totalNodes);
calc_dist_field(p, totalNodes);
MPI_Finalize();
return 0;
}
void calc_dist_field( Phi * p, int totalNodes ) {
int sweepNumber = my_rank + 1;
double * tmp_dist;
MPI_Barrier(MPI_COMM_WORLD);
if(my_rank == 0){
tmp_dist = (double *) malloc( totalNodes * sizeof(double) );
}
// sn represents the sweep number
for( int sn = sweepNumber; sn <= NPES; sn += npes) {
SweepInfo s = make_sweepInfo(p, sn);
printf("PE: [%d] - performing sweep number ..... [%d/%d]\n", my_rank, sn, NPES);
fast_sweep(p, &s);
printf("PE: [%d] - completed sweep number ...... [%d/%d]\n", my_rank, sn, NPES);
}
MPI_Barrier(MPI_COMM_WORLD);
#pragma acc update host(p->distance[0:totalNodes])
MPI_Reduce(p->distance, tmp_dist, totalNodes, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
if( my_rank == 0 ) {
free( p->distance );
p->distance = tmp_dist;
set_distance_negative_inside(p, totalNodes);
adjust_boundary(p);
write_to_file(p->distance);
printf("%s file created\n", outfileName);
}
}
static void update_distance(Phi * p, int totalNodes) {
int *l = &p->location[0];
double *d = &p->distance[0];
for(int i = 0; i < totalNodes; i++) {
if(*l != DEFAULT_BORDER_LOCATION && *d != DEFAULT_BORDER_DISTANCE ) {
*d = (*l == 1 && *d == INFINITY) ? -1 : (*d > 0.0 || *d < 0.0) ? *d : DEFAULT_INTERIOR_DISTANCE;
}
l++; d++;
}
}
void write_to_file(double * dist) {
int x = NX;
int y = NY;
int z = NZ;
char fname[255];
sprintf(fname, "%s.txt", outfileName);
FILE *fp = fopen(fname, "w");
int i,j,k;
double *t = &dist[0];
for(i = 0; i < z; i++){
for(j = 0; j < y; j++){
for(k = 0; k < x; k++) {
fprintf(fp, "%f ", *(t++));
}
fprintf(fp, "\n");
}
fprintf(fp, "\n");
}
}
static SweepInfo make_sweepInfo( Phi * p, int my_rank ) {
SweepInfo s;
s.order = my_rank;
s.firstLevel = 3;
s.lastLevel = (NX + NY + NZ) - 6;
s.xDim = NX-2; s.dx = p->dx;
s.yDim = NY-2; s.dy = p->dy;
s.zDim = NZ-2; s.dz = p->dz;
s.xSweepOff = (s.order == 4 || s.order == 8 ) ? s.xDim + 1 : 0;
s.ySweepOff = (s.order == 2 || s.order == 6 ) ? s.yDim + 1 : 0;
s.zSweepOff = (s.order == 3 || s.order == 7 ) ? s.zDim + 1 : 0;
return s;
}
static void fast_sweep( Phi * p, SweepInfo * s ) {
int start, end, incr;
start = ( s->order == 2 || s->order == 5 || s->order == 7 || s->order == 8 ) ? s->lastLevel : s->firstLevel;
if ( start == s->firstLevel ) {
end = s->lastLevel + 1;
incr = 1;
}
else {
end = s->firstLevel - 1;
incr = 0;
}
int max_x = s->xDim + 2;
int max_y = s->yDim + 2;
int max_xy = max_x * max_y;
#pragma acc data create(p[0:1]) copy(p->distance[0:totalNodes])
for(int level = start; level != end; level = (incr) ? level+1 : level-1) {
// s - start, e - end
int xs, xe, ys, ye;
xs = max(1, level-(s->yDim + s->zDim)) , ys = max(1,level-(s->xDim + s->zDim));
xe = min(s->xDim, level-(s->firstLevel-1)), ye = min(s->yDim, level-(s->firstLevel-1));
int x, y, z, i, j, k, index;
#pragma acc parallel
{
#pragma acc loop independent
for(x = xs; x <= xe; x++) {
#pragma acc loop independent
for(y = ys; y <= ye; y++) {
z = level - (x+y);
if(z > 0 && z <= NZ-2) {
i = abs(z-s->zSweepOff);
j = abs(y-s->ySweepOff);
k = abs(x-s->xSweepOff);
index = i * max_xy + j * max_x + k;
p->distance[index] = solveEikonal(p, index, NX, NY);
}
}
} // end of acc parallel
}
}
}
#pragma acc routine seq
static double solveEikonal(Phi * p, int index, int max_x, int max_y) {
int max_xy = max_x * max_y;
double dist_new = 0;
double dist_old = p->distance[index];
double dx = p->dx, dy = p->dy, dz = p->dz;
double minX = min(p->distance[index-1], p->distance[index+1]);
double minY = min(p->distance[abs(index-max_x)], p->distance[abs(index+max_x)]);
double minZ = min(p->distance[abs(index-max_xy)],p->distance[abs(index+max_xy)]);
double m[] = { minX, minY, minZ} ;
double d[] = { dx, dy, dz};
// sort the mins
for(int i = 1; i < 3; i++){
for(int j = 0; j < 3-i; j++) {
if(m[j] > m[j+1]) {
double tmp_m = m[j];
double tmp_d = d[j];
m[j] = m[j+1]; d[j] = d[j+1];
m[j+1] = tmp_m; d[j+1] = tmp_d;
}
}
}
// simplifying the variables
double m_0 = m[0], m_1 = m[1], m_2 = m[2];
double d_0 = d[0], d_1 = d[1], d_2 = d[2];
double m2_0 = m_0 * m_0, m2_1 = m_1 * m_1, m2_2 = m_2 * m_2;
double d2_0 = d_0 * d_0, d2_1 = d_1 * d_1, d2_2 = d_2 * d_2;
dist_new = m_0 + d_0;
if(dist_new > m_1) {
double s = sqrt(- m2_0 + 2 * m_0 * m_1 - m2_1 + d2_0 + d2_1);
dist_new = ( m_1 * d2_0 + m_0 * d2_1 + d_0 * d_1 * s) / (d2_0 + d2_1);
if(dist_new > m_2) {
double a = sqrt(- m2_0 * d2_1 - m2_0 * d2_2 + 2 * m_0 * m_1 * d2_2
- m2_1 * d2_0 - m2_1 * d2_2 + 2 * m_0 * m_2 * d2_1
- m2_2 * d2_0 - m2_2 * d2_1 + 2 * m_1 * m_2 * d2_0
+ d2_0 * d2_1 + d2_0 * d2_2 + d2_1 * d2_2);
dist_new = (m_2 * d2_0 * d2_1 + m_1 * d2_0 * d2_2 + m_0 * d2_1 * d2_2 + d_0 * d_1 * d_2 * a) /
(d2_0 * d2_1 + d2_0 * d2_2 + d2_1 * d2_2);
}
}
return min(dist_old, dist_new);
}
static void set_distance_negative_inside(Phi * p, int totalNodes) {
int *l = &p->location[0];
double *d = &p->distance[0];
for(int i = 0; i < totalNodes; i++) {
if(*l != DEFAULT_BORDER_LOCATION && *d != DEFAULT_BORDER_DISTANCE ) {
if( *l == 1) *d = -1;
}
l++; d++;
}
}
static void adjust_boundary( Phi * p ) {
int x, y, z, xy, i, j, k;
x = NX;
y = NY;
z = NZ;
xy = x * y;
for(i = 0; i < z; i++){
for(j = 0; j < y; j++){
for(k = 0; k < x; k++){
int I = i, J = j, K = k;
I = (i == z-1) ? I-1 : (!i) ? I+1 : I;
J = (j == y-1) ? J-1 : (!j) ? J+1 : J;
K = (k == x-1) ? K-1 : (!k) ? K+1 : K;
if( i != I || j != J || k != K) {
int l_index = i * xy + j * x + k;
int r_index = I * xy + J * x + K;
p->distance[l_index] = p->distance[r_index];
}
}
}
}
}
/**************** vti_parser ********************************/
static void move_file_pointer(FILE *file_ptr, int lineNumber, int r) {
char tmpStr[512];
if(r) rewind(file_ptr);
while (lineNumber > 0){
fgets (tmpStr, 511, file_ptr);
lineNumber--;
}
}
void vti_get_dimensions(FILE *vti, double *d) {
char tmpStr[512];
rewind(vti);
while (1) {
fgets (tmpStr, 511, vti);
if ( strstr(tmpStr, "ImageData WholeExtent") ) {
sscanf(tmpStr, " <ImageData WholeExtent=\"0 %lf 0 %lf 0 %lf\" Spacing=\"%lf %lf %lf\">",
&d[0], &d[1], &d[2], &d[3], &d[4], &d[5]);
break;
}
}
}
void vti_get_data(FILE *vti, int *l, int b_l, double *d, double b_d, Grid3D g) {
// move the file pointer to
// line 6 from beginning
move_file_pointer(vti, 6, 1);
get_location(vti, l, b_l, g);
// move the file pointer 2 lines
// forward from its last position
move_file_pointer(vti, 2, 0);
get_distance(vti, d, b_d, g);
}
static void get_location(FILE *vti, int *l, int b_l, Grid3D g) {
int i, j, k, *t = &l[0];
for (i = 0; i < g.z; i++){
for (j = 0; j < g.y; j++) {
for (k = 0; k < g.x; k++) {
// Border
if (k == 0 || k == g.x-1 || j == 0 || j == g.y-1 || i == 0 || i == g.z-1 ) {
*(t++) = b_l;
}
else{ // Interior
fscanf(vti, "%d ", t++);
}
}
}
}
}
static void get_distance(FILE *vti, double *d, double b_d, Grid3D g) {
int i, j, k;
double *t = &d[0];
for (i = 0; i < g.z; i++){
for (j = 0; j < g.y; j++) {
for (k = 0; k < g.x; k++) {
// Border distance
if (k == 0 || k == g.x-1 || j == 0 || j == g.y-1 || i == 0 || i == g.z-1 ) {
*(t++) = b_d;
}
else{ // Interior distance
fscanf(vti, "%lf ", t++);
}
}
}
}
}
Grid3D make_grid3D(int x, int y, int z){
Grid3D g;
g.x = x; g.y = y; g.z = z;
return g;
}
The code works correctly when I drop the OpenACC directives and run it with [1-8] processes, but when I compile with OpenACC I get a CUDA error:
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
MPI compilation:
mpicc -Wall -g -std=c99 -I/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/include -L/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/lib -lmpi test.c -o mpi_exec.out
OpenACC compilation:
pgcc -acc -ta=tesla:managed -Minfo=accel -g -lm -I/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/include -L/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/lib -lmpi test.c -o oacc_exec.out
To run the executable, you need to pass in an input .vti file and an output file name:
mpirun -np <1-8> <executable> input.vti outputName
Link to the input file: input.vti
I want this code to be quite flexible: it should be able to run on a single node with 1 GPU using [1-8] processes, and also on [1-8] nodes with [1-2] GPUs each. I am not using CUDA MPS.
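For reference, the kind of per-rank GPU selection I have in mind would look roughly like the sketch below (this is not in the MCVE above; bind_rank_to_gpu is just an illustrative name). It uses an MPI-3 shared-memory split to get a node-local rank and the OpenACC runtime API to pick a device:
#include <mpi.h>
#include <openacc.h>
// Sketch only: bind each MPI rank to one of the GPUs on its own node.
static void bind_rank_to_gpu(void) {
    MPI_Comm node_comm;
    int local_rank, ngpus;
    // Ranks running on the same node end up in the same communicator.
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &node_comm);
    MPI_Comm_rank(node_comm, &local_rank);
    ngpus = acc_get_num_devices(acc_device_nvidia);
    if (ngpus > 0)
        acc_set_device_num(local_rank % ngpus, acc_device_nvidia);
    MPI_Comm_free(&node_comm);
}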
My setup:
GNU/Linux x86_64
NVIDIA GeForce GTX Titan CC: 3.5
pgcc 15.7-0 64-bit target on x86-64 Linux -tp sandybridge
gcc (GCC) 4.8.1
Any help or suggestions on this would be greatly appreciated.
Edit:
** Compiling with OpenACC
$ pgcc -fast -ta=tesla:managed -Minfo=accel -I/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/include -L/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/lib -lmpi rcrovella.c -o withacc
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 88)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 142)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 143)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 308)
fast_sweep:
225, Generating copy(p[:1])
228, Loop is parallelizable
230, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
228, #pragma acc loop gang /* blockIdx.y */
230, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
solveEikonal:
246, Generating acc routine seq
262, Loop is parallelizable
263, Loop carried dependence of m prevents parallelization
Loop carried backward dependence of m prevents vectorization
Loop carried dependence of d prevents parallelization
Loop carried backward dependence of d prevents vectorization
PGC/x86-64 Linux 15.7-0: compilation completed with warnings
** Compiling without OpenACC
pgcc -I/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/include -L/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/lib -lmpi rcrovella.c -o noacc
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 88)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 142)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 143)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 308)
PGC/x86-64 Linux 15.7-0: compilation completed with warnings
** Running with OpenACC
$ mpirun -n 1 withacc ../my_test/input.vti withacc1
PE: [0] - performing sweep number ..... [1/8]
PE: [0] - completed sweep number ...... [1/8]
PE: [0] - performing sweep number ..... [2/8]
PE: [0] - completed sweep number ...... [2/8]
PE: [0] - performing sweep number ..... [3/8]
PE: [0] - completed sweep number ...... [3/8]
PE: [0] - performing sweep number ..... [4/8]
PE: [0] - completed sweep number ...... [4/8]
PE: [0] - performing sweep number ..... [5/8]
PE: [0] - completed sweep number ...... [5/8]
PE: [0] - performing sweep number ..... [6/8]
PE: [0] - completed sweep number ...... [6/8]
PE: [0] - performing sweep number ..... [7/8]
PE: [0] - completed sweep number ...... [7/8]
PE: [0] - performing sweep number ..... [8/8]
PE: [0] - completed sweep number ...... [8/8]
withacc1 file created
** Running without OpenACC
$ mpirun -n 1 noacc ../my_test/input.vti noacc1
PE: [0] - performing sweep number ..... [1/8]
PE: [0] - completed sweep number ...... [1/8]
PE: [0] - performing sweep number ..... [2/8]
PE: [0] - completed sweep number ...... [2/8]
PE: [0] - performing sweep number ..... [3/8]
PE: [0] - completed sweep number ...... [3/8]
PE: [0] - performing sweep number ..... [4/8]
PE: [0] - completed sweep number ...... [4/8]
PE: [0] - performing sweep number ..... [5/8]
PE: [0] - completed sweep number ...... [5/8]
PE: [0] - performing sweep number ..... [6/8]
PE: [0] - completed sweep number ...... [6/8]
PE: [0] - performing sweep number ..... [7/8]
PE: [0] - completed sweep number ...... [7/8]
PE: [0] - performing sweep number ..... [8/8]
PE: [0] - completed sweep number ...... [8/8]
noacc1 file created
** Compare
$ diff -q noacc1.txt withacc1.txt
Files noacc1.txt and withacc1.txt differ
Best answer
Also, in this version I couldn't get OpenACC to work at all; however, a solution to this should help me a lot.
Here is what I found:
When you use the managed memory facility (-ta=tesla:managed), we normally don't include data directives or clauses in the code. The idea is to let the CUDA managed-memory runtime handle the data movement for us. So I commented out the two data directives that I considered "extraneous".
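For example (a minimal sketch, not taken from your code; scale is a made-up helper), with managed memory an offloaded loop over a heap-allocated array needs no data clauses at all:
#include <stddef.h>
// Sketch: with -ta=tesla:managed the runtime migrates heap allocations,
// so no data directive or update clause is needed around the region.
void scale(double *restrict dist, size_t n, double factor) {
    #pragma acc kernels loop independent
    for (size_t i = 0; i < n; i++)
        dist[i] *= factor;
}
// Without managed memory the movement would be explicit, e.g. the region
// would be wrapped in #pragma acc data copy(dist[0:n]).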
I think your parallel accelerator directive is malformed. My compiler (PGI 15.7) tells me that the independent clause inside the parallel region is incorrect:
PGCC-S-0155-Illegal context(parallel) for independent (t2.c: 228)
Changing #pragma acc parallel to #pragma acc kernels is one possible workaround.
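In isolation the change is just the outer directive (a sketch; xs, xe and the loop body stand in for the real code, which is shown in full below):
/* rejected by PGI 15.7 in this context: */
#pragma acc parallel
{
    #pragma acc loop independent
    for (x = xs; x <= xe; x++) { /* ... */ }
}
/* accepted: */
#pragma acc kernels
{
    #pragma acc loop independent
    for (x = xs; x <= xe; x++) { /* ... */ }
}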
Your code produces some compiler warnings around its use of INFINITY. Since they are only warnings, I did not bother to address them.
For some reason, I found that the compiler was not handling the SweepInfo struct (s) correctly on entry to the accelerator region. To work around this, I changed this:
int x, y, z, i, j, k, index;
#pragma acc parallel
{
#pragma acc loop independent
for(x = xs; x <= xe; x++) {
#pragma acc loop independent
for(y = ys; y <= ye; y++) {
z = level - (x+y);
if(z > 0 && z <= NZ-2) {
i = abs(z-s->zSweepOff);
j = abs(y-s->ySweepOff);
k = abs(x-s->xSweepOff);
to this:
int x, y, z, i, j, k, index;
int xSO = s->xSweepOff;
int ySO = s->ySweepOff;
int zSO = s->zSweepOff;
#pragma acc kernels
{
#pragma acc loop independent
for(x = xs; x <= xe; x++) {
#pragma acc loop independent
for(y = ys; y <= ye; y++) {
z = level - (x+y);
if(z > 0 && z <= NZ-2) {
i = abs(z-zSO);
j = abs(y-ySO);
k = abs(x-xSO);
I may look at this more closely. I think there is either a limitation here that I don't understand, or a compiler bug.
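(One untested alternative, an assumption on my part that I did not use below: since SweepInfo holds only scalars, explicitly copying the struct into the region might also work; s lives on the stack, and automatic variables are not covered by the managed-memory facility, which may be related.)
/* Untested sketch: copy the SweepInfo struct to the device explicitly. */
#pragma acc kernels copyin(s[0:1])
{
    #pragma acc loop independent
    for (x = xs; x <= xe; x++) {
        /* ... same loop body as above, using s->zSweepOff etc. ... */
    }
}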
With the changes described above, I was able to get your code to run to completion without any obvious problems. Here is my modified code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include "mpi.h"
#define min(a,b) (a > b) ? b : a
#define max(a,b) (a < b) ? b : a
#define NPES 8 // max number of PEs allowed
#define DEFAULT_BORDER_LOCATION -1
#define DEFAULT_BORDER_DISTANCE INFINITY
#define DEFAULT_INTERIOR_DISTANCE 90000
typedef struct {
int order;
int firstLevel, lastLevel, level;
int xDim, yDim, zDim;
int xSweepOff, ySweepOff, zSweepOff;
double dx, dy, dz;
} SweepInfo;
typedef struct {
double dx, dy, dz;
int * location;
double * distance;
} Phi;
typedef struct {
int x, y, z;
} Grid3D;
void calc_dist_field( Phi * p, int totalNodes );
void write_to_file(double * dist);
static SweepInfo make_sweepInfo( Phi * p, int my_rank );
static void fast_sweep( Phi * p, SweepInfo * s );
static double solveEikonal(Phi * p, int index, int max_x, int max_y);
static void update_distance(Phi * p, int totalNodes);
static void set_distance_negative_inside(Phi * p, int totalNodes);
static void adjust_boundary( Phi * p );
// public method declarations
Grid3D make_grid3D(int x, int y, int z);
void vti_get_dimensions(FILE *vti, double *d);
void vti_get_data(FILE *vti, int *l, int b_l, double *d, double b_d, Grid3D g);
// private method declarations
static void move_file_pointer(FILE *file_ptr, int lineNumber, int r);
static void get_location(FILE *vti, int *l, int b_l, Grid3D g);
static void get_distance(FILE *vti, double *d, double b_d, Grid3D g);
static int npes; // Number of PEs
static int my_rank; // Rank of the PE
static char * fileName;
static char * outfileName;
static int NX, NY, NZ, totalNodes;
int main(int argc, char *argv[]) {
// MPI startup routine
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &npes);
fileName = argv[1];
outfileName = argv[2];
FILE *f = fopen(fileName, "r");
double dims[6];
vti_get_dimensions(f, dims);
NX = dims[0] + 3;
NY = dims[1] + 3;
NZ = dims[2] + 3;
totalNodes = NX * NY * NZ;
Phi *p = (Phi *) malloc(sizeof(Phi));
p->location = (int *) malloc(sizeof(int) * totalNodes);
p->distance = (double *) malloc(sizeof(double) * totalNodes);
p->dx = dims[3]; p->dy = dims[4]; p->dz = dims[5];
vti_get_data( f, p->location, DEFAULT_BORDER_LOCATION,
p->distance, DEFAULT_BORDER_DISTANCE,
make_grid3D(NX, NY, NZ));
update_distance(p, totalNodes);
calc_dist_field(p, totalNodes);
MPI_Finalize();
return 0;
}
void calc_dist_field( Phi * p, int totalNodes ) {
int sweepNumber = my_rank + 1;
double * tmp_dist;
MPI_Barrier(MPI_COMM_WORLD);
if(my_rank == 0){
tmp_dist = (double *) malloc( totalNodes * sizeof(double) );
}
// sn represents the sweep number
for( int sn = sweepNumber; sn <= NPES; sn += npes) {
SweepInfo s = make_sweepInfo(p, sn);
printf("PE: [%d] - performing sweep number ..... [%d/%d]\n", my_rank, sn, NPES);
fast_sweep(p, &s);
printf("PE: [%d] - completed sweep number ...... [%d/%d]\n", my_rank, sn, NPES);
}
MPI_Barrier(MPI_COMM_WORLD);
// #pragma acc update host(p->distance[0:totalNodes])
MPI_Reduce(p->distance, tmp_dist, totalNodes, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
if( my_rank == 0 ) {
free( p->distance );
p->distance = tmp_dist;
set_distance_negative_inside(p, totalNodes);
adjust_boundary(p);
write_to_file(p->distance);
printf("%s file created\n", outfileName);
}
}
static void update_distance(Phi * p, int totalNodes) {
int *l = &p->location[0];
double *d = &p->distance[0];
for(int i = 0; i < totalNodes; i++) {
if(*l != DEFAULT_BORDER_LOCATION && *d != DEFAULT_BORDER_DISTANCE ) {
*d = (*l == 1 && *d == INFINITY) ? -1 : (*d > 0.0 || *d < 0.0) ? *d : DEFAULT_INTERIOR_DISTANCE;
}
l++; d++;
}
}
void write_to_file(double * dist) {
int x = NX;
int y = NY;
int z = NZ;
char fname[255];
sprintf(fname, "%s.txt", outfileName);
FILE *fp = fopen(fname, "w");
int i,j,k;
double *t = &dist[0];
for(i = 0; i < z; i++){
for(j = 0; j < y; j++){
for(k = 0; k < x; k++) {
fprintf(fp, "%f ", *(t++));
}
fprintf(fp, "\n");
}
fprintf(fp, "\n");
}
}
static SweepInfo make_sweepInfo( Phi * p, int my_rank ) {
SweepInfo s;
s.order = my_rank;
s.firstLevel = 3;
s.lastLevel = (NX + NY + NZ) - 6;
s.xDim = NX-2; s.dx = p->dx;
s.yDim = NY-2; s.dy = p->dy;
s.zDim = NZ-2; s.dz = p->dz;
s.xSweepOff = (s.order == 4 || s.order == 8 ) ? s.xDim + 1 : 0;
s.ySweepOff = (s.order == 2 || s.order == 6 ) ? s.yDim + 1 : 0;
s.zSweepOff = (s.order == 3 || s.order == 7 ) ? s.zDim + 1 : 0;
return s;
}
static void fast_sweep( Phi * p, SweepInfo * s ) {
int start, end, incr;
start = ( s->order == 2 || s->order == 5 || s->order == 7 || s->order == 8 ) ? s->lastLevel : s->firstLevel;
if ( start == s->firstLevel ) {
end = s->lastLevel + 1;
incr = 1;
}
else {
end = s->firstLevel - 1;
incr = 0;
}
int max_x = s->xDim + 2;
int max_y = s->yDim + 2;
int max_xy = max_x * max_y;
//#pragma acc data create(p[0:1]) copy(p->distance[0:totalNodes])
for(int level = start; level != end; level = (incr) ? level+1 : level-1) {
// s - start, e - end
int xs, xe, ys, ye;
xs = max(1, level-(s->yDim + s->zDim)) , ys = max(1,level-(s->xDim + s->zDim));
xe = min(s->xDim, level-(s->firstLevel-1)), ye = min(s->yDim, level-(s->firstLevel-1));
int x, y, z, i, j, k, index;
int xSO = s->xSweepOff;
int ySO = s->ySweepOff;
int zSO = s->zSweepOff;
#pragma acc kernels
{
#pragma acc loop independent
for(x = xs; x <= xe; x++) {
#pragma acc loop independent
for(y = ys; y <= ye; y++) {
z = level - (x+y);
if(z > 0 && z <= NZ-2) {
i = abs(z-zSO);
j = abs(y-ySO);
k = abs(x-xSO);
index = i * max_xy + j * max_x + k;
p->distance[index] = solveEikonal(p, index, NX, NY);
}
}
} // end of acc parallel
}
}
}
#pragma acc routine seq
static double solveEikonal(Phi * p, int index, int max_x, int max_y) {
int max_xy = max_x * max_y;
double dist_new = 0;
double dist_old = p->distance[index];
double dx = p->dx, dy = p->dy, dz = p->dz;
double minX = min(p->distance[index-1], p->distance[index+1]);
double minY = min(p->distance[abs(index-max_x)], p->distance[abs(index+max_x)]);
double minZ = min(p->distance[abs(index-max_xy)],p->distance[abs(index+max_xy)]);
double m[] = { minX, minY, minZ} ;
double d[] = { dx, dy, dz};
// sort the mins
for(int i = 1; i < 3; i++){
for(int j = 0; j < 3-i; j++) {
if(m[j] > m[j+1]) {
double tmp_m = m[j];
double tmp_d = d[j];
m[j] = m[j+1]; d[j] = d[j+1];
m[j+1] = tmp_m; d[j+1] = tmp_d;
}
}
}
// simplifying the variables
double m_0 = m[0], m_1 = m[1], m_2 = m[2];
double d_0 = d[0], d_1 = d[1], d_2 = d[2];
double m2_0 = m_0 * m_0, m2_1 = m_1 * m_1, m2_2 = m_2 * m_2;
double d2_0 = d_0 * d_0, d2_1 = d_1 * d_1, d2_2 = d_2 * d_2;
dist_new = m_0 + d_0;
if(dist_new > m_1) {
double s = sqrt(- m2_0 + 2 * m_0 * m_1 - m2_1 + d2_0 + d2_1);
dist_new = ( m_1 * d2_0 + m_0 * d2_1 + d_0 * d_1 * s) / (d2_0 + d2_1);
if(dist_new > m_2) {
double a = sqrt(- m2_0 * d2_1 - m2_0 * d2_2 + 2 * m_0 * m_1 * d2_2
- m2_1 * d2_0 - m2_1 * d2_2 + 2 * m_0 * m_2 * d2_1
- m2_2 * d2_0 - m2_2 * d2_1 + 2 * m_1 * m_2 * d2_0
+ d2_0 * d2_1 + d2_0 * d2_2 + d2_1 * d2_2);
dist_new = (m_2 * d2_0 * d2_1 + m_1 * d2_0 * d2_2 + m_0 * d2_1 * d2_2 + d_0 * d_1 * d_2 * a) /
(d2_0 * d2_1 + d2_0 * d2_2 + d2_1 * d2_2);
}
}
return min(dist_old, dist_new);
}
static void set_distance_negative_inside(Phi * p, int totalNodes) {
int *l = &p->location[0];
double *d = &p->distance[0];
for(int i = 0; i < totalNodes; i++) {
if(*l != DEFAULT_BORDER_LOCATION && *d != DEFAULT_BORDER_DISTANCE ) {
if( *l == 1) *d = -1;
}
l++; d++;
}
}
static void adjust_boundary( Phi * p ) {
int x, y, z, xy, i, j, k;
x = NX;
y = NY;
z = NZ;
xy = x * y;
for(i = 0; i < z; i++){
for(j = 0; j < y; j++){
for(k = 0; k < x; k++){
int I = i, J = j, K = k;
I = (i == z-1) ? I-1 : (!i) ? I+1 : I;
J = (j == y-1) ? J-1 : (!j) ? J+1 : J;
K = (k == x-1) ? K-1 : (!k) ? K+1 : K;
if( i != I || j != J || k != K) {
int l_index = i * xy + j * x + k;
int r_index = I * xy + J * x + K;
p->distance[l_index] = p->distance[r_index];
}
}
}
}
}
/**************** vti_parser ********************************/
static void move_file_pointer(FILE *file_ptr, int lineNumber, int r) {
char tmpStr[512];
if(r) rewind(file_ptr);
while (lineNumber > 0){
fgets (tmpStr, 511, file_ptr);
lineNumber--;
}
}
void vti_get_dimensions(FILE *vti, double *d) {
char tmpStr[512];
rewind(vti);
while (1) {
fgets (tmpStr, 511, vti);
if ( strstr(tmpStr, "ImageData WholeExtent") ) {
sscanf(tmpStr, " <ImageData WholeExtent=\"0 %lf 0 %lf 0 %lf\" Spacing=\"%lf %lf %lf\">",
&d[0], &d[1], &d[2], &d[3], &d[4], &d[5]);
break;
}
}
}
void vti_get_data(FILE *vti, int *l, int b_l, double *d, double b_d, Grid3D g) {
// move the file pointer to
// line 6 from beginning
move_file_pointer(vti, 6, 1);
get_location(vti, l, b_l, g);
// move the file pointer 2 lines
// forward from its last position
move_file_pointer(vti, 2, 0);
get_distance(vti, d, b_d, g);
}
static void get_location(FILE *vti, int *l, int b_l, Grid3D g) {
int i, j, k, *t = &l[0];
for (i = 0; i < g.z; i++){
for (j = 0; j < g.y; j++) {
for (k = 0; k < g.x; k++) {
// Border
if (k == 0 || k == g.x-1 || j == 0 || j == g.y-1 || i == 0 || i == g.z-1 ) {
*(t++) = b_l;
}
else{ // Interior
fscanf(vti, "%d ", t++);
}
}
}
}
}
static void get_distance(FILE *vti, double *d, double b_d, Grid3D g) {
int i, j, k;
double *t = &d[0];
for (i = 0; i < g.z; i++){
for (j = 0; j < g.y; j++) {
for (k = 0; k < g.x; k++) {
// Border distance
if (k == 0 || k == g.x-1 || j == 0 || j == g.y-1 || i == 0 || i == g.z-1 ) {
*(t++) = b_d;
}
else{ // Interior distance
fscanf(vti, "%lf ", t++);
}
}
}
}
}
Grid3D make_grid3D(int x, int y, int z){
Grid3D g;
g.x = x; g.y = y; g.z = z;
return g;
}
Here is my compile command:
pgc++ -fast -acc -ta=tesla:managed -Minfo=accel -I/opt/pgi/linux86-64/15.7/mpi/mpich/include -L/opt/pgi/linux86-64/15.7/mpi/mpich/lib -lmpi t2.c -o t2
Here is the output:
$ LD_LIBRARY_PATH=/opt/pgi/linux86-64/15.7/mpi/mpich/lib ./t2 input.vti output
PE: [0] - performing sweep number ..... [1/8]
PE: [0] - completed sweep number ...... [1/8]
PE: [0] - performing sweep number ..... [2/8]
PE: [0] - completed sweep number ...... [2/8]
PE: [0] - performing sweep number ..... [3/8]
PE: [0] - completed sweep number ...... [3/8]
PE: [0] - performing sweep number ..... [4/8]
PE: [0] - completed sweep number ...... [4/8]
PE: [0] - performing sweep number ..... [5/8]
PE: [0] - completed sweep number ...... [5/8]
PE: [0] - performing sweep number ..... [6/8]
PE: [0] - completed sweep number ...... [6/8]
PE: [0] - performing sweep number ..... [7/8]
PE: [0] - completed sweep number ...... [7/8]
PE: [0] - performing sweep number ..... [8/8]
PE: [0] - completed sweep number ...... [8/8]
output file created
$
In my case it also seems to run faster than the non-OpenACC version (compiled without -acc -ta=tesla:managed -Minfo=accel), and I diff-ed the output files that were created; they are identical between the non-OpenACC and OpenACC versions.
I also tried running this code with 2 MPI ranks. It appears to run without crashing:
$ LD_LIBRARY_PATH=/opt/pgi/linux86-64/15.7/mpi/mpich/lib /opt/pgi/linux86-64/15.7/mpi/mpich/bin/mpirun -n 2 ./t2 input.vti output
PE: [1] - performing sweep number ..... [2/8]
PE: [0] - performing sweep number ..... [1/8]
PE: [0] - completed sweep number ...... [1/8]
PE: [0] - performing sweep number ..... [3/8]
PE: [1] - completed sweep number ...... [2/8]
PE: [1] - performing sweep number ..... [4/8]
PE: [0] - completed sweep number ...... [3/8]
PE: [0] - performing sweep number ..... [5/8]
PE: [1] - completed sweep number ...... [4/8]
PE: [1] - performing sweep number ..... [6/8]
PE: [0] - completed sweep number ...... [5/8]
PE: [0] - performing sweep number ..... [7/8]
PE: [1] - completed sweep number ...... [6/8]
PE: [1] - performing sweep number ..... [8/8]
PE: [0] - completed sweep number ...... [7/8]
PE: [1] - completed sweep number ...... [8/8]
output file created
$
The output data file differs from the single-rank version, but it matches the one produced by a two-rank non-OpenACC run. So if there are any remaining issues, I believe they are MPI-related, not OpenACC-related.
Edit:
To expand on that last point, let's take OpenACC out of the picture and just use the MPI (MPICH) that ships with the PGI 15.7 toolchain:
$ /opt/pgi/linux86-64/15.7/mpi/mpich/bin/mpicc t2.c -o t2 -lmpi
PGC-W-0129-Floating point overflow. Check constants and constant expressions (t2.c: 88)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (t2.c: 142)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (t2.c: 143)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (t2.c: 308)
PGC/x86-64 Linux 15.7-0: compilation completed with warnings
$ LD_LIBRARY_PATH=/opt/pgi/linux86-64/15.7/mpi/mpich/lib /opt/pgi/linux86-64/15.7/mpi/mpich/bin/mpirun -n 1 ./t2 input.vti output1rank
PE: [0] - performing sweep number ..... [1/8]
PE: [0] - completed sweep number ...... [1/8]
PE: [0] - performing sweep number ..... [2/8]
PE: [0] - completed sweep number ...... [2/8]
PE: [0] - performing sweep number ..... [3/8]
PE: [0] - completed sweep number ...... [3/8]
PE: [0] - performing sweep number ..... [4/8]
PE: [0] - completed sweep number ...... [4/8]
PE: [0] - performing sweep number ..... [5/8]
PE: [0] - completed sweep number ...... [5/8]
PE: [0] - performing sweep number ..... [6/8]
PE: [0] - completed sweep number ...... [6/8]
PE: [0] - performing sweep number ..... [7/8]
PE: [0] - completed sweep number ...... [7/8]
PE: [0] - performing sweep number ..... [8/8]
PE: [0] - completed sweep number ...... [8/8]
output1rank file created
$ LD_LIBRARY_PATH=/opt/pgi/linux86-64/15.7/mpi/mpich/lib /opt/pgi/linux86-64/15.7/mpi/mpich/bin/mpirun -n 2 ./t2 input.vti output2rank
PE: [0] - performing sweep number ..... [1/8]
PE: [1] - performing sweep number ..... [2/8]
PE: [1] - completed sweep number ...... [2/8]
PE: [1] - performing sweep number ..... [4/8]
PE: [0] - completed sweep number ...... [1/8]
PE: [0] - performing sweep number ..... [3/8]
PE: [1] - completed sweep number ...... [4/8]
PE: [1] - performing sweep number ..... [6/8]
PE: [0] - completed sweep number ...... [3/8]
PE: [0] - performing sweep number ..... [5/8]
PE: [1] - completed sweep number ...... [6/8]
PE: [1] - performing sweep number ..... [8/8]
PE: [1] - completed sweep number ...... [8/8]
PE: [0] - completed sweep number ...... [5/8]
PE: [0] - performing sweep number ..... [7/8]
PE: [0] - completed sweep number ...... [7/8]
output2rank file created
$ diff -q output1rank.txt output2rank.txt
Files output1rank.txt and output2rank.txt differ
$
About "c - Writing a hybrid MPI/OpenACC program": a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/33130676/