gpt4 book ai didi

比较 C 中不同大小的矩阵乘法执行时间

转载 作者:行者123 更新时间:2023-12-05 02:32:56 30 4
gpt4 key购买 nike

我需要用 C 语言计算和比较 3 种不同大小(100 * 100、1000 * 1000 和 10000 * 10000)的 2 个矩阵相乘的执行时间。我编写了以下简单代码来为 1000 * 1000 执行此操作,我得到了执行时间

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main()
{
int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;

// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);

// Storing elements of first matrix.
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c1; ++j)
{
(*a)[i][j] = rand() / RAND_MAX;
}
}

// Storing elements of second matrix.
for (i = 0; i < r2; ++i)
{
for (j = 0; j < c2; ++j)
{
(*b)[i][j] = rand() / RAND_MAX;
}
}

// Initializing all elements of result matrix to 0
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c2; ++j)
{
(*result)[i][j] = 0;
}
}

clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; ++i)
for (j = 0; j < c2; ++j)
for (k = 0; k < c1; ++k)
{
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
}

clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n function took %f seconds to execute \n", time_taken);
return 0;
}

现在我想对其他两种尺寸重复这部分,并在我的程序结束时运行一次得到这样的结果:

the execution time for 100 * 100 is 1 second
the execution time for 1000 * 1000 is 2 seconds
the execution time for 10000 * 10000 is 3 seconds

最好的解决方案是什么?当我在 1000 * 1000 之后重复这部分 10000 * 10000 时,我得到了段错误。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main()
{
int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;

// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);

// Storing elements of first matrix.
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c1; ++j)
{
(*a)[i][j] = rand() / RAND_MAX;
}
}

// Storing elements of second matrix.
for (i = 0; i < r2; ++i)
{
for (j = 0; j < c2; ++j)
{
(*b)[i][j] = rand() / RAND_MAX;
}
}

// Initializing all elements of result matrix to 0
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c2; ++j)
{
(*result)[i][j] = 0;
}
}

clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; ++i)
for (j = 0; j < c2; ++j)
for (k = 0; k < c1; ++k)
{
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
}

clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n \nfunction took %f seconds to execute \n",
time_taken);
free(a);
free(b);
free(result);

r1 = 10000, c1 = 10000, r2 = 10000, c2 = 10000;
printf("\n run second one for %d \n",r1);
// Storing elements of first matrix.
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c1; ++j)
{
(*a)[i][j] = rand() / RAND_MAX;
}
}

// Storing elements of second matrix.
for (i = 0; i < r2; ++i)
{
for (j = 0; j < c2; ++j)
{
(*b)[i][j] = rand() / RAND_MAX;
}
}

// Initializing all elements of result matrix to 0
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c2; ++j)
{
(*result)[i][j] = 0;
}
}

begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; ++i)
for (j = 0; j < c2; ++j)
for (k = 0; k < c1; ++k)
{
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
}

end1 = clock();
time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n second function took %f seconds to execute \n",
time_taken);
free(a);
free(b);
free(result);

return 0;
}

最佳答案

程序的简化版本:

...
int main()
{

int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;

// Dynamic allocation.

double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);

...

free(a);
free(b);
free(result);

r1 = 10000, c1 = 10000, r2 = 10000, c2 = 10000;
for (i = 0; i < r1; ++i)
for (j = 0; j < c1; ++j)
(*a)[i][j] = rand() /RAND_MAX; // KABOOM !
...
}

有关 VLA 阵列的快速但重要的信息。在“variable-length-array”中命名“variable”意味着大小存储在变量中,而不是大小是变量。此变量是隐藏的,只能通过 sizeof 运算符读取。

数组的大小与其类型有关,而不是其值。因此,无论对象是动态的还是自动的,VLA 类型(和对象)的维度都不能改变。

行:

double(*a)[r1][c1] = malloc(sizeof *a);

它解释为:

typedef double __hidden_type[r1][c1];
__hidden_type* a = malloc(sizeof *a);

... changes of r1 or c1 do not affect sizeof(__hidden_type)

在定义类型时,大小会绑定(bind)到类型。之后类型是不可变的。

因此改变r1不会改变*a的大小。您需要创建一个新的 a(或者说它的类型)并为这个新的 *a 分配内存。

我建议将整个测试移动到一个函数,该函数将 r1r2c1c2 作为参数。数组将是函数的局部数组。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void bench(int r1, int c1, int r2, int c2) {
int i, j, k;

// Dynamic allocation.

double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);

// Storing elements of first matrix.
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c1; ++j)
{
(*a)[i][j] = rand() /RAND_MAX;
}
}

// Storing elements of second matrix.

for (i = 0; i < r2; ++i)
{
for (j = 0; j < c2; ++j)
{
(*b)[i][j] = rand()/ RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c2; ++j)
{
(*result)[i][j] = 0;
}
}
clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; ++i)
for (j = 0; j < c2; ++j)
for (k = 0; k < c1; ++k)
{
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
}

clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n \nfunction took %f seconds to execute \n", time_taken);
free(a);
free(b);
free(result);
}

int main()
{
bench(1000, 1000, 1000, 1000);
bench(2000, 2000, 2000, 2000);
}

我已将大小从 10000 减少到 2000,以便在合理的时间内获得结果。在我的机器上,我得到了:

function took 1.966788 seconds to execute 
function took 37.370633 seconds to execute

请注意,该函数对缓存非常不友好。

    for (i = 0; i < r1; ++i)
for (j = 0; j < c2; ++j)
for (k = 0; k < c1; ++k)
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];

k 的每次迭代中,访问 (*b)[k][j] 时都会出现缓存未命中。尝试交换 jk 循环:

    for (i = 0; i < r1; ++i)
for (k = 0; k < c1; ++k)
for (j = 0; j < c2; ++j)
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];

现在当增加 j 然后 (*result)[i][j](*b)[k][j]可能在缓存中。在我的机器上,这个微不足道的改变带来了 10 倍的加速:

function took 0.319594 seconds to execute 
function took 3.829459 seconds to execute

关于比较 C 中不同大小的矩阵乘法执行时间,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/71084580/

30 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com