
c - SIMD with AltiVec: why is multiplying two vectors faster than adding two vectors?


I've been implementing basic math operations with AltiVec as a way of learning SIMD for an upcoming project. As a way of seeing the performance benefit, I've also been timing how long the operations take, and I've run into something odd.

The first thing I did was add two vectors together and subtract two vectors. That works fine. The next thing I did was multiply two vectors. However, the multiply is faster than the add, even though my particular CPU's datasheet says the add instruction should take fewer clock cycles than the instructions used for the multiply.

I have two arrays, each 10 MB in size, and I run them through these two routines:

void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size)
{
    int iterations = size / (sizeof(__vector int32_t) / sizeof(int32_t));

    __vector int32_t* tempA = (__vector int32_t *) intArrayA;
    __vector int32_t* tempB = (__vector int32_t *) intArrayB;
    __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;
    for(int i = 0; i < iterations; i++)
    {
        __vector int32_t sum = vec_add(*tempA, *tempB);
        vec_st(sum, 0, tempOut);

        tempA++;
        tempB++;
        tempOut++;
    }
}

void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
{
    int iterations = size / (sizeof(__vector int16_t) / sizeof(int16_t));
    __vector int16_t* tempA = (__vector int16_t *) intArrayA;
    __vector int16_t* tempB = (__vector int16_t *) intArrayB;
    __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;

    for(int i = 0; i < iterations; i++)
    {
        __vector int32_t productEven = vec_mule(*tempA, *tempB);
        __vector int32_t productOdd = vec_mulo(*tempA, *tempB);

        __vector int32_t mergedProductHigh = vec_mergeh(productEven, productOdd);
        __vector int32_t mergedProductLow = vec_mergel(productEven, productOdd);

        vec_st(mergedProductHigh, 0, tempOut);
        tempOut++;
        vec_st(mergedProductLow, 0, tempOut);

        tempA++;
        tempB++;
        tempOut++;
    }
}
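
As an aside, here is a scalar sketch (my own illustration, not part of the original post) of what one iteration of av_MultiplyValues computes, assuming big-endian AltiVec element numbering: vec_mule multiplies the even-numbered 16-bit lanes, vec_mulo the odd-numbered ones, and vec_mergeh/vec_mergel interleave the two product vectors so the eight 32-bit results are stored back in their original element order:

// Scalar equivalent of one loop iteration (8 int16_t inputs -> 8 int32_t outputs).
// Hypothetical illustration; lane numbering assumes big-endian AltiVec semantics.
void scalarMultiplyOneVector(const int16_t a[8], const int16_t b[8], int32_t out[8])
{
    int32_t even[4], odd[4];
    for(int i = 0; i < 4; i++)
    {
        even[i] = (int32_t) a[2 * i]     * b[2 * i];     // vec_mule
        odd[i]  = (int32_t) a[2 * i + 1] * b[2 * i + 1]; // vec_mulo
    }
    // vec_mergeh interleaves the first two pairs, vec_mergel the last two,
    // so the stores write the products back in element order 0..7.
    out[0] = even[0]; out[1] = odd[0]; out[2] = even[1]; out[3] = odd[1]; // mergeh
    out[4] = even[2]; out[5] = odd[2]; out[6] = even[3]; out[7] = odd[3]; // mergel
}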

On my particular platform, av_AddValues takes 81 ms and av_MultiplyValues takes 48 ms (times recorded with std::chrono::high_resolution_clock).

Why does the multiply take less time to process than the add?

Given that the __vector types always work on 16 bytes of data, I wouldn't have thought that adding 32-bit values and multiplying 16-bit values would make much of a difference.
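
For concreteness, here is a rough count of my own (not part of the original question) of what each loop actually touches per call, with size = 10'485'760:

    av_AddValues:      4 x int32_t per 16-byte vector -> size / 4 = 2'621'440 iterations,
                       each doing 2 x 16-byte loads + 1 x 16-byte store = 48 bytes
    av_MultiplyValues: 8 x int16_t per 16-byte vector -> size / 8 = 1'310'720 iterations,
                       each doing 2 x 16-byte loads + 2 x 16-byte stores = 64 bytes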

My first thought was that, since adding numbers is such a trivial task, the CPU finishes the operation faster than it can fetch the data from memory, whereas in the multiply case that fetch latency is hidden by the fact that the CPU is kept busy with work and doesn't have to wait as long.

Is that a correct assumption to make?

Full code:

#include <chrono>
#include <random>
#include <limits>

#include <iostream>
#include <cassert>
#include <cstring>
#include <cstdint>
#include <malloc.h>

#include <altivec.h>
#undef vector

void GenerateRandom16bitValues(int16_t* inputABuffer, int16_t* inputBBuffer, int32_t* outputBuffer, int size);
void GenerateRandom32bitValues(int32_t* inputABuffer, int32_t* inputBBuffer, int32_t* outputBuffer, int size);
void TestAdd();
void TestMultiply();
void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size);
void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size);

int main()
{
    TestAdd();
    TestMultiply();
}

void GenerateRandom16bitValues(int16_t* inputABuffer, int16_t* inputBBuffer, int32_t* outputBuffer, int size)
{
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(std::numeric_limits<int16_t>::min(), std::numeric_limits<int16_t>::max());

    for(int i = 0; i < size; i++)
    {
        inputABuffer[i] = dis(gen);
        inputBBuffer[i] = dis(gen);
        outputBuffer[i] = 0;
    }
}

void GenerateRandom32bitValues(int32_t* inputABuffer, int32_t* inputBBuffer, int32_t* outputBuffer, int size)
{
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max());

    for(int i = 0; i < size; i++)
    {
        inputABuffer[i] = dis(gen);
        inputBBuffer[i] = dis(gen);
        outputBuffer[i] = 0;
    }
}

void TestAdd()
{
    int size = 10'485'760;
    int bytes = size * sizeof(int32_t);

    int32_t* inputABuffer = (int32_t*) memalign(64, bytes);
    int32_t* inputBBuffer = (int32_t*) memalign(64, bytes);
    int32_t* outputBuffer = (int32_t*) memalign(64, bytes);
    assert(inputABuffer != nullptr);
    assert(inputBBuffer != nullptr);
    assert(outputBuffer != nullptr);

    GenerateRandom32bitValues(inputABuffer, inputBBuffer, outputBuffer, size);

    for(int i = 0; i < 20; i++)
    {
        auto start = std::chrono::high_resolution_clock::now();
        av_AddValues(inputABuffer, inputBBuffer, outputBuffer, size);
        auto end = std::chrono::high_resolution_clock::now();
        auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

        for(int k = 0; k < size; k++)
        {
            assert(outputBuffer[k] == (inputABuffer[k] + inputBBuffer[k]));
        }

        std::cout << "Vector Sum - " << diff.count() << "ms\n";
        memset(outputBuffer, 0, size);
    }
}

void TestMultiply()
{
    int size = 10'485'760;
    int16_t* inputABuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
    int16_t* inputBBuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
    int32_t* outputBuffer = (int32_t*) memalign(64, size * sizeof(int32_t));
    assert(inputABuffer != nullptr);
    assert(inputBBuffer != nullptr);
    assert(outputBuffer != nullptr);

    GenerateRandom16bitValues(inputABuffer, inputBBuffer, outputBuffer, size);

    for(int i = 0; i < 20; i++)
    {
        auto start = std::chrono::high_resolution_clock::now();
        av_MultiplyValues(inputABuffer, inputBBuffer, outputBuffer, size);
        auto end = std::chrono::high_resolution_clock::now();
        auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

        for(int k = 0; k < size; k++)
        {
            assert(outputBuffer[k] == (inputABuffer[k] * inputBBuffer[k]));
        }

        std::cout << "Vector product - " << diff.count() << "ms\n";
        memset(outputBuffer, 0, size);
    }
}

void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size)
{
    int iterations = size / (sizeof(__vector int32_t) / sizeof(int32_t));

    __vector int32_t* tempA = (__vector int32_t *) intArrayA;
    __vector int32_t* tempB = (__vector int32_t *) intArrayB;
    __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;

    for(int i = 0; i < iterations; i++)
    {
        __vector int32_t sum = vec_add(*tempA, *tempB);
        vec_st(sum, 0, tempOut);

        tempA++;
        tempB++;
        tempOut++;
    }
}

void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
{
    int iterations = size / (sizeof(__vector int16_t) / sizeof(int16_t));
    __vector int16_t* tempA = (__vector int16_t *) intArrayA;
    __vector int16_t* tempB = (__vector int16_t *) intArrayB;
    __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;
    for(int i = 0; i < iterations; i++)
    {
        __vector int32_t productEven = vec_mule(*tempA, *tempB);
        __vector int32_t productOdd = vec_mulo(*tempA, *tempB);

        __vector int32_t mergedProductHigh = vec_mergeh(productEven, productOdd);
        __vector int32_t mergedProductLow = vec_mergel(productEven, productOdd);

        vec_st(mergedProductHigh, 0, tempOut);
        tempOut++;
        vec_st(mergedProductLow, 0, tempOut);

        tempA++;
        tempB++;
        tempOut++;
    }
}

Output of perf stat and perf record:

Adding
Performance counter stats for './alti':

2151.146080 task-clock (msec) # 0.999 CPUs utilized
9 context-switches # 0.004 K/sec
0 cpu-migrations # 0.000 K/sec
30957 page-faults # 0.014 M/sec
3871497132 cycles # 1.800 GHz
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
1504538891 instructions # 0.39 insns per cycle
234038234 branches # 108.797 M/sec
687912 branch-misses # 0.29% of all branches
270305159 L1-dcache-loads # 125.656 M/sec
79819113 L1-dcache-load-misses # 29.53% of all L1-dcache hits
<not supported> LLC-loads
<not supported> LLC-load-misses

2.152697186 seconds time elapsed


CPU Utilization
76.04% alti alti [.] av_AddValues

Multiply

Performance counter stats for './alti':

1583.016640 task-clock (msec) # 0.999 CPUs utilized
4 context-switches # 0.003 K/sec
0 cpu-migrations # 0.000 K/sec
20717 page-faults # 0.013 M/sec
2849050875 cycles # 1.800 GHz
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
1520409634 instructions # 0.53 insns per cycle
179185029 branches # 113.192 M/sec
535437 branch-misses # 0.30% of all branches
205341530 L1-dcache-loads # 129.715 M/sec
27124936 L1-dcache-load-misses # 13.21% of all L1-dcache hits
<not supported> LLC-loads
<not supported> LLC-load-misses

1.584145737 seconds time elapsed


CPU Utilization
60.35% alti alti [.] av_MultiplyValues

Best Answer

It comes down to the size of your input buffers.

In one case (TestAdd):

int size = 10'485'760;
int bytes = size * sizeof(int32_t);

int32_t* inputABuffer = (int32_t*) memalign(64, bytes);
int32_t* inputBBuffer = (int32_t*) memalign(64, bytes);
int32_t* outputBuffer = (int32_t*) memalign(64, bytes);

you allocate 3 * size * 4 bytes (sizeof(int32_t) = 4).

In the other (TestMultiply):

int size = 10'485'760;
int16_t* inputABuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
int16_t* inputBBuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
int32_t* outputBuffer = (int32_t*) memalign(64, size * sizeof(int32_t));

you allocate size * 4 + 2 * size * 2 bytes (sizeof(int16_t) = 2).

Since this code is completely memory bound, the second one should be (3*4) / (4 + 2*2) = 1.5 times faster.

That is consistent with your measurements: 2.15 s / 1.5 = 1.43 s, which is close to the 1.58 s you measured for the multiply.
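
As a quick sanity check of this memory-traffic argument, here is a small back-of-the-envelope sketch (my own, not part of the original answer) that reproduces the numbers above:

#include <cstdint>
#include <cstdio>

int main()
{
    const long size = 10'485'760;                       // elements per buffer
    const long addBytes = 3 * size * sizeof(int32_t);   // 2 int32 inputs + 1 int32 output
    const long mulBytes = 2 * size * sizeof(int16_t)    // 2 int16 inputs
                        + 1 * size * sizeof(int32_t);   //   + 1 int32 output
    std::printf("add traffic: %ld MiB\n", addBytes >> 20);                   // 120 MiB
    std::printf("mul traffic: %ld MiB\n", mulBytes >> 20);                   //  80 MiB
    std::printf("predicted ratio: %.2f\n", (double) addBytes / mulBytes);    //  1.50
}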

Original question on Stack Overflow: https://stackoverflow.com/questions/44273494/
