gpt4 book ai didi

c++ - 这个 SSE2 转置有什么问题?

转载 作者:行者123 更新时间:2023-11-30 04:53:01 26 4
gpt4 key购买 nike

我正在尝试转换这段代码:

double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;

for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))

phase += std::clamp(radiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
}

mPhase = phase;

在 SSE2 中,试图加速整个 block (经常被调用)。我正在使用带有 Fast optimizazion 标志的 MSVC,但自动矢量化非常糟糕。因为我也在学习矢量化,所以我觉得这是一个很好的挑战。

所以我采用了上面的公式,并进行了简化,例如:

radiansPerSampleBp0 = radiansPerSample * bp0;
phase += std::clamp(radiansPerSampleBp0 * pB[sampleIndex] + radiansPerSample * pC[sampleIndex]), 0.0, PI);

它可以被静音成一个串行依赖,例如:

phase[0] += (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[1] += (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1]) + (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])

phase[2] += (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2]) + (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1])
phase[3] += (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3]) + (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2])

phase[4] += (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4]) + (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3])
phase[5] += (radiansPerSampleBp0 * pB[5] + radiansPerSample * pC[5]) + (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4])

因此,我做的代码:

double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;

__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);

__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);

__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;

for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))

v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);

v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);

v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
}

mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

但是,不幸的是,在求和“步骤”之后,每个相位值的结果都变得非常不同。尝试调试,但我无法真正找到问题所在。

此外,与旧版本相比,它并没有那么“快”。

你能识别问题吗?您将如何加速代码?

这是完整的代码,如果你想检查两个不同的输出:

#include <iostream>
#include <algorithm>
#include <immintrin.h>
#include <emmintrin.h>

#define PI 3.14159265358979323846

constexpr int voiceSize = 1;
constexpr int bufferSize = 256;

class Param
{
public:
alignas(16) double mPhase = 0.0;
alignas(16) double mPhaseOptimized = 0.0;
alignas(16) double mNoteFrequency = 10.0;
alignas(16) double mHostPitch = 1.0;
alignas(16) double mRadiansPerSample = 1.0;

alignas(16) double b[voiceSize][bufferSize];
alignas(16) double c[voiceSize][bufferSize];

Param() { }

inline void Process(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;

for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))

phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

std::cout << sampleIndex << ": " << phase << std::endl;
}

mPhase = phase;
}
inline void ProcessOptimized(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhaseOptimized;
double bp0 = mNoteFrequency * mHostPitch;

__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);

__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);

__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;

for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))

v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);

v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);

v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

std::cout << sampleIndex << ": " << v_phase.m128d_f64[0] << std::endl;
std::cout << sampleIndex + 1 << ": " << v_phase.m128d_f64[1] << std::endl;
}

mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
}
};

class MyPlugin
{
public:
Param mParam1;

MyPlugin() {
// fill b
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = (sampleIndex / ((double)bufferSize - 1));

mParam1.b[voiceIndex][sampleIndex] = value;
}
}

// fill c
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = 0.0;

mParam1.c[voiceIndex][sampleIndex] = value;
}
}
}
~MyPlugin() { }

void Process(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.Process(voiceIndex, blockSize);
}
}
void ProcessOptimized(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.ProcessOptimized(voiceIndex, blockSize);
}
}
};

int main() {
MyPlugin myPlugin;

long long numProcessing = 1;
long long counterProcessing = 0;

// I'll only process once block, just for analysis
while (counterProcessing++ < numProcessing) {
// variable blockSize (i.e. it can vary, being even or odd)
int blockSize = 256;

// process data
myPlugin.Process(blockSize);
std::cout << "#########" << std::endl;
myPlugin.ProcessOptimized(blockSize);
}
}

最佳答案

(更新:这个答案是在显示 v_phase 在循环中使用的编辑之前写的。)

等一下,我想in your previous question每一步都需要 phase 的值。是的,循环内有一个 //一些其他代码(将使用阶段) 注释。

但这看起来您只对最终值感兴趣。因此您可以自由地重新排序,因为每个步骤的夹紧都是独立的。


这只是一个缩减(如数组的总和),并进行了一些动态处理以生成缩减的输入。

您希望 v_phase 的 2 个元素是偶数/奇数元素的 2 个独立部分和。然后你在最后横向求和。 (例如 _mm_unpackhi_pd(v_phase, v_phase) 将高半部分置于底部,或参见 Fastest way to do horizontal float vector sum on x86 )。

然后可选择对结果使用标量 fmod 以将范围缩小到 [0..2Pi) 范围内。 (如果事实证明精度成为问题,则在求和期间偶尔缩小范围可以通过阻止值变得太大来帮助提高精度。)


如果不是这种情况,并且您确实需要一个 { phase[i+0], phase[i+1] } 的 vector ,用于每个 i+=2 步骤,那么您的问题似乎与 prefix sum 有关.但是每个 vector 只有 2 个元素,对具有未对齐负载的元素冗余地执行所有操作可能是有意义的。

节省的费用可能比我想象的要少,因为您需要分别限制每个步骤:在乘法之前执行 pB[i+0] + pB[i+1] 可能会导致不同的限制。

但是您显然已经在我们的简化公式中删除了限制,因此您可以在应用 mul/add 公式之前添加元素。

或者,同时执行两个步骤的乘法/加法可能是一个胜利,然后将其洗牌以添加正确的东西。

关于c++ - 这个 SSE2 转置有什么问题?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/54021454/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com