gpt4 book ai didi

c - 如何将这个汇编代码转换为内部代码?

转载 作者:行者123 更新时间:2023-11-30 20:02:59 25 4
gpt4 key购买 nike

下面看起来像是内在函数,但是,我对内在函数不熟悉。请帮我转换真实的代码。特别是, testFunc() 对我来说更加模糊。我想这也适用于两个浮点 vector 的点积,但是标签 Lrep 和 Lexit 让我感到困惑。请帮我弄清楚。移动处理器可以使用内在函数吗?

void testFunc(int M, int N, int K, float* A, float* B, float* C)
{
float *a;
float *b = new float[K*N];
float *pointb = B;
float *bb;
float *answer = C;
float c[8];

for (int j = 0, k; j < K; j++) {
bb = b + j;
for (k = N / 8; k > 0; k--) {
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
}
for (k = N / 8 * 8; k < N; k++) {
*bb = *pointb++; bb += K;
}
}

int K8 = K / 8 * 8;

for (int i = 0; i < M; i++) for (int k = 0; k < N; k++) {
a = A + i * K;
bb = b + k * K;
__asm {
mov esi, K8;
sub esi, 8;
shl esi, 2;
xor edi, edi;
mov edx, a;
mov ebx, bb;
vxorps ymm3, ymm3, ymm3;
Lrep:
cmp edi, esi;
jg Lexit;
vmovups ymm0, ymmword ptr[edx + edi];
vfmadd231ps ymm3, ymm0, ymmword ptr[ebx + edi];
add edi, 32;
jmp Lrep;
Lexit:
vmovups ymmword ptr[c], ymm3;
}

for (int j = K8; j < K; ) {
*c += *(a + j) * *(bb + j); j++;
}

*answer = (c[0] + c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7]);
answer++;
}
}

pA = A;
for (k = 0; k < K; k++) {
pC = C;
for (i = 0; i < M; i++) {
pA = A + i * K + k;
pB = B + k * N;
for (j = N / 32; j > 0; j--) {
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
}
for (j = N / 32 * 32; j < N; j++) {
*pC += *pA * *pB;
pC += 1; pB += 1;
}
}
}

最佳答案

在内联函数中,这段代码重复了 4 次。

{
// vmovups ymm0, ymmword ptr[eax];
__m256 tempC = _mm256_loadu_ps((float*)pC);

// vmovss xmm1, dword ptr[ebx];
// vbroadcastss ymm4, xmm1;
__m256 tempA = _mm256_set1_ps(*pA);

// vmovups ymm2, ymmword ptr[ecx];
__m256 tempB = _mm256_loadu_ps((float*)pB);

// vfmadd231ps ymm0, ymm4, ymm2;
__m256 result = _mm256_fmadd_ps(tempA, tempB, tempC);

// vmovups ymmword ptr[eax], ymm0;
_mm256_storeu_ps(pC, result);
}

pC += 8; pB += 8;

不过,不断地从 pA 广播相同的值似乎有点多余。

关于c - 如何将这个汇编代码转换为内部代码?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/57138565/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com