gpt4 book ai didi

c++ - 你能在 C++ 内联汇编中建议我更好的解决方案吗?

转载 作者:塔克拉玛干 更新时间:2023-11-03 07:31:22 25 4
gpt4 key购买 nike

我正在学习汇编,并开始在 Digital-Mars C++ 编译器(Intel 语法更易于阅读)中对 SSE 和 MMX 寄存器进行实验。我已经完成了一个程序,它把 var_1 的值转换成 var_2 进制的表示(目前只支持 8 位,稍后会扩展到 32、64、128 位)。程序通过两种方式做到这一点:

  1. __asm 内联

  2. %(modulo) 运算符的常用 C++ 方式。

问题:你能告诉我使用 xmm0-7 和 mm0-7 寄存器的更有效方法吗?你能告诉我如何与 al,ah...8 位寄存器交换它们的确切字节吗?

在我的计算机(pentium-m centrino 2.0GHz)上,与 __asm 版本相比,常规 C++ 写法中的 %(取模)运算符非常慢。如果你能告诉我如何去掉 __asm 中的除法指令,它还会更快。

当我运行程序时,它会给我:

(for the values: var_1=17,var_2=2,all loops are 200M times)

17 is 10001 in number system 2
__asm(clock)...........: 7250 <------too bad. it is 8-bit calc.
C++(clock).............: 12250 <------not very slow(var_2 is a power of 2)


(for the values: var_1=33,var_2=7,all loops are 200M times)
33 is 45 in number system 7
__asm(clock)..........: 2875 <-------not good. it is 8-bit calc.
C++(clock)............: 6328 <----------------really slow(var_2 is not a power of 2)

第二个 C++ 代码(带有 % 运算符的代码)://////////////////////////////////////////////////////

t1=clock();//reference time
// Repeat the conversion of x into base g 200,000,000 times for timing.
// Digits are written to var_3[] least significant first.
for(int i=0;i<200000000;i++)
{
y=x;
counter=0;
// Peel off one digit per pass while at least two digits remain.
// The test must be >= (not >): when y==g the value is "10" in base g,
// and stopping early would store 0 and lose the leading 1.
while(y>=g)
{

var_3[counter]=y%g;
y/=g;
counter++;
}

var_3[counter]=y%g; //most significant digit; counter is its index
}
t2=clock();//final time

_asm 代码:///////////////////////////////////////////////////////////////////////////////////////////////////////

     __asm  // i love assembly in some parts of C++
{
// Converts the low byte of var_1 into base (low byte of) var_2, repeated
// 200,000,000 times for timing. Digits are written to var_3[] least
// significant first, one per 4-byte element; the digit count of the final
// pass is written to var_3_size.
// Register roles:
//   xmm0 = outer loop counter, xmm1 = var_1, xmm2 = var_2
//   ebx  = address of var_3 (constant), edx = digit index
//   al   = running quotient, ah = remainder, cl = divisor

pushf //back up the flags and every general register used below
push eax
push ebx
push ecx
push edx

xor eax,eax
movd xmm0,eax //outer loop counter = 0; movd itself clears the upper 96 bits
movd xmm1,[var_1] //number to convert
movd xmm2,[var_2] //number system (base)
lea ebx,var_3 //never modified below, so it does not need to be parked in xmm3

for_loop:
movd ecx,xmm2 //cl = divisor (low byte of var_2)
movd edx,xmm1
movzx eax,dl //al = dividend (low byte of var_1), rest of eax = 0
xor edx,edx //digit index = 0
dng:
mov ah,00h //div cl divides ax, so the previous remainder must be cleared
div cl //al = quotient, ah = remainder

//scaled-index addressing does the "multiply by 4"; only the low byte of
//the element is written, so this relies on the other bytes already being 0
mov [ebx+edx*4],ah //var_3[i]=remainder

inc edx //i++
test al,al //is the quotient zero?
jnz dng //if not, there are more digits

//here edx holds the number of digits

movd eax,xmm0 //get the outer loop counter
add eax,01h //j++
movd xmm0,eax //store the new counter
cmp eax,0BEBC200h //is j<200,000,000 ?
jb for_loop //if yes, run another pass
mov [var_3_size],edx //digit count of the last pass

//restore the registers in reverse push order
pop edx
pop ecx
pop ebx
pop eax
popf

}

整个代码:///////////////////////////////////////////////////////////////////////////////////

#include <iostream.h>
#include <cmath>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
int main()
{
// Benchmark: convert var_1 into base var_2 200,000,000 times, first with an
// 8-bit inline-assembly loop and then with plain C++ (% and /). After each
// pass the elapsed clock() ticks and the resulting digits are printed.

srand(time(0));


clock_t t1=clock();
clock_t t2=clock();

int var_1=17; //number itself
int var_2=2; //number system
int var_3[100]; //digits, least significant first (maximum 100)
int var_3_size=0;//number of valid digits in var_3

// The assembly path uses the 8-bit form of div: it only sees the low byte of
// each operand, faults on a zero divisor and never terminates for base 1.
if(var_1<0 || var_1>255 || var_2<2 || var_2>255)
{
printf("\n var_1 must be 0..255 and var_2 must be 2..255 \n");
return 1;
}

for(int i=0;i<100;i++)
{
var_3[i]=0; //the asm loop writes single bytes, so the ints must start at zero
}


t1=clock();//reference time to take
__asm // i love assembly in some parts of C++
{
// Register roles:
//   xmm0 = outer loop counter, xmm1 = var_1, xmm2 = var_2
//   ebx  = address of var_3 (constant), edx = digit index
//   al   = running quotient, ah = remainder, cl = divisor

pushf //back up the flags and every general register used below
push eax
push ebx
push ecx
push edx

xor eax,eax
movd xmm0,eax //outer loop counter = 0; movd itself clears the upper 96 bits
movd xmm1,[var_1] //number to convert
movd xmm2,[var_2] //number system (base)
lea ebx,var_3 //never modified below, so it does not need to be parked in xmm3

for_loop:
movd ecx,xmm2 //cl = divisor (low byte of var_2)
movd edx,xmm1
movzx eax,dl //al = dividend (low byte of var_1), rest of eax = 0
xor edx,edx //digit index = 0
dng:
mov ah,00h //div cl divides ax, so the previous remainder must be cleared
div cl //al = quotient, ah = remainder

//scaled-index addressing does the "multiply by 4"; only the low byte of
//the element is written (the array was zero-filled above)
mov [ebx+edx*4],ah //var_3[i]=remainder

inc edx //i++
test al,al //is the quotient zero?
jnz dng //if not, there are more digits

//here edx holds the number of digits

movd eax,xmm0 //get the outer loop counter
add eax,01h //j++
movd xmm0,eax //store the new counter
cmp eax,0BEBC200h //is j<200,000,000 ?
jb for_loop //if yes, run another pass
mov [var_3_size],edx //digit count of the last pass

//restore the registers in reverse push order
pop edx
pop ecx
pop ebx
pop eax
popf

}
t2=clock(); //finish time
printf("\n assembly_inline(clocks): %i for the 200 million calculations",(int)(t2-t1));

printf("\n value %i(in decimal) is: ",var_1);
for(int i=var_3_size-1;i>=0;i--)
{
printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",var_2);




//and: more readable form(and easier)
int counter=0;
int x=var_1;
int g=var_2;
int y=x;// working copy, reset at the start of every pass
t1=clock();//reference time

for(int i=0;i<200000000;i++)
{
y=x;
counter=0;
// >= (not >): when y==g the value is "10" in base g, and stopping early
// would store 0 and lose the leading 1
while(y>=g)
{

var_3[counter]=y%g;
y/=g;
counter++;
}

var_3[counter]=y%g; //most significant digit; counter is its index
}

t2=clock();//final time
var_3_size=counter+1; //report the C++ result, not the count left over from the asm pass
printf("\n C++(clocks): %i for the 200 million calculations",(int)(t2-t1));

printf("\n value %i(in decimal) is: ",x);
for(int i=var_3_size-1;i>=0;i--)
{
printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",g);
return 0;
}

编辑:这是 32 位版本

    void get_digits_asm()
{
// Writes the base-number_system digits of variable_x into digits[], least
// significant first, one per 4-byte element, and stores the digit count in
// digits_total. A value of 0 produces the single digit 0.
// variable_x, number_system, digits and digits_total are declared outside
// this snippet; the code treats the first two as unsigned 32-bit values
// (div is an unsigned divide) -- TODO confirm their declarations.
// A base below 2 stores digits_total=0 and writes no digits: base 0 would
// fault in div and base 1 would never reach a zero quotient.
// Clobbers xmm0..xmm4, which are used as a save area instead of the stack.
__asm
{

pushf //flags still go on the stack
movd xmm0,eax//save the general registers in xmm registers instead of pushing
movd xmm1,ebx//
movd xmm2,ecx//
movd xmm3,edx//
movd xmm4,edi//end of register backups

mov eax,[variable_x] //eax = running quotient
mov ebx,[number_system] //ebx = base
lea edi,digits //edi = output array, loaded once outside the loop
xor ecx,ecx //ecx = digit index
cmp ebx,2
jb store_count //reject base 0 and base 1

begin_loop:
xor edx,edx //div ebx divides edx:eax, so the high half must be zero
div ebx //eax = quotient, edx = remainder (the next digit)
mov [edi+ecx*4],edx
inc ecx
test eax,eax
jnz begin_loop //keep going until the quotient is exhausted

store_count:
mov [digits_total],ecx


movd edi,xmm4//restore edi
movd edx,xmm3//restore edx
movd ecx,xmm2//restore ecx
movd ebx,xmm1//restore ebx
movd eax,xmm0//restore eax
popf
}

}

最佳答案

代码当然可以更简单:(仿照C++版本,不包含push和pops,未测试)

  ; Repeat the base-g conversion of y 200,000,000 times (mirrors the C++ loop).
  ; Register roles:
  ;   esi = passes remaining      eax = running quotient   ebx = base g
  ;   edi = address of var_3      ecx = digit index        edx = remainder
        mov     esi, 200000000
next_pass:
        lea     edi, var_3
        mov     ebx, [g]
        mov     eax, [y]
        xor     ecx, ecx                ; digit index = 0
next_digit:
        xor     edx, edx                ; div ebx divides edx:eax, clear the high half
        div     ebx                     ; eax = quotient, edx = next digit
        mov     [edi+ecx*4], edx        ; var_3[counter] = digit
        add     ecx, 1
        test    eax, eax
        jnz     next_digit              ; more digits while the quotient is non-zero
        sub     esi, 1
        jnz     next_pass

但如果它比 C++ 版本更快,我会感到惊讶。事实上,如果基数是 2 的幂,它几乎肯定会更慢——所有正常的编译器都知道如何把对 2 的幂的除法和/或取模转换成移位和按位与。


这是一个使用 8 位除法的版本。类似的注意事项同样适用,而且现在除法甚至可能溢出(如果 y/g 大于 255)。

  ; 8-bit-division variant: repeat the base-g conversion of y 200,000,000 times.
  ; div bl divides ax by bl, so on the first pass the low 16 bits of y are the
  ; dividend; the CPU raises a divide error if that quotient exceeds 255.
  ; Register roles:
  ;   esi = passes remaining      al = running quotient    bl = base g
  ;   edi = address of var_3      ecx = digit index        ah = remainder
        mov     esi, 200000000
_bigloop:
        mov     eax, [y]
        mov     ebx, [g]
        lea     edi, var_3
        xor     ecx, ecx                ; digit index = 0
_loop:
        div     bl                      ; al = quotient, ah = next digit
        movzx   edx, ah                 ; widen the digit to a full dword
        mov     [edi+ecx*4], edx        ; var_3 holds 4-byte elements, as in the 32-bit version
        add     ecx, 1
        and     eax, 0xFF               ; drop the remainder; ZF set when the quotient is 0
        jnz     _loop
        sub     esi, 1
        jnz     _bigloop

关于c++ - 你能在 C++ 内联汇编中建议我更好的解决方案吗?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/11579732/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com