c++ - 你能在 C++ 内联汇编中建议我更好的解决方案吗？-6ren

c++ - 你能在 C++ 内联汇编中建议我更好的解决方案吗？

转载作者：塔克拉玛干更新时间：2023-11-03 07:31:22

我正在学习汇编，并开始在 Digital-Mars C++ 编译器(intel sytanx 更易于阅读)中对 SSE 和 MMX 寄存器进行实验。我已经完成了一个将 var_1 作为值并将其转换为 var_2 数字系统的程序(目前为 8 位。稍后将其扩展为 32 64 128)。程序通过两种方式做到这一点:

__asm 内联
%(modulo) 运算符的常用 C++ 方式。

问题:你能告诉我使用 xmm0-7 和 mm0-7 寄存器的更有效方法吗？你能告诉我如何与 al,ah...8 位寄存器交换它们的确切字节吗？

与我计算机(pentium-m centrino 2.0GHz)上的 __asm 相比，C++ 常用方式中的常用 %(模)运算符非常慢。如果你能告诉我如何去掉 __asmm 中的除法指令，它会更快。

当我运行程序时，它会给我:

(for the values: var_1=17,var_2=2,all loops are 200M times)

17 is 10001 in number system 2
__asm(clock)...........: 7250    <------too bad. it is 8-bit calc.
C++(clock).............: 12250   <------not very slow(var_2 is a power of 2)


(for the values: var_1=33,var_2=7,all loops are 200M times)
33 is 45 in number system 7
 __asm(clock)..........: 2875   <-------not good. it is 8-bit calc.
 C++(clock)............: 6328   <----------------really slow(var_2 is not a power of 2)

第二个 C++ 代码(带有 % 运算符的代码)://////////////////////////////////////////////////////

t1=clock();//reference time
for(int i=0;i<200000000;i++)
{
    y=x;
    counter=0;
    while(y>g)
    {   

        var_3[counter]=y%g;
        y/=g;
        counter++;
    }

     var_3[counter]=y%g;
}   
t2=clock();//final time

_asm 代码:///////////////////////////////////////////////////////////////////////////////////////////////////////

     __asm  // i love assembly in some parts of C++
        {

        pushf   //here does register backup
        push eax
        push ebx
        push ecx
        push edx
        push edi

            mov eax,0h      //this will be outer loop counter init to zero
            //init of medium-big registers to zero
            movd xmm0,eax    //cannot set to immediate constant: xmm0=outer loop counter 
            shufps xmm0,xmm0,0h //this makes all bits zero
            movd xmm1,eax
            movd xmm2,eax   
            shufps xmm1,xmm1,0h
            shufps xmm2,xmm2,0h
            movd xmm2,eax 
            shufps xmm3,xmm3,0h//could have made pxor xmm3,xmm3(single instruction)
            //init complete(xmm0,xmm1,xmm2,xmm3 are zero)

            movd xmm1,[var_1] //storing variable_1 to register
            movd xmm2,[var_2] //storing var_2 to register    
            lea ebx,var_3     //calculate var_3 address
            movd xmm3,ebx     //storing var_3's address to register
            for_loop:
            mov eax,0h      
            //this line is index-init to zero(digit array index)
            movd edx,xmm2
            mov cl,dl       //this is the var_1 stored in cl
            movd edx,xmm1
            mov al,dl       //this is the var_2 stored in al
            mov edx,0h
            dng:
                mov ah,00h      //preparation for a 8-bit division
                div cl          //divide

                movd ebx,xmm3   //get var_3 address
                add ebx,edx     //i couldnt find a way to multiply with 4
                add ebx,edx     //so i added 4 times ^^
                add ebx,edx     //add   
                add ebx,edx     //last adding
                //below, mov [ebx],ah is the only memory accessing instruction
                mov [ebx],ah    //(8 bit)this line is equivalent to var_3[i]=remainder


                inc edx         //i++;
                cmp al,00h      //is division zero?
            jne dng             //if no, loop again

            //here edi register has the number of digits

            movd eax,xmm0       //get the outer loop counter from medium-big register
            add eax,01h         //j++;
            movd xmm0,eax       //store the new counter to medium-big register
            cmp eax,0BEBC200h           //is j<(200,000,000) ?
            jb for_loop     //if yes, go loop again
            mov [var_3_size],edx //now we have number of digits too!
         //here does registers revert back to old values
        pop edi
        pop edx
        pop ecx
        pop ebx
        pop eax
        popf     

        }

整个代码:///////////////////////////////////////////////////////////////////////////////////

#include <iostream.h>
#include <cmath>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
int main()
    {

    srand(time(0));


    clock_t t1=clock();
    clock_t t2=clock();

    int var_1=17;  //number itself
    int var_2=2;   //number system
    int var_3[100];  //digits to be showed(maximum 100 as seen )
    int var_3_size=0;//asm block will decide what will the number of  digits be

    for(int i=0;i<100;i++)
    {
    var_3[i]=0; //here we initialize digits to zeroes
    }


    t1=clock();//reference time to take
     __asm  // i love assembly in some parts of C++
        {

        pushf   //here does register backup
        push eax
        push ebx
        push ecx
        push edx
        push edi

            mov eax,0h      //this will be outer loop counter init to zero
            //init of medium-big registers to zero
            movd xmm0,eax    //cannot set to immediate constant: xmm0=outer loop counter 
            shufps xmm0,xmm0,0h //this makes all bits zero
            movd xmm1,eax
            movd xmm2,eax   
            shufps xmm1,xmm1,0h
            shufps xmm2,xmm2,0h
            movd xmm2,eax 
            shufps xmm3,xmm3,0h
            //init complete(xmm0,xmm1,xmm2,xmm3 are zero)

            movd xmm1,[var_1] //storing variable_1 to register
            movd xmm2,[var_2] //storing var_2 to register    
            lea ebx,var_3     //calculate var_3 address
            movd xmm3,ebx     //storing var_3's address to register
            for_loop:
            mov eax,0h      
            //this line is index-init to zero(digit array index)
            movd edx,xmm2
            mov cl,dl       //this is the var_1 stored in cl
            movd edx,xmm1
            mov al,dl       //this is the var_2 stored in al
            mov edx,0h
            dng:
                mov ah,00h      //preparation for a 8-bit division
                div cl          //divide

                movd ebx,xmm3   //get var_3 address
                add ebx,edx     //i couldnt find a way to multiply with 4
                add ebx,edx     //so i added 4 times ^^
                add ebx,edx     //add   
                add ebx,edx     //last adding
                //below, mov [ebx],ah is the only memory accessing instruction
                mov [ebx],ah    //(8 bit)this line is equivalent to var_3[i]=remainder


                inc edx         //i++;
                cmp al,00h      //is division zero?
            jne dng             //if no, loop again

            //here edi register has the number of digits

            movd eax,xmm0       //get the outer loop counter from medium-big register
            add eax,01h         //j++;
            movd xmm0,eax       //store the new counter to medium-big register
            cmp eax,0BEBC200h           //is j<(200,000,000) ?
            jb for_loop     //if yes, go loop again
            mov [var_3_size],edx //now we have number of digits too!
         //here does registers revert back to old values
        pop edi
        pop edx
        pop ecx
        pop ebx
        pop eax
        popf     

        }
    t2=clock(); //finish time
    printf("\n assembly_inline(clocks): %i  for the 200 million calculations",(t2-t1)); 

        printf("\n value %i(in decimal) is: ",var_1);
for(int i=var_3_size-1;i>=0;i--)
{
    printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",var_2);




//and: more readable form(end easier)
    int counter=var_3_size;
    int x=var_1;
    int g=var_2;
    int y=x;// backup
t1=clock();//reference time

for(int i=0;i<200000000;i++)
{
    y=x;
    counter=0;
    while(y>g)
    {   

        var_3[counter]=y%g;
        y/=g;
        counter++;
    }

     var_3[counter]=y%g;
}

t2=clock();//final time
printf("\n C++(clocks): %i  for the 200 million calculations",(t2-t1)); 

printf("\n value %i(in decimal) is: ",x);
for(int i=var_3_size-1;i>=0;i--)
{
    printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",g);
return 0;

编辑:这是 32 位版本

    void get_digits_asm()
{
    __asm
    {

        pushf       //couldnt store this in other registers 
        movd xmm0,eax//storing in xmm registers instead of pushing
        movd xmm1,ebx//
        movd xmm2,ecx//
        movd xmm3,edx//
        movd xmm4,edi//end of push backups

        mov eax,[variable_x]
        mov ebx,[number_system]
        mov ecx,0h
        mov edi,0h

        begin_loop:
        mov edx,0h
        div ebx             
        lea edi,digits  
        mov [edi+ecx*4],edx
        add ecx,01h
        cmp eax,ebx
        ja begin_loop

        mov edx,0
        div ebx
        lea edi,digits
        mov [edi+ecx*4],edx
        inc ecx
        mov [digits_total],ecx


        movd edi,xmm4//pop edi
        movd edx,xmm3//pop edx
        movd ecx,xmm2//pop ecx
        movd ebx,xmm1//pop ebx
        movd eax,xmm0//pop eax
        popf            
    }

}

最佳答案

代码当然可以更简单:(仿照C++版本，不包含push和pops，未测试)

  mov esi,200000000
_bigloop:
  mov eax,[y]
  mov ebx,[g]
  lea edi,var_3
  ; eax = y
  ; ebx = g
  ; edi = var_3
  xor ecx,ecx
  ; ecx = counter
_loop:
  xor edx,edx
  div ebx
  mov [edi+ecx*4],edx
  add ecx,1
  test eax,eax
  jnz _loop
  sub esi,1
  jnz _bigloop

但如果它比 C++ 版本更快，我会感到惊讶，事实上，如果基数是 2 的幂，它几乎肯定会更慢——所有理智的编译器都知道如何通过除法和/或取模2 的幂转换成位移位和按位与。

这是一个使用 ab 8 位除法的版本。类似的注意事项适用，但现在除法甚至可能溢出(如果 y/g 大于 255)。

  mov esi,200000000
_bigloop:
  mov eax,[y]
  mov ebx,[g]
  lea edi,var_3
  ; eax = y
  ; ebx = g
  ; edi = var_3
  xor ecx,ecx
  ; ecx = counter
_loop:
  div bl
  mov [edi+ecx],ah
  add ecx,1
  and eax,0xFF
  jnz _loop
  sub esi,1
  jnz _bigloop

关于c++ - 你能在 C++ 内联汇编中建议我更好的解决方案吗？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/11579732/

文章推荐： C++ 异常与错误代理

文章推荐： c++ - 可编辑 QComboBox : synchronize edit text with item text

文章推荐： c++ - 如何在不读/写 key 文件的情况下使用 crypto++？

文章推荐： c++ - 使用 Cairo 和 Allegro 5

javascript - 我需要将文本放在一个中，它位于一个 Div 中，该 Div 位于另一个 Div 中，该 Div 位于另一个 Div 中
我需要将文本放在中在一个 Div 中，在另一个 Div 中，在另一个 Div 中。所以这是它的样子: #document Change PIN
html - 两个背景图像。一个在 HTML 中，一个在 BODY 中。在 Firefox 中，主体图像未呈现
奇怪的事情发生了。我有一个基本的 html 代码。 html，头部， body 。(因为我收到了一些反对票，这里是完整的代码) 这是我的CSS: html { backgroun
ios - 将图像从 asset.xcassets 加载到 imageArray 中，并将其动态加载到 UIImageView 中，该 UIImageView 存在于 UICollectionView 中 - swift
我正在尝试将 Assets 中的一组图像加载到 UICollectionview 中存在的 ImageView 中，但每当我运行应用程序时它都会显示错误。而且也没有显示图像。我在ViewDidLoa
linux - 在 BASH 中，我需要根据 perl 脚本的输出更改一些环境变量。在 tcsh 中，我可以使用别名 eval 组合。不能在 bash 中
我需要根据带参数的 perl 脚本的输出更改一些环境变量。在 tcsh 中，我可以使用别名命令来评估 perl 脚本的输出。 tcsh: alias setsdk 'eval `/localhome/
asp.net - Windows 身份验证适用于 IIS，但不适用于 Kestrel/Microsoft.AspNetCore.Authentication.Negotiate(不在 Chrome 中，有时在 Edge 中，始终在 IE 中)？
我使用 Windows 身份验证创建了一个新的 Blazor(服务器端)应用程序，并使用 IIS Express 运行它。它将显示一条消息“Hello Domain\User!”来自右上方的以下 Ra
java - java 中 Kotlin 中的等价物是什么？
这是我的方法 void login(Event event);我想知道 Kotlin 中应该如何最佳答案在 Kotlin 中通配符运算符是 * 。它指示编译器它是未知的，但一旦知道，就不会有其他类
express - 在 Jade 中，为什么有时我可以按原样使用变量而有时必须将它们包含在#{......} 中？
看下面的代码 for story in book if story.title.length < 140 - var story
c - C 中 strstr() 中 for 循环的错误使用
我正在尝试用 C 语言学习字符串处理。我写了一个程序，它存储了一些音乐轨道，并帮助用户检查他/她想到的歌曲是否存在于存储的轨道中。这是通过要求用户输入一串字符来完成的。然后程序使用 strstr()
c - * 在 sscanf 中，* 在 [] 中
我正在学习 sscanf 并遇到如下格式字符串: sscanf("%[^:]:%[^*=]%*[*=]%n",a,b,&c); 我理解 %[^:] 部分意味着扫描直到遇到 ':' 并将其分配给 a。:
python - 在 Python (2.7.3) 中，如果 str(x) 中的任何字符在 str(y) 中(或 str(y) 在 str(x) 中)，我如何编写一个函数来回答？
def char_check(x,y): if (str(x) in y or x.find(y) > -1) or (str(y) in x or y.find(x) > -1):
ansible - 在 Ansible 中，如何将一行移动到一个 block 中？
我有一种情况，我想将文本文件中的现有行包含到一个新 block 中。 line 1 line 2 line in block line 3 line 4 应该变成 line 1 line 2 line
Django 调试工具栏显示在根 URL 中，但不显示在应用程序 URL 中
我有一个新项目，我正在尝试设置 Django 调试工具栏。首先，我尝试了快速设置，它只涉及将 'debug_toolbar' 添加到我的已安装应用程序列表中。有了这个，当我转到我的根 URL 时，调试
r - 在 R 中，Matlab 中 @ 函数句柄的等价物是什么？
在 Matlab 中，如果我有一个函数 f，例如签名是 f(a,b,c)，我可以创建一个只有一个变量 b 的函数，它将使用固定的 a=a1 和 c=c1 调用 f: g = @(b) f(a1, b,
swiftui - SwiftUI 中 ScrollView 中 VStack 元素中的神秘间距或填充
我不明白为什么 ForEach 中的元素之间有多余的垂直间距在 VStack 里面在 ScrollView 里面使用 GeometryReader 时渲染自定义水平分隔线。 Scrol
cookies - 什么应该存储在 session 中，什么应该存储在 cookie 中？
我想知道，是否有关于何时使用 session 和 cookie 的指南或最佳实践？什么应该和什么不应该存储在其中？谢谢! 最佳答案这些文档很好地了解了 session cookie 的安全问题以及
python - Python 中 matplotlib 中 3d 直方图的奇怪行为
我在 scipy/numpy 中有一个 Nx3 矩阵，我想用它制作一个 3 维条形图，其中 X 轴和 Y 轴由矩阵的第一列和第二列的值、高度确定每个条形的是矩阵中的第三列，条形的数量由 N 确定。
c - c 中 sem_init(...) 中 value 参数的不同用法
假设我用两种不同的方式初始化信号量 sem_init(&randomsem,0,1) sem_init(&randomsem,0,0) 现在， sem_wait(&randomsem) 在这两种情况下
c - 实际值存储在 pstr 中，但是该值如何存储在数组 "WORD"中
我怀疑该值如何存储在“WORD”中，因为 PStr 包含实际输出。？既然Pstr中存储的是小写到大写的字母，那么在printf中如何将其给出为“WORD”。有人可以吗？解释一下？ #include
javascript - 数组索引选择像在 numpy 中，但在 javascript 中
我有一个 3x3 数组: var my_array = [[0,1,2], [3,4,5], [6,7,8]]; 并想获得它的第一个 2
javascript - 在 Javascript 中，如何检测浏览器窗口何时在 View 中？
我意识到您可以使用如下方式轻松检查焦点: var hasFocus = true; $(window).blur(function(){ hasFocus = false; }); $(win

塔克拉玛干

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c++ - 你能在 C++ 内联汇编中建议我更好的解决方案吗？