最多不能超过 50%。矩阵乘法的理论性能-6ren

最多不能超过 50%。矩阵乘法的理论性能

转载作者：太空狗更新时间：2023-10-29 16:29:58

问题

我正在学习 HPC 和代码优化。我试图复制 Goto 开创性的矩阵乘法论文 ( http://www.cs.utexas.edu/users/pingali/CS378/2008sp/papers/gotoPaper.pdf ) 中的结果。尽管我尽了最大努力，但我无法获得超过 ~50% 的最大理论 CPU 性能。

背景

在此处查看相关问题 ( Optimized 2x2 matrix multiplication: Slow assembly versus fast SIMD )，包括有关我的硬件的信息

我尝试过的

这篇相关论文 ( http://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf ) 很好地描述了 Goto 的算法结构。我在下面提供了我的源代码。

我的问题

我正在寻求一般帮助。我从事此工作的时间太长了，尝试了许多不同的算法、内联汇编、各种大小的内核(2x2、4x4、2x8、...、mxn，其中 m 和 n 较大)，但 我似乎无法打破 50% CPU Gflops。这纯粹是为了教育目的，而不是家庭作业。

源代码

希望是可以理解的。如果没有请问。我按照上面第二篇论文中的描述设置了宏结构(for 循环)。我按照两篇论文中讨论的方式打包矩阵，并在此处的图 11 (http://www.cs.utexas.edu/users/flame/pubs/BLISTOMSrev2.pdf) 中以图形方式显示。我的内核计算 2x8 block ，因为这似乎是 Nehalem 架构的最佳计算(参见 GotoBLAS 源代码 - 内核)。内核基于此处描述的计算 rank-1 更新的概念 (http://code.google.com/p/blis/source/browse/config/template/kernels/3/bli_gemm_opt_mxn.c)

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>


// define some prefetch functions
#define PREFETCHNTA(addr,nrOfBytesAhead) \
        _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_NTA)

#define PREFETCHT0(addr,nrOfBytesAhead) \
        _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0)

#define PREFETCHT1(addr,nrOfBytesAhead) \
        _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T1)

#define PREFETCHT2(addr,nrOfBytesAhead) \
        _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T2)

// define a min function
#ifndef min
    #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
#endif

// zero a matrix
void zeromat(double *C, int n)
{
    int i = n;
    while (i--) {
        int j = n;
        while (j--) {
            *(C + i*n + j) = 0.0;
        }
    }
}

// compute a 2x8 block from (2 x kc) x (kc x 8) matrices
inline void 
__attribute__ ((gnu_inline))        
__attribute__ ((aligned(64))) dgemm_2x8_sse(
                int k,
                const double* restrict a1, const int cs_a,
                const double* restrict b1, const int rs_b,
                      double* restrict c11, const int rs_c
                )
{

    register __m128d xmm1, xmm4, //
                    r8, r9, r10, r11, r12, r13, r14, r15; // accumulators

    // 10 registers declared here

    r8 = _mm_xor_pd(r8,r8); // ab
    r9 = _mm_xor_pd(r9,r9);
    r10 = _mm_xor_pd(r10,r10);
    r11 = _mm_xor_pd(r11,r11);

    r12 = _mm_xor_pd(r12,r12); // ab + 8
    r13 = _mm_xor_pd(r13,r13);
    r14 = _mm_xor_pd(r14,r14);
    r15 = _mm_xor_pd(r15,r15);

        // PREFETCHT2(b1,0);
        // PREFETCHT2(b1,64);



    //int l = k;
    while (k--) {

        //PREFETCHT0(a1,0); // fetch 64 bytes from a1

            // i = 0
            xmm1 = _mm_load1_pd(a1);

            xmm4 = _mm_load_pd(b1);
            xmm4 = _mm_mul_pd(xmm1,xmm4);
            r8 = _mm_add_pd(r8,xmm4);

            xmm4 = _mm_load_pd(b1 + 2);
            xmm4 = _mm_mul_pd(xmm1,xmm4);
            r9 = _mm_add_pd(r9,xmm4);

            xmm4 = _mm_load_pd(b1 + 4);
            xmm4 = _mm_mul_pd(xmm1,xmm4);
            r10 = _mm_add_pd(r10,xmm4);

            xmm4 = _mm_load_pd(b1 + 6);
            xmm4 = _mm_mul_pd(xmm1,xmm4);
            r11 = _mm_add_pd(r11,xmm4);

            //
            // i = 1
            xmm1 = _mm_load1_pd(a1 + 1);

            xmm4 = _mm_load_pd(b1);
            xmm4 = _mm_mul_pd(xmm1,xmm4);
            r12 = _mm_add_pd(r12,xmm4);

            xmm4 = _mm_load_pd(b1 + 2);
            xmm4 = _mm_mul_pd(xmm1,xmm4);
            r13 = _mm_add_pd(r13,xmm4);

            xmm4 = _mm_load_pd(b1 + 4);
            xmm4 = _mm_mul_pd(xmm1,xmm4);
            r14 = _mm_add_pd(r14,xmm4);

            xmm4 = _mm_load_pd(b1 + 6);
            xmm4 = _mm_mul_pd(xmm1,xmm4);
            r15 = _mm_add_pd(r15,xmm4);

        a1 += cs_a;
        b1 += rs_b;

        //PREFETCHT2(b1,0);
        //PREFETCHT2(b1,64);

    }

        // copy result into C

        PREFETCHT0(c11,0);
        xmm1 = _mm_load_pd(c11);
        xmm1 = _mm_add_pd(xmm1,r8);
        _mm_store_pd(c11,xmm1);

        xmm1 = _mm_load_pd(c11 + 2);
        xmm1 = _mm_add_pd(xmm1,r9);
        _mm_store_pd(c11 + 2,xmm1);

        xmm1 = _mm_load_pd(c11 + 4);
        xmm1 = _mm_add_pd(xmm1,r10);
        _mm_store_pd(c11 + 4,xmm1);

        xmm1 = _mm_load_pd(c11 + 6);
        xmm1 = _mm_add_pd(xmm1,r11);
        _mm_store_pd(c11 + 6,xmm1);

        c11 += rs_c;

        PREFETCHT0(c11,0);
        xmm1 = _mm_load_pd(c11);
        xmm1 = _mm_add_pd(xmm1,r12);
        _mm_store_pd(c11,xmm1);

        xmm1 = _mm_load_pd(c11 + 2);
        xmm1 = _mm_add_pd(xmm1,r13);
        _mm_store_pd(c11 + 2,xmm1);

        xmm1 = _mm_load_pd(c11 + 4);
        xmm1 = _mm_add_pd(xmm1,r14);
        _mm_store_pd(c11 + 4,xmm1);

        xmm1 = _mm_load_pd(c11 + 6);
        xmm1 = _mm_add_pd(xmm1,r15);
        _mm_store_pd(c11 + 6,xmm1);

}

// packs a matrix into rows of slivers
inline void 
__attribute__ ((gnu_inline))        
__attribute__ ((aligned(64))) rpack(        double* restrict dst, 
          const double* restrict src, 
            const int kc, const int mc, const int mr, const int n)
{
    double tmp[mc*kc] __attribute__ ((aligned(64)));
    double* restrict ptr = &tmp[0];

    for (int i = 0; i < mc; ++i)
        for (int j = 0; j < kc; ++j)
            *ptr++ = *(src + i*n + j);

    ptr = &tmp[0];

    //const int inc_dst = mr*kc;
    for (int k = 0; k < mc; k+=mr)
        for (int j = 0; j < kc; ++j)
            for (int i = 0; i < mr*kc; i+=kc)
                *dst++ = *(ptr + k*kc + j + i);

}

// packs a matrix into columns of slivers
inline void 
__attribute__ ((gnu_inline))        
__attribute__ ((aligned(64)))  cpack(double* restrict dst, 
                const double* restrict src, 
                const int nc, 
                const int kc, 
                const int nr, 
                const int n)
{
    double tmp[kc*nc] __attribute__ ((aligned(64)));
    double* restrict ptr = &tmp[0];

    for (int i = 0; i < kc; ++i)
        for (int j = 0; j < nc; ++j)
            *ptr++ = *(src + i*n + j);

    ptr = &tmp[0];

    // const int inc_k = nc/nr;
    for (int k = 0; k < nc; k+=nr)
        for (int j = 0; j < kc*nc; j+=nc)
            for (int i = 0; i < nr; ++i)
                *dst++ = *(ptr + k + i + j);

}

void blis_dgemm_ref(
        const int n,
        const double* restrict A,
        const double* restrict B,
        double* restrict C,
        const int mc,
        const int nc,
        const int kc
    )
{
    int mr = 2;
    int nr = 8;
    double locA[mc*kc] __attribute__ ((aligned(64)));
    double locB[kc*nc] __attribute__ ((aligned(64)));
    int ii,jj,kk,i,j;
    #pragma omp parallel num_threads(4) shared(A,B,C) private(ii,jj,kk,i,j,locA,locB)
    {//use all threads in parallel
        #pragma omp for
        // partitions C and B into wide column panels
        for ( jj = 0; jj < n; jj+=nc) {
        // A and the current column of B are partitioned into col and row panels
            for ( kk = 0; kk < n; kk+=kc) {
                cpack(locB, B + kk*n + jj, nc, kc, nr, n);
                // partition current panel of A into blocks
                for ( ii = 0; ii < n; ii+=mc) {
                    rpack(locA, A + ii*n + kk, kc, mc, mr, n);
                    for ( i = 0; i < min(n-ii,mc); i+=mr) {
                        for ( j = 0; j < min(n-jj,nc); j+=nr) {
                            // inner kernel that compues 2 x 8 block
                            dgemm_2x8_sse( kc,
                                       locA + i*kc          ,  mr,
                                       locB + j*kc          ,  nr,
                                       C + (i+ii)*n + (j+jj),  n );
                        }
                    }
                }
            }
        }
    }
}

double compute_gflops(const double time, const int n)
{
    // computes the gigaflops for a square matrix-matrix multiplication
    double gflops;
    gflops = (double) (2.0*n*n*n)/time/1.0e9;
    return(gflops);
}

// ******* MAIN ********//
void main() {
    clock_t time1, time2;
    double time3;
    double gflops;
    const int trials = 10;

    int nmax = 4096;
    printf("%10s %10s\n","N","Gflops/s");

    int mc = 128;
    int kc = 256;
    int nc = 128;

    for (int n = kc; n <= nmax; n+=kc) { //assuming kc is the max dim
        double *A = NULL;
        double *B = NULL;
        double *C = NULL;

        A = _mm_malloc (n*n * sizeof(*A),64);
        B = _mm_malloc (n*n * sizeof(*B),64);
        C = _mm_malloc (n*n * sizeof(*C),64);

        srand(time(NULL));

        // Create the matrices
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                A[i*n + j] = (double) rand()/RAND_MAX;
                B[i*n + j] = (double) rand()/RAND_MAX;
                //D[j*n + i] = B[i*n + j]; // Transpose
                C[i*n + j] = 0.0;
            }
        }

            // warmup
            zeromat(C,n);
            blis_dgemm_ref(n,A,B,C,mc,nc,kc);
            zeromat(C,n);
            time2 = 0;
            for (int count = 0; count < trials; count++){// iterations per experiment here
                    time1 = clock();
                    blis_dgemm_ref(n,A,B,C,mc,nc,kc);
                    time2 += clock() - time1;
                    zeromat(C,n);
                }
            time3 = (double)(time2)/CLOCKS_PER_SEC/trials;
            gflops = compute_gflops(time3, n);
            printf("%10d %10f\n",n,gflops);

        _mm_free(A);
        _mm_free(B);
        _mm_free(C);

        }

    printf("tests are done\n");
}

编辑 1

操作系统 = Win 7 64 位

编译器 = gcc 4.8.1，但是 32 位和 mingw(也是 32 位。我正在努力获得 mingw64 的“非安装”版本，这样我就可以生成更快的代码/使用更多的 XMM 寄存器等。如果任何人都有指向与 mingw-get 时尚相似的 mingw64 安装的链接，请发布。我的工作计算机有太多管理员限制。

最佳答案

包装

您似乎过于频繁地打包 A 矩阵的 block 。你做

rpack(locA, A + ii*n + kk, kc, mc, mr, n);

但是这只依赖于 ii 和 kk 而不是依赖于 jj 但它在 jj 的内部循环中> 所以你为 jj 的每次迭代重新打包相同的东西。我认为没有必要。在我的代码中，我在矩阵乘法之前进行了打包。当值仍在缓存中时，将矩阵乘法打包到内部可能更有效，但这样做比较棘手。但是打包是一个 O(n^2) 操作，矩阵乘法是一个 O(n^3) 操作，所以在矩阵乘法之外打包大型矩阵并不是很低效(我从测试中也知道 - 注释掉包装只会改变几个百分点的效率)。但是，通过在每次 jj 迭代中使用 rpack 重新打包，您已经有效地使其成为一个 O(n^3) 操作。

墙时间

你想要墙上的时间。 On Unix the clock() function does not return the wall time (虽然它在 Windows 上使用 MSVC)。它返回每个线程的累计时间。这是我在 SO for OpenMP 上看到的最常见的错误之一。

使用 omp_get_wtime() 获取挂钟时间。

请注意，我不知道 clock() 函数如何与 MinGW 或 MinGW-w64(它们是单独的项目)一起工作。 MinGW 链接到 MSVCRT，所以我猜 clock() 与 MinGW 返回挂钟时间，就像它与 MSVC 一样。但是，MinGW-w64 没有链接到 MSVCRT(据我所知，它链接到 glibc 之类的东西)。 MinGW-w64 中的 clock() 可能与 Unix 中的 clock() 执行相同的操作。

超线程

超线程适用于经常使 CPU 停顿的代码。这实际上是大部分代码，因为很难编写不停止 CPU 的代码。这就是英特尔发明超线程的原因。与优化代码相比，任务切换和让 CPU 做其他事情更容易。然而，对于高度优化的代码，超线程实际上会产生更糟糕的结果。在我自己的矩阵乘法代码中确实如此。将线程数设置为您拥有的物理内核数(在您的情况下为两个)。

我的代码

下面是我的代码。我没有在此处包含 inner64 函数。您可以在 Difference in performance between MSVC and GCC for highly optimized matrix multplication code 找到它(使用令人讨厌且误导性的名称 AddDot4x4_vec_block_8wide)

我在阅读 Goto 论文之前以及在阅读 Agner Fog 的优化手册之前编写了这段代码。您似乎在主循环中重新排序/打包矩阵。这可能更有意义。我认为我不会像您那样对它们重新排序，而且我只对其中一个输入矩阵 (B) 重新排序，而不是像您那样重新排序。

此代码在我的系统 (Xeon E5-1620@3.6) 上使用 Linux 和 GCC 的性能约为此矩阵大小 (4096x4096) 峰值的 75%。对于这种矩阵大小，英特尔的 MKL 在我的系统上获得了大约 94% 的峰值，因此显然还有改进的余地。

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <immintrin.h>

extern "C" void inner64(const float *a, const float *b, float *c);
void (*fp)(const float *a, const float *b, float *c) = inner64;

void reorder(float * __restrict a, float * __restrict b, int n, int bs) {
    int nb = n/bs;
    #pragma omp parallel for
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int i2=0; i2<bs; i2++) {
                for(int j2=0; j2<bs; j2++) {
                    b[bs*bs*(nb*i+j) + bs*i2+j2]= a[bs*(i*n+j) + i2*n + j2];    
                }
            }
        }
    }
}

inline void gemm_block(float * __restrict a, float * __restrict b, float * __restrict c, int n, int n2) {
    for(int i=0; i<n2; i++) {
        fp(&a[i*n], b, &c[i*n]);
    }
}

void gemm(float * __restrict a, float * __restrict b, float * __restrict c, int n, int bs) {
    int nb = n/bs;
    float *b2 = (float*)_mm_malloc(sizeof(float)*n*n,64);
    reorder(b,b2,n,bs);
    #pragma omp parallel for
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int k=0; k<nb; k++) {
                gemm_block(&a[bs*(i*n+k)],&b2[bs*bs*(k*nb+j)],&c[bs*(i*n+j)], n, bs);
            }
        }
    }
    _mm_free(b2);
}

int main() {
    float peak = 1.0f*8*4*2*3.69f;
    const int n = 4096;
    float flop = 2.0f*n*n*n*1E-9f;
    omp_set_num_threads(4);

    float *a = (float*)_mm_malloc(sizeof(float)*n*n,64);
    float *b = (float*)_mm_malloc(sizeof(float)*n*n,64);
    float *c = (float*)_mm_malloc(sizeof(float)*n*n,64);
    for(int i=0; i<n*n; i++) {
        a[i] = 1.0f*rand()/RAND_MAX;
        b[i] = 1.0f*rand()/RAND_MAX;
    }

    gemm(a,b,c,n,64); //warm OpenMP up
    while(1) {
        for(int i=0; i<n*n; i++) c[i] = 0;
        double dtime = omp_get_wtime();
        gemm(a,b,c,n,64);   
        dtime = omp_get_wtime() - dtime;
        printf("time %.2f s, efficiency %.2f%%\n", dtime, 100*flop/dtime/peak);
    }
}

关于最多不能超过 50%。矩阵乘法的理论性能，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/23964334/

文章推荐： android - 如何通过其引用 ID 获取主题

文章推荐： java - 安卓加密音频播放器

javascript - (不能)在javascript中获取元素样式的一部分
我是 javascript 的新手(今天开始弄乱它)。我正在尝试更改名为“bar”的元素(div)的高度。条形图将成为图表的一部分。我可以毫无问题地将按钮连接到更改栏高度的函数。一切正常，除了条形
ios - 不能 "addSubView"
错误 -> “UIVIew”没有名为“addSubView”的成员 override func viewDidLoad() { super.viewDidLoad() // Do an
swift - 不能 CGEventTapCreate
我在命令行工具项目中复制并粘贴了 main.swift 下面链接中的代码。 How do you use CGEventTapCreate in Swift? 它构建没有错误，但是当我运行时， gua
c++ - 不能 dynamic_cast
我在尝试编译我的代码时遇到以下错误。 ERROR! ..\myCode\CPOI.cpp:68:41: error: cannot dynamic_cast 'screenType' (of type
不能 strcat_s 多个字符到字符串指针
我正在尝试将多个字符串连接到一个我已为其分配内存的字符串指针。这是一个例子: char *finalNumString = malloc(sizeof(char)*1024); finalNumStr
不能 dup2 将管道的末端写入标准输出
我在使用 dup2() 和 pipe() 时遇到问题。当我尝试将管道的写入端 dup2 到 STDOUT_FILENO 时，我收到了 EBADF。我用 gdb 在 dup2(pout[1], ST
Git:不能 pull
首先，我应该说我运行的是 Windows 7。因此，今天早上我尝试像往常一样从我的存储库中提取数据，但我做不到。我得到了错误: The authenticity of host 'github.co
python - 不能 "activate"virtualenv
刚开始在虚拟环境中运行Python，乱用Django，无法激活虚拟环境。花了最后 4 个小时尝试在本地终端/VS 代码上激活虚拟环境 (venv)，但没有成功。避免使用“sudo pip inst
r - 数据框可以做什么而 tibble 不能？
Tidyverse 的粉丝经常给出使用小标题而不是数据框的几个优点。它们中的大多数似乎旨在保护用户免于犯错误。例如，与数据框不同，小标题: 不需要 ,drop=FALSE不从数据中删除维度的论据。不
javascript - 不能 Dockerize Elm
我一直在对 Elm 应用程序进行 docker 化时遇到问题。据我所知，我已经创建了一个完整且有效的 Docker 文件……但它不起作用。我会解释的。所以我的脚本在 3 个文件中运行。首先是启动
java - 不能 Mockbean HttpServletResponse
我可以在 Controller 中使用@Autowired，例如 @RestController public class Index { @Autowired HttpServlet
function - 不能 `compose` 和方法和函数
我定义了一个方法和一个函数: def print(str:String) = println val intToString = (n:Int) => n.toString 现在我想创作它们。我的问
javascript - 不能 .map() 一个看似数组的值
当我控制台单独记录变量“pokemons”时，它确实返回一个数组。但是当我尝试映射它时，出现错误: TypeError: pokemons.map is not a function 我的代码: im
python - 不能 `import smtplib`
每当我尝试在 Python 解释器中导入 smtplib 时，都会收到此错误: ImportError: cannot import name fix_eols 我该如何解决这个问题？编辑:这是完整
javascript - 为什么 {{#each}} 可以正常工作而 {{#with}} 不能？
我正在使用 Meteor.js 开发一个项目，但在使用 Handlebar 时遇到了一些问题:我想检索集合的最后一项，并显示字段:其中包含 html 的文本: 这是我的javascript代码: Te
java - onTouchEvent 不能@Override
你好，我想使用 Service 实现 GestureDetector 但是我有这个错误The method onTouchEvent(MotionEvent) of type GestureServi
java - 不能@Autowired接口(interface)
我正在尝试在 Controller bean 中 Autowiring 接口(interface) 在我放置的上下文配置文件中和我的 Controller 类是 @Controller pub
c++ - 不能 #include
我试图在 mainwindow.cpp 中包含 QtSvg，但是当我编译时它说无法打开包含文件:QtSvg。我已经在我的 *.pro 文件中添加了这个(QT += svg)。我可以知道可能是什么问题吗
postgresql - 不能 100% 确定这是否安全
鉴于以下 PostgreSQL 代码，我认为这段代码不容易受到 SQL 注入(inject)攻击: _filter 'day' _start 1 _end 10 _sort 'article_name
MYSQL:不能/如何将子查询用作完全合格的 CTE
我想执行以下操作。这在 MySQL 中是非法的。 PostGRESQL 中关联的 CTE(“with”子句)有效。这里的假设是 MySQL 中的子查询不是完全限定的 CTE。请注意:这个查询显然非常

太空狗

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

最多不能超过 50%。矩阵乘法的理论性能