gpt4 book ai didi

optimization - Heaviside函数的优化实现

转载 作者:行者123 更新时间:2023-12-03 15:34:19 27 4
gpt4 key购买 nike

我想( super )优化Heaviside函数的实现。

我正在研究速度特别重要的数值算法(在Fortran中)。它多次使用Heaviside函数,当前由signum内在函数实现,如下所示:

heaviside = 0.5*sign(1,x)+1

我主要对x是Intel处理器上的 double 实数的情况感兴趣。

是否有可能开发更有效的Heaviside功能实现方案?
也许使用汇编语言,超优化代码或调用现有的外部库?

最佳答案

您打算使用heaviside = 0.5*(sign(1,x)+1)吗?在任何情况下,使用gcc 4.8.1 fortran进行的测试都表明High Performance Mark的想法应该是有益的。这里有3种可能性:

heaviside1-原始
heaviside2-高性能马克的想法
heaviside3-另一种变体

  function heaviside1 (x)
double precision heaviside1, x
heaviside1 = 0.5 * (sign(1d0,x) + 1)
end

function heaviside2 (x)
double precision heaviside2, x
heaviside2 = sign(0.5d0,x) + 0.5
end

function heaviside3 (x)
double precision heaviside3, x
heaviside3 = 0
if (x .ge. 0) heaviside3 = 1
end

program demo
double precision heaviside1, heaviside2, heaviside3, x, a, b, c

do
x = 0.5 - RAND(0)
a = heaviside1(x)
b = heaviside2(x)
c = heaviside3(x)
print *, "x=", x, "heaviside(x)=", a, b, c
enddo
end

编译时,gcc会生成以下3个独立函数:
<heaviside1_>:
vmovsd xmm0,QWORD PTR [rcx]
vandpd xmm0,xmm0,XMMWORD PTR [rip+0x2d824]
vorpd xmm0,xmm0,XMMWORD PTR [rip+0x2d80c]
vaddsd xmm0,xmm0,QWORD PTR [rip+0x2d7f4]
vmulsd xmm0,xmm0,QWORD PTR [rip+0x2d81c]
ret

<heaviside2_>:
vmovsd xmm0,QWORD PTR [rcx]
vandpd xmm0,xmm0,XMMWORD PTR [rip+0x2d844]
vorpd xmm0,xmm0,XMMWORD PTR [rip+0x2d85c]
vaddsd xmm0,xmm0,QWORD PTR [rip+0x2d844]
ret

<heaviside3_>:
vxorpd xmm0,xmm0,xmm0
vmovsd xmm1,QWORD PTR [rip+0x2d844]
vcmplesd xmm0,xmm0,QWORD PTR [rcx]
vandpd xmm0,xmm1,xmm0
ret

使用gcc编译时,heaviside1会生成一个乘法,这可能会降低执行速度。
heaviside2消除了乘法。
heaviside3与heaviside2的指令数相同,但使用的内存访问少2。

对于独立功能:
             instruction   memory reference
count count
heaviside1 6 5
heaviside2 5 4
heaviside3 5 2

这些函数的内联代码避免了对返回指令的需求,理想情况下将参数传递给寄存器,并用所需的常数预加载其他寄存器。确切的结果取决于所使用的编译器和调用代码。内联代码的估计:
             instruction   memory reference
count count
heaviside1 4 0
heaviside2 3 0
heaviside3 2 0

看起来该函数最多可以由两个编译器生成的指令来处理:vcmplesd + vandpd。如果参数为负,则第一条指令将创建全零的掩码,否则将创建全零的掩码。第二条指令将掩码应用于寄存器常量值1,以产生零或一的结果值。

尽管我尚未对这些函数进行基准测试,但看起来重载函数应该不需要太多的执行时间。

--- 09/23/2013:添加了x86_64汇编语言版本和C语言基准测试---

文件功能
//----------------------------------------------------------------------------
.intel_syntax noprefix
.text

//-----------------------------------------------------------------------------
// this heaviside function generates its own register constants
// double heaviside_a1 (double arg);
.globl heaviside_a1

heaviside_a1:
mov rax,0x3ff0000000000000
xorpd xmm1,xmm1 # xmm1: constant 0.0
cmplesd xmm1,xmm0 # xmm1: mask (all Fs or all 0s)
movq xmm0,rax # xmm0: constant 1.0
andpd xmm0,xmm1
retq

//-----------------------------------------------------------------------------
// this heaviside function uses register constants passed from caller
// double heaviside_a2 (double arg, double const0, double const1);
.globl heaviside_a2

heaviside_a2:
cmplesd xmm1,xmm0 # xmm1: mask (all Fs or all 0s)
movsd xmm0,xmm2 # xmm0: constant 1.0
andpd xmm0,xmm1
retq

//-----------------------------------------------------------------------------

文件ctest.c
#define __USE_MINGW_ANSI_STDIO 1
#include <windows.h>
#include <stdio.h>
#include <stdint.h>

// functions.s
double heaviside_a1 (double x);
double heaviside_a2 (double arg, double const0, double const1);

//-----------------------------------------------------------------------------

static double heaviside_c1 (double x)
{
double result = 0;
if (x >= 0) result = 1;
return result;
}

//-----------------------------------------------------------------------------
//
// queryPerformanceCounter - similar to QueryPerformanceCounter, but returns
// count directly.

uint64_t queryPerformanceCounter (void)
{
LARGE_INTEGER int64;
QueryPerformanceCounter (&int64);
return int64.QuadPart;
}

//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count direcly.

uint64_t queryPerformanceFrequency (void)
{
LARGE_INTEGER int64;

QueryPerformanceFrequency (&int64);
return int64.QuadPart;
}

//----------------------------------------------------------------------------
//
// lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation
//
static uint64_t lfsr64gpr (uint64_t data, uint64_t mask)
{
uint64_t carryOut = data >> 63;
uint64_t maskOrZ = -carryOut;
return (data << 1) ^ (maskOrZ & mask);
}

//---------------------------------------------------------------------------

int runtests (uint64_t pattern, uint64_t mask)
{
uint64_t startCount, elapsed, index, loops = 800000000;
double ns;
double total = 0;

startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
double x, result;
pattern = lfsr64gpr (pattern, mask);
x = (double) (int64_t) pattern;
result = heaviside_c1 (x);
total += result;
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("heaviside_c1: %7.2f ns\n", ns);

startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
double x, result;
pattern = lfsr64gpr (pattern, mask);
x = (double) (int64_t) pattern;
result = heaviside_a1 (x);
//printf ("heaviside_a1 (%lf): %lf\n", x, result);
total += result;
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("heaviside_a1: %7.2f ns\n", ns);

startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
double x, result;
const double const0 = 0.0;
const double const1 = 1.0;
pattern = lfsr64gpr (pattern, mask);
x = (double) (int64_t) pattern;
result = heaviside_a2 (x, const0, const1);
//printf ("heaviside_a2 (%lf): %lf\n", x, result);
total += result;
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("heaviside_a2: %7.2f ns\n", ns);
return total;
}

//---------------------------------------------------------------------------

int main (void)
{
uint64_t mask;

mask = 0xBEFFFFFFFFFFFFFF;

// raise our priority to increase measurement accuracy
SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);

printf ("using pseudo-random data\n");
runtests (1, mask);
return 0;
}

//---------------------------------------------------------------------------
mingw64 build command: gcc -Wall -Wextra -O3 -octest.exe ctest.c functions.s
英特尔®酷睿i7-2600K在4.0 GHz时的程序输出:
using pseudo-random data
heaviside_c1: 2.24 ns
heaviside_a1: 2.00 ns
heaviside_a2: 2.00 ns

这些计时结果包括伪随机参数生成的执行和结果总计代码,这些代码可防止优化程序消除否则未使用的heaviside_c1局部函数。

heaviside_c1来自原始的fortran建议,已移植到C。
heaviside_a1是一种汇编语言实现。
heaviside_a2是汇编语言版本的修改,它使用调用者传递的寄存器常量来避免生成它们的开销。对于我的处理器,基准测试没有显示传递常量的优势。

汇编语言函数假定xmm0返回结果,并且xmm1和xmm2可用作暂存器。这对于Windows使用的x64调用约定有效。对于其他调用约定,应确认该假设。

为了避免访问内存,汇编语言版本希望参数通过寄存器(XMM0)中的值传递。因为这不是fortran的默认值,所以需要特殊的声明。这对于gfortran 64位似乎正常工作:
  interface
real(c_double) function heaviside_a1(x)
use iso_c_binding, only: c_double
real(c_double), VALUE :: x
end function heaviside_a1
end interface

关于optimization - Heaviside函数的优化实现,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/18787425/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com