gpt4 book ai didi

c++ - 使用仿函数提供函数或运算符作为 C++ 模板参数的性能损失?

转载 作者:塔克拉玛干 更新时间:2023-11-03 01:31:05 25 4
gpt4 key购买 nike

我有一系列复杂的函数执行非常相似的任务,除了函数中间的一个运算符。我的代码的简化版本可能是这样的:

#include <assert.h>

// Stores the element-wise XOR of the first n bytes of buffer1 and buffer2
// into res. All three buffers must hold at least n bytes.
static void memopXor(char * buffer1, char * buffer2, char * res, unsigned n){
    unsigned idx = 0;
    while (idx != n) {
        res[idx] = buffer1[idx] ^ buffer2[idx];
        ++idx;
    }
}

// Stores the element-wise sum of the first n bytes of buffer1 and buffer2
// into res. All three buffers must hold at least n bytes.
static void memopPlus(char * buffer1, char * buffer2, char * res, unsigned n){
    unsigned idx = 0;
    while (idx != n) {
        res[idx] = buffer1[idx] + buffer2[idx];
        ++idx;
    }
}

// Stores the element-wise product of the first n bytes of buffer1 and buffer2
// into res. All three buffers must hold at least n bytes.
static void memopMul(char * buffer1, char * buffer2, char * res, unsigned n){
    unsigned idx = 0;
    while (idx != n) {
        res[idx] = buffer1[idx] * buffer2[idx];
        ++idx;
    }
}


// Smoke test for the three hand-written memop variants: each one is run on
// two identical buffers {0, 1, 2, 3, 4} and every output byte is checked.
int main(int argc, char ** argv){
    char b1[5] = {0, 1, 2, 3, 4};
    char b2[5] = {0, 1, 2, 3, 4};

    // XOR of a value with itself is 0 for every element, including 4 ^ 4.
    char res1[5] = {};
    memopXor(b1, b2, res1, 5);

    assert(res1[0] == 0);
    assert(res1[1] == 0);
    assert(res1[2] == 0);
    assert(res1[3] == 0);
    assert(res1[4] == 0);

    // Sum of identical inputs: each element doubled.
    char res2[5] = {};
    memopPlus(b1, b2, res2, 5);

    assert(res2[0] == 0);
    assert(res2[1] == 2);
    assert(res2[2] == 4);
    assert(res2[3] == 6);
    assert(res2[4] == 8);

    // Product of identical inputs: each element squared.
    char res3[5] = {};
    memopMul(b1, b2, res3, 5);

    assert(res3[0] == 0);
    assert(res3[1] == 1);
    assert(res3[2] == 4);
    assert(res3[3] == 9);
    assert(res3[4] == 16);
}

使用 C++ 模板避免重复代码看起来是一个很好的案例,因此我正在寻找一种方法将我的代码更改为如下所示(伪代码):

#include <assert.h>

// Pseudocode (not valid C++): FUNCTION stands for the binary operation that
// should be plugged into the loop. For every index x in [0, n) the result of
// FUNCTION(buffer1[x], buffer2[x]) is stored into res[x].
template <FUNCTION>
void memop<FUNCTION>(char * buffer1, char * buffer2, char * res, size_t n){
for (size_t x = 0 ; x < n ; x++){
res[x] = FUNCTION(buffer1[x], buffer2[x]);
}
}

// Pseudocode usage of the desired interface: the operator to apply is passed
// as the template argument, and each result buffer is checked byte by byte.
int main(int argc, char ** argv){
char b1[5] = {0, 1, 2, 3, 4};
char b2[5] = {0, 1, 2, 3, 4};

// XOR of identical inputs: every byte is expected to be 0.
char res1[5] = {};
memop<operator^>(b1, b2, res1, 5);

assert(res1[0] == 0);
assert(res1[1] == 0);
assert(res1[2] == 0);
assert(res1[3] == 0);
assert(res1[4] == 0);

// Sum of identical inputs: every byte is expected to be doubled.
char res2[5] = {};
memop<operator+>(b1, b2, res2, 5);

assert(res2[0] == 0);
assert(res2[1] == 2);
assert(res2[2] == 4);
assert(res2[3] == 6);
assert(res2[4] == 8);

// Product of identical inputs: every byte is expected to be squared.
char res3[5] = {};
memop<operator*>(b1, b2, res3, 5);

assert(res3[0] == 0);
assert(res3[1] == 1);
assert(res3[2] == 4);
assert(res3[3] == 9);
assert(res3[4] == 16);
}

难点在于我不愿意接受最终代码有任何性能下降。这意味着任何涉及间接调用(通过 vtable 或函数指针)的解决方案都不可行。

这个问题的常见 C++ 解决方案似乎是将运算符包装在仿函数类的 operator() 方法中调用。通常会得到类似于以下代码的内容:

#include <assert.h>

// Applies the binary functor type Op element-wise: for each index in [0, n),
// res[i] receives the result of calling a default-constructed Op on
// buffer1[i] and buffer2[i]. Op must be default-constructible and callable
// with two char arguments.
template <typename Op>
void memop(char * buffer1, char * buffer2, char * res, unsigned n){
    Op combine;
    unsigned idx = 0;
    while (idx != n) {
        res[idx] = combine(buffer1[idx], buffer2[idx]);
        ++idx;
    }
}


// Stateless functor returning the bitwise XOR of two chars.
// operator() is const so the functor can also be invoked through const
// instances and const references.
struct Xor
{
    char operator()(char a, char b) const {
        return a ^ b;
    }
};

// Stateless functor returning the sum of two chars (narrowed back to char).
// operator() is const so the functor can also be invoked through const
// instances and const references.
struct Plus
{
    char operator()(char a, char b) const {
        return a + b;
    }
};

// Stateless functor returning the product of two chars (narrowed back to char).
// operator() is const so the functor can also be invoked through const
// instances and const references.
struct Mul
{
    char operator()(char a, char b) const {
        return a * b;
    }
};

// Smoke test for the functor-based memop: runs it with Xor, Plus and Mul on
// two identical buffers {0, 1, 2, 3, 4} and compares every output byte with
// a table of expected values.
int main(int argc, char ** argv){
    char b1[5] = {0, 1, 2, 3, 4};
    char b2[5] = {0, 1, 2, 3, 4};

    const char expectedXor[5] = {0, 0, 0, 0, 0};
    char res1[5] = {};
    memop<Xor>(b1, b2, res1, 5);
    for (int i = 0; i < 5; ++i){
        assert(res1[i] == expectedXor[i]);
    }

    const char expectedPlus[5] = {0, 2, 4, 6, 8};
    char res2[5] = {};
    memop<Plus>(b1, b2, res2, 5);
    for (int i = 0; i < 5; ++i){
        assert(res2[i] == expectedPlus[i]);
    }

    const char expectedMul[5] = {0, 1, 4, 9, 16};
    char res3[5] = {};
    memop<Mul>(b1, b2, res3, 5);
    for (int i = 0; i < 5; ++i){
        assert(res3[i] == expectedMul[i]);
    }
}

这样做会降低性能吗?

最佳答案

就基准测试(benchmark)而言,您贴出的代码几乎没有用。

// Runs the hand-written memopXor on two constant 5-byte buffers and returns
// the last byte of the result.
char cversion() {
    char lhs[5] = {0, 1, 2, 3, 4};
    char rhs[5] = {0, 1, 2, 3, 4};
    char out[5] = {};

    memopXor(lhs, rhs, out, 5);
    return out[4];
}

// Runs the template-based memop<Xor> on two constant 5-byte buffers and
// returns the last byte of the result.
char cppversion() {
    char lhs[5] = {0, 1, 2, 3, 4};
    char rhs[5] = {0, 1, 2, 3, 4};
    char out[5] = {};

    memop<Xor>(lhs, rhs, out, 5);
    return out[4];
}

被编译成这样的LLVM IR:

; Optimized IR for cversion(): no loop or call remains, the function simply
; returns the constant 0.
define signext i8 @cversion()() nounwind uwtable readnone {
ret i8 0
}

; Optimized IR for cppversion(): no loop or call remains, the function simply
; returns the constant 0.
define signext i8 @cppversion()() nounwind uwtable readnone {
ret i8 0
}

也就是说,编译器在编译期就完成了整个计算。

所以我冒昧地定义了一个新函数:

// Non-template entry point that forwards to memop<Xor>, giving the template
// instantiation a plain function with the same parameter list as memopXor.
void cppmemopXor(char * buffer1, char * buffer2, char * res, unsigned n)
{
    memop<Xor>(buffer1, buffer2, res, n);
}

并删除了 memopXor 上的 static 限定符,然后重复该实验:

; IR for the hand-written memopXor.
define void @memopXor(char*, char*, char*, unsigned int)(i8* nocapture %buffer1, i8* nocapture %buffer2, i8* nocapture %res, i32 %n) nounwind uwtable {
; If n == 0, jump straight to the exit block without entering the loop.
%1 = icmp eq i32 %n, 0
br i1 %1, label %._crit_edge, label %.lr.ph

.lr.ph: ; preds = %.lr.ph, %0
; Loop body: %indvars.iv is the 64-bit index, starting at 0.
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
; Load one byte from each input buffer at the current index.
%2 = getelementptr inbounds i8* %buffer1, i64 %indvars.iv
%3 = load i8* %2, align 1, !tbaa !0
%4 = getelementptr inbounds i8* %buffer2, i64 %indvars.iv
%5 = load i8* %4, align 1, !tbaa !0
; XOR the two bytes and store the result into res at the same index.
%6 = xor i8 %5, %3
%7 = getelementptr inbounds i8* %res, i64 %indvars.iv
store i8 %6, i8* %7, align 1, !tbaa !0
; Advance the index and leave the loop once its 32-bit truncation equals n.
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge: ; preds = %.lr.ph, %0
ret void
}

以及带有模板的 C++ 版本:

; IR for cppmemopXor: there is no call instruction in the body, only the
; loop itself; the exit label carries the mangled name of memop<Xor>.
define void @cppmemopXor(char*, char*, char*, unsigned int)(i8* nocapture %buffer1, i8* nocapture %buffer2, i8* nocapture %res, i32 %n) nounwind uwtable {
; If n == 0, jump straight to the exit block without entering the loop.
%1 = icmp eq i32 %n, 0
br i1 %1, label %_ZL5memopI3XorEvPcS1_S1_j.exit, label %.lr.ph.i

.lr.ph.i: ; preds = %.lr.ph.i, %0
; Loop body: %indvars.iv.i is the 64-bit index, starting at 0.
%indvars.iv.i = phi i64 [ %indvars.iv.next.i, %.lr.ph.i ], [ 0, %0 ]
; Load one byte from each input buffer at the current index.
%2 = getelementptr inbounds i8* %buffer1, i64 %indvars.iv.i
%3 = load i8* %2, align 1, !tbaa !0
%4 = getelementptr inbounds i8* %buffer2, i64 %indvars.iv.i
%5 = load i8* %4, align 1, !tbaa !0
; XOR the two bytes directly (no call to Xor::operator()) and store into res.
%6 = xor i8 %5, %3
%7 = getelementptr inbounds i8* %res, i64 %indvars.iv.i
store i8 %6, i8* %7, align 1, !tbaa !0
; Advance the index and leave the loop once its 32-bit truncation equals n.
%indvars.iv.next.i = add i64 %indvars.iv.i, 1
%lftr.wideiv = trunc i64 %indvars.iv.next.i to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %_ZL5memopI3XorEvPcS1_S1_j.exit, label %.lr.ph.i

_ZL5memopI3XorEvPcS1_S1_j.exit: ; preds = %.lr.ph.i, %0
ret void
}

正如预期的那样,它们在结构上完全相同,因为仿函数代码已完全内联(即使不了解 IR 也是可见的)。

请注意,这不是孤立的结果。例如,std::sort 的执行速度是 qsort 的两倍到三倍,因为它使用仿函数而不是间接函数调用。当然,使用模板化函数和仿函数意味着每个不同的实例化都会生成新代码,就像您手动编写函数一样,但这正是您手动执行的操作。

关于c++ - 使用仿函数提供函数或运算符作为 C++ 模板参数的性能损失?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/9595488/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com