gpt4 book ai didi

使用数组指针的 C++ 简单技巧可提高性能

转载 作者:行者123 更新时间:2023-11-28 05:36:15 24 4
gpt4 key购买 nike

我在我的堆排序例程中发现了一个奇怪的行为(见下文)。

void hpsort(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;

if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;

for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0) { data[0] = rra; break; }
}

i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}

return;
}

如果我做一个像这样调用这个例程的基准测试

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array);

需要 X 秒。但是如果我只添加一行

void hpsort(unsigned int n, double *data)
{
++data;

并做基准测试

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array-1);

大约需要 0.96X 秒(即快 4%)。这种性能差异在每次运行中都是稳定的。

感觉 g++ 编译器在第一种情况下进行边界检查,而在第二种情况下我可以以某种方式欺骗它。但是我从来没有听说过对 C 数组进行边界检查...

知道为什么我会在性能上出现这种奇怪的差异吗?

附注编译是用 g++ -O2 完成的。顺便说一句,将 unsigned long 更改为 long int 也会将性能降低大约 3 到 4%。

p.p.s. “定义的行为”版本也显示了性能改进

void hpsort(unsigned int n, double *data)
{
--data;

和基准为

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array+1);

p.p.p.s.性能对比

Size of array   Faster  Slower
10 1.46 1.60
100 1.41 1.62
1000 1.84 1.96
10000 1.78 1.87
100000 1.72 1.80
1000000 1.76 1.83
10000000 1.98 2.03

这是我的 hpsort.cpp

代码
 void hpsort1(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;

if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;

for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0)
{
data[0] = rra;
break;
}
}

i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}

void hpsort2(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;

--data;

if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;

for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0)
{
data[0] = rra;
break;
}
}

i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}

这是我的基准测试代码heapsort-benchmark.cpp

 #include <vector>
#include <alloca.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <math.h>

using namespace std;

void hpsort1(unsigned long n, double *data);
void hpsort2(unsigned long n, double *data);

typedef double element_t;
typedef void(*Test)(element_t*, element_t*, int);

const int sizes [] = {10, 100, 1000, 10000, 100000, 1000000, 10000000};
const int largest_size = sizes[sizeof(sizes)/sizeof(int)-1];

vector<double> result_times; // results are pushed into this vector

clock_t start_time;

void do_row(int size) // print results for given size of processed array
{
printf("%10d \t", size);
for (int i=0; i<result_times.size(); ++i) printf("%.2f\t", result_times[i]);
printf("\n");
result_times.clear();
}

inline void start_timer() { start_time = clock(); }

inline double timer()
{
clock_t end_time = clock();
return (end_time - start_time)/double(CLOCKS_PER_SEC);
}

void run(Test f, element_t* first, element_t* last, int number_of_times)
{
start_timer();
while (number_of_times-- > 0) f(first,last,number_of_times);
result_times.push_back(timer());
}

void random_shuffle(double *first, double *last)
{
size_t i, j, n;
double tmp;
n = last-first;

srand((unsigned int)0);

for (i=n-1; i>0; --i)
{
j = rand() % (i+1);
tmp = first[i];
first[i] = first[j];
first[j] = tmp;
}
return;
}

void hpsort1_test(element_t* first, element_t* last, int number_of_times)
{
size_t num_elements = (last-first);
element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
memcpy(array, first, sizeof(element_t)*num_elements);
hpsort1(num_elements, array);
free(array);
}

void hpsort2_test(element_t* first, element_t* last, int number_of_times)
{
size_t num_elements = (last-first);
element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
memcpy(array, first, sizeof(element_t)*num_elements);
hpsort2(num_elements, array+1);
free(array);
}

void initialize(element_t* first, element_t* last)
{
element_t x = 0.;
while (first != last) { *first++ = x; x += 1.; }
}

double logtwo(double x) { return log(x)/log((double) 2.0); }

int number_of_tests(int size)
{
double n = (double)size;
double largest_n = (double)largest_size;
return int(floor((largest_n * logtwo(largest_n)) / (n * logtwo(n))));
}

void run_tests(int size)
{
const int n = number_of_tests(size);

element_t *buffer = (element_t *)malloc(size * sizeof(element_t));
element_t* buffer_end = &buffer[size];

initialize(buffer, buffer + size); // fill in the elements

for (int i = 0; i < size/2; ++i) buffer[size/2 + i] = buffer[i]; // fill in the second half with values of the first half
//random_shuffle(buffer, buffer_end); // shuffle if you do not want an ordered array

run(hpsort2_test, buffer, buffer_end, n);
run(hpsort1_test, buffer, buffer_end, n);

do_row(size);

free(buffer);
}


int main()
{
const int n = sizeof(sizes)/sizeof(int);
for (int i = 0; i < n; ++i) run_tests(sizes[i]);
}

我编译并运行它为

g++ -O2 -c heapsort-benchmark.cpp
g++ -O2 -c hpsort.cpp
g++ -O2 -o heapsort-benchmark heapsort-benchmark.o hpsort.o
./heapsort-benchmark

第一列将是更快的版本

最佳答案

无法像 OP 那样获得一致的结果。

IMO OP 的小差异不是代码差异的一部分,而是测试的一部分。

void hpsort(unsigned long n, double *data) {
unsigned long i, ir, j, l;
double rra;
...
}

void hpsort1(unsigned long n, double *data) {
--data;
unsigned long i, ir, j, l;
double rra;
...
}

测试代码

#include <time.h>
#include <stdlib.h>

void test(const char *s, int code, size_t n) {
srand(0);
double* array = (double*) malloc(sizeof(double) * n * 2);
// make 2 copies of same random data
for (size_t i = 0; i < n; i++) {
array[i] = rand();
array[i+n] = array[i];
}

double dt0;
double dt1;
clock_t c0 = clock();
clock_t c1,c2;
if (code) {
hpsort1(n, array + 1);
c1 = clock();
hpsort(n, &array[n]);
c2 = clock();
dt0 = (double) (c2 - c1)/CLOCKS_PER_SEC;
dt1 = (double) (c1 - c0)/CLOCKS_PER_SEC;
} else {
hpsort(n, array);
c1 = clock();
hpsort1(n, &array[n]+1);
c2 = clock();
dt0 = (double) (c1 - c0)/CLOCKS_PER_SEC;
dt1 = (double) (c2 - c1)/CLOCKS_PER_SEC;
}
free(array);
const char *cmp = dt0==dt1 ? "==" : (dt0<dt1 ? "<" : ">");
printf("%s %f %2s %f Diff:% f%%\n", s, dt0, cmp, dt1, 100*(dt1-dt0)/dt0);
}

int main() {
//srand((unsigned) time(0));
size_t n = 3000000;
for (int i=0; i<10; i++) {
test("heap first", 0, n);
test("heap1 first", 1, n);
fflush(stdout);
}
}

输出

heap  first 1.263000  > 1.201000  Diff:-4.908947%
heap1 first 1.295000 < 1.326000 Diff: 2.393822%
heap first 1.342000 > 1.295000 Diff:-3.502235%
heap1 first 1.279000 < 1.295000 Diff: 1.250977%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.295000 > 1.294000 Diff:-0.077220%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.295000 > 1.279000 Diff:-1.235521%
heap first 1.263000 < 1.295000 Diff: 2.533650%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.295000 > 1.263000 Diff:-2.471042%
heap1 first 1.295000 < 1.310000 Diff: 1.158301%
heap first 1.310000 < 1.326000 Diff: 1.221374%
heap1 first 1.326000 < 1.342000 Diff: 1.206637%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.264000 < 1.295000 Diff: 2.452532%
heap first 1.279000 > 1.264000 Diff:-1.172791%
heap1 first 1.279000 > 1.264000 Diff:-1.172791%

关于使用数组指针的 C++ 简单技巧可提高性能,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/38247957/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com