使用数组指针的 C++ 简单技巧可提高性能-6ren

使用数组指针的 C++ 简单技巧可提高性能

转载作者：行者123 更新时间：2023-11-28 05:36:15

我在我的堆排序例程中发现了一个奇怪的行为(见下文)。

// In-place heapsort of data[0..n-1] into ascending order, using a 0-based
// binary max-heap laid out in the array itself.
//   n    - number of elements
//   data - pointer to the first element
void hpsort(unsigned long n, double *data)
{
  if (n < 2) return;                          // 0 or 1 element: already sorted

  unsigned long pending = (n - 2) / 2 + 1;    // internal nodes still to be heapified
  unsigned long last = n - 1;                 // last index that still belongs to the heap

  while (true)
    {
      double held;                            // value that is looking for its slot

      if (pending > 0)
        {
          // Heap construction: take the next internal node, right to left.
          --pending;
          held = data[pending];
        }
      else
        {
          // Extraction: move the current maximum to the end of the heap and
          // re-insert the element it displaced.
          held = data[last];
          data[last] = data[0];
          --last;
          if (last == 0)
            {
              data[0] = held;                 // final element, sort complete
              break;
            }
        }

      // Sift down: promote the larger child until held fits.
      unsigned long hole = pending;
      unsigned long child = 2 * hole + 1;
      while (child <= last)
        {
          if (child < last && data[child] < data[child + 1]) ++child;
          if (!(held < data[child])) break;
          data[hole] = data[child];
          hole = child;
          child = 2 * child + 1;
        }
      data[hole] = held;
    }
}

如果我做一个像这样调用这个例程的基准测试

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array);

需要 X 秒。但是如果我只添加一行

void hpsort(unsigned int n, double *data)
{
   ++data;

并做基准测试

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array-1);

大约需要 0.96X 秒(即快 4%)。这种性能差异在每次运行中都是稳定的。

感觉 g++ 编译器在第一种情况下进行边界检查，而在第二种情况下我可以以某种方式欺骗它。但是我从来没有听说过对 C 数组进行边界检查...

知道为什么我会在性能上出现这种奇怪的差异吗？

附注：编译时使用的是 g++ -O2。顺便说一句，把 unsigned long 改成 long int 也会使性能下降大约 3% 到 4%。

p.p.s. 行为有明确定义（不依赖未定义行为）的版本同样表现出性能提升：

void hpsort(unsigned int n, double *data)
{
   --data;

和基准为

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array+1);

p.p.p.s.性能对比

Size of array   Faster  Slower
        10      1.46    1.60
       100      1.41    1.62
      1000      1.84    1.96
     10000      1.78    1.87
    100000      1.72    1.80
   1000000      1.76    1.83
  10000000      1.98    2.03

这是我的 hpsort.cpp

代码

 // Heapsort, variant 1: sorts data[0..n-1] in place into ascending order,
 // indexing the caller's pointer directly (0-based max-heap).
 //   n    - number of elements
 //   data - pointer to the first element
 void hpsort1(unsigned long n, double *data)
 {
   unsigned long i, ir, j, l;
   double rra;

   // Fewer than two elements: nothing to do.
   if (n < 2) return;
   // l counts down over the internal nodes during heap construction;
   // ir is the last index still inside the heap.
   l = (n - 2) / 2 + 1;
   ir = n - 1;

   for (;;)
     {
       // Construction phase: pick the next internal node to sift down.
       if (l > 0) rra = data[--l];
       else
         {
           // Extraction phase: the maximum goes to the end of the heap and
           // the element it displaced is re-inserted from the root.
           rra = data[ir];
           data[ir] = data[0];
           if (--ir == 0)
             {
               // One element left: it is the smallest, so the sort is done.
               data[0] = rra;
               break;
             }
         }

       // Sift rra down from node l; j is the left child of i.
       i = l;
       j = l + l + 1;
       while (j <= ir)
         {
           // Prefer the larger of the two children.
           if (j < ir && data[j] < data[j+1]) ++j;
           if (rra < data[j])
             {
               // Promote the child and descend to its left child (2*j + 1).
               data[i] = data[j];
               i = j;
               j += j + 1;
             }
           else break;
         }
       data[i] = rra;
     }
   return;
 }

 // Heapsort, variant 2: same algorithm as hpsort1, but the base pointer is
 // stepped back by one element before any access. The n sorted elements are
 // therefore data[-1..n-2] relative to the pointer the caller passes; the
 // benchmark driver passes array+1 so that array[0..n-1] is sorted.
 //   n    - number of elements
 //   data - pointer to the element one past the first element to sort
 void hpsort2(unsigned long n, double *data)
 {
   unsigned long i, ir, j, l;
   double rra;

   // Shift the base pointer back one slot (done even when n < 2).
   --data;

   // Fewer than two elements: nothing to do.
   if (n < 2) return;
   // l counts down over the internal nodes during heap construction;
   // ir is the last index still inside the heap.
   l = (n - 2) / 2 + 1;
   ir = n - 1;

   for (;;)
     {
       // Construction phase: pick the next internal node to sift down.
       if (l > 0) rra = data[--l];
       else
         {
           // Extraction phase: the maximum goes to the end of the heap and
           // the element it displaced is re-inserted from the root.
           rra = data[ir];
           data[ir] = data[0];
           if (--ir == 0)
             {
               // One element left: it is the smallest, so the sort is done.
               data[0] = rra;
               break;
             }
         }

       // Sift rra down from node l; j is the left child of i.
       i = l;
       j = l + l + 1;
       while (j <= ir)
         {
           // Prefer the larger of the two children.
           if (j < ir && data[j] < data[j+1]) ++j;
           if (rra < data[j])
             {
               // Promote the child and descend to its left child (2*j + 1).
               data[i] = data[j];
               i = j;
               j += j + 1;
             }
           else break;
         }
       data[i] = rra;
     }
   return;
 }

这是我的基准测试代码heapsort-benchmark.cpp

 #include <vector>
 #include <alloca.h>
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <time.h>
 #include <math.h>

 // Sorting routines under test; their definitions live in hpsort.cpp.
 void hpsort1(unsigned long n, double *data);
 void hpsort2(unsigned long n, double *data);

 // Element type of the arrays being sorted.
 typedef double element_t;
 // Signature of a benchmark driver: (first, last, iteration counter).
 typedef void(*Test)(element_t*, element_t*, int);

 // Array sizes to benchmark, in ascending order.
 const int sizes [] = {10, 100, 1000, 10000, 100000, 1000000, 10000000};
 // Last (largest) entry of sizes; element count derived from the array itself.
 const int largest_size = sizes[sizeof(sizes)/sizeof(sizes[0])-1];

 // Qualified explicitly instead of relying on a file-scope
 // `using namespace std;`, which would pull every std name into the global
 // namespace next to this file's own random_shuffle.
 std::vector<double> result_times; // results are pushed into this vector

 // Reference timestamp written by start_timer() and read by timer().
 clock_t start_time;

 // Print one row of the results table: the array size followed by every
 // timing currently stored in result_times, then empty result_times so the
 // next row starts clean.
 //   size - number of elements in the array that was processed
 void do_row(int size)
 {
   printf("%10d  \t", size);
   // result_times.size() is unsigned; a size_t index avoids the
   // signed/unsigned comparison an int counter would cause.
   for (size_t i = 0; i < result_times.size(); ++i) printf("%.2f\t", result_times[i]);
   printf("\n");
   result_times.clear();
 }

 // Store the current processor time in start_time as the reference for timer().
 inline void start_timer()
 {
   start_time = clock();
 }

 // Seconds of processor time elapsed since start_time was last set.
 inline double timer()
 {
   const clock_t now = clock();
   const clock_t elapsed_ticks = now - start_time;
   return elapsed_ticks / double(CLOCKS_PER_SEC);
 }

 // Time number_of_times consecutive calls of f on [first, last) and append the
 // total elapsed seconds to result_times. Each call receives the number of
 // iterations still to come after it (number_of_times-1 down to 0).
 void run(Test f, element_t* first, element_t* last, int number_of_times)
 {
   start_timer();
   for (int remaining = number_of_times - 1; remaining >= 0; --remaining)
     {
       f(first, last, remaining);
     }
   const double elapsed = timer();
   result_times.push_back(elapsed);
 }

 // Shuffle [first, last) in place with a Fisher-Yates pass driven by rand().
 // The generator is reseeded with 0 on every call, so the same input always
 // yields the same permutation.
 //   first - pointer to the first element
 //   last  - pointer one past the final element
 void random_shuffle(double *first, double *last)
 {
   const size_t n = last - first;

   srand((unsigned int)0);

   // Zero or one element: nothing to swap. Without this guard an empty range
   // makes n-1 wrap around to SIZE_MAX and the loop divides by zero in
   // rand() % (i+1).
   if (n < 2) return;

   for (size_t i = n - 1; i > 0; --i)
     {
       // Pick a partner in [0, i] and swap it into position i.
       const size_t j = rand() % (i + 1);
       const double tmp = first[i];
       first[i] = first[j];
       first[j] = tmp;
     }
 }

 // Benchmark driver for hpsort1: copy [first, last) into a freshly allocated
 // scratch buffer, sort the copy, and free it. number_of_times is not used.
 void hpsort1_test(element_t* first, element_t* last, int number_of_times)
 {
   const size_t count = last - first;
   const size_t bytes = sizeof(element_t) * count;

   element_t* scratch = (element_t*)malloc(bytes);
   memcpy(scratch, first, bytes);

   hpsort1(count, scratch);

   free(scratch);
 }

 // Benchmark driver for hpsort2: copy [first, last) into a freshly allocated
 // scratch buffer, sort the copy, and free it. hpsort2 is handed scratch+1.
 // number_of_times is not used.
 void hpsort2_test(element_t* first, element_t* last, int number_of_times)
 {
   const size_t count = last - first;
   const size_t bytes = sizeof(element_t) * count;

   element_t* scratch = (element_t*)malloc(bytes);
   memcpy(scratch, first, bytes);

   hpsort2(count, scratch + 1);

   free(scratch);
 }

 // Fill [first, last) with the increasing sequence 0.0, 1.0, 2.0, ...
 void initialize(element_t* first, element_t* last)
 {
   element_t value = 0.;
   for (element_t* slot = first; slot != last; ++slot)
     {
       *slot = value;
       value += 1.;
     }
 }

 // Base-2 logarithm of x, computed as ln(x) / ln(2).
 double logtwo(double x)
 {
   const double ln_x = log(x);
   const double ln_two = log((double) 2.0);
   return ln_x / ln_two;
 }

 // Repetition count for arrays of `size` elements: the ratio of
 // largest_size*log2(largest_size) to size*log2(size), rounded down, so that
 // smaller arrays are sorted proportionally more often.
 int number_of_tests(int size)
 {
   const double n = (double)size;
   const double largest_n = (double)largest_size;

   const double reference_work = largest_n * logtwo(largest_n);
   const double work_per_run = n * logtwo(n);

   return int(floor(reference_work / work_per_run));
 }

 void run_tests(int size)
 {
   const int n = number_of_tests(size);

   element_t *buffer = (element_t *)malloc(size * sizeof(element_t));
   element_t* buffer_end = &buffer[size];

   initialize(buffer, buffer + size); // fill in the elements

   for (int i = 0; i < size/2; ++i) buffer[size/2 + i] = buffer[i]; // fill in the second half with values of the first half
   //random_shuffle(buffer, buffer_end); // shuffle if you do not want an ordered array

   run(hpsort2_test, buffer, buffer_end, n);
   run(hpsort1_test, buffer, buffer_end, n);

   do_row(size);

   free(buffer);
 }


 // Run the benchmark once for every entry of sizes, in order.
 int main()
 {
   const int count = sizeof(sizes)/sizeof(int);
   int idx = 0;
   while (idx < count)
     {
       run_tests(sizes[idx]);
       ++idx;
     }
 }

我编译并运行它为

g++ -O2 -c heapsort-benchmark.cpp
g++ -O2 -c hpsort.cpp
g++ -O2 -o heapsort-benchmark heapsort-benchmark.o hpsort.o 
./heapsort-benchmark

第一列将是更快的版本

最佳答案

无法像 OP 那样获得一致的结果。

在我看来，OP 观察到的微小差异并非源于代码上的差别，而是测试方式本身造成的。

// Sort routine that indexes data directly; the body is elided ("...") in
// this excerpt.
void hpsort(unsigned long n, double *data) {
  unsigned long i, ir, j, l;
  double rra;
  ...
}

// Same routine with the base pointer stepped back one element first; the test
// code below passes pointer+1 to compensate. The body is elided ("...") in
// this excerpt.
void hpsort1(unsigned long n, double *data) {
  --data;  // shift the base pointer back by one element
  unsigned long i, ir, j, l;
  double rra;
  ...
}

测试代码

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Time hpsort and hpsort1 on two identical copies of the same pseudo-random
// data and print both timings together with their relative difference.
//   s    - label printed at the start of the output line
//   code - nonzero: hpsort1 runs first, then hpsort; zero: hpsort runs first
//   n    - number of elements handed to each sort
// dt0 is always hpsort's time and dt1 always hpsort1's, whichever ran first.
void test(const char *s, int code, size_t n) {
  srand(0);
  double* array = (double*) malloc(sizeof(double) * n * 2);
  if (array == NULL) {
    // Allocation failed: report it rather than writing through a null pointer.
    fprintf(stderr, "%s: cannot allocate %lu doubles\n", s, (unsigned long) (n * 2));
    return;
  }
  // make 2 copies of same random data
  for (size_t i = 0; i < n; i++) {
    array[i] = rand();
    array[i+n] = array[i];
  }

  double dt0;
  double dt1;
  clock_t c0 = clock();
  clock_t c1,c2;
  if (code) {
    // hpsort1 first on the lower copy, then hpsort on the upper copy.
    hpsort1(n, array + 1);
    c1 = clock();
    hpsort(n, &array[n]);
    c2 = clock();
    dt0 = (double) (c2 - c1)/CLOCKS_PER_SEC;
    dt1 = (double) (c1 - c0)/CLOCKS_PER_SEC;
  } else {
    // hpsort first on the lower copy, then hpsort1 on the upper copy.
    hpsort(n, array);
    c1 = clock();
    hpsort1(n, &array[n]+1);
    c2 = clock();
    dt0 = (double) (c1 - c0)/CLOCKS_PER_SEC;
    dt1 = (double) (c2 - c1)/CLOCKS_PER_SEC;
  }
  free(array);
  const char *cmp = dt0==dt1 ? "==" : (dt0<dt1 ? "<" : ">");
  printf("%s %f %2s %f  Diff:% f%%\n", s, dt0, cmp,  dt1, 100*(dt1-dt0)/dt0);
}

// Run ten rounds; each round times the pair once with hpsort first and once
// with hpsort1 first, then flushes the output.
int main() {
  //srand((unsigned) time(0));
  const size_t n = 3000000;
  int round = 0;
  while (round < 10) {
    test("heap  first", 0, n);
    test("heap1 first", 1, n);
    fflush(stdout);
    ++round;
  }
}

输出

heap  first 1.263000  > 1.201000  Diff:-4.908947%
heap1 first 1.295000  < 1.326000  Diff: 2.393822%
heap  first 1.342000  > 1.295000  Diff:-3.502235%
heap1 first 1.279000  < 1.295000  Diff: 1.250977%
heap  first 1.279000 == 1.279000  Diff: 0.000000%
heap1 first 1.280000  > 1.279000  Diff:-0.078125%
heap  first 1.295000  > 1.294000  Diff:-0.077220%
heap1 first 1.280000  > 1.279000  Diff:-0.078125%
heap  first 1.279000 == 1.279000  Diff: 0.000000%
heap1 first 1.295000  > 1.279000  Diff:-1.235521%
heap  first 1.263000  < 1.295000  Diff: 2.533650%
heap1 first 1.280000  > 1.279000  Diff:-0.078125%
heap  first 1.295000  > 1.263000  Diff:-2.471042%
heap1 first 1.295000  < 1.310000  Diff: 1.158301%
heap  first 1.310000  < 1.326000  Diff: 1.221374%
heap1 first 1.326000  < 1.342000  Diff: 1.206637%
heap  first 1.279000 == 1.279000  Diff: 0.000000%
heap1 first 1.264000  < 1.295000  Diff: 2.452532%
heap  first 1.279000  > 1.264000  Diff:-1.172791%
heap1 first 1.279000  > 1.264000  Diff:-1.172791%

关于使用数组指针的 C++ 简单技巧可提高性能，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/38247957/

文章推荐： ios - addSubview 与呈现模态 ViewController 之间的区别？

文章推荐： html - 内容在 div 后面滚动，它应该在里面

文章推荐： javascript - Three.js 如何确定我的场景是否已加载

文章推荐： C++，如何用另一个类的线程更改另一个类的变量

jquery - 谷歌新闻，左导航 CSS 技巧？或 jQuery 技巧
我是新手。查看 Google 新闻...上下滚动页面时请注意左侧导航栏。看看它是如何滚动一点，然后在它消失之前粘在页面顶部的？关于如何做到这一点有什么想法吗？ jQuery 和 CSS 可以复制吗
改进性能和样式的24个ASP 技巧
技巧 1：在 Web 服务器上缓存常用数据技巧 2：在 Application 或 Session 对象中缓存常用数据技巧 3：在 Web 服务器磁盘上缓存数据和 HTML 技巧 4：避免
此任务的 Excel 技巧
我在 excel 中有一个电子表格，其中包含以下行: COLUMN Value1.Value2.Value3 Value4.Value5.Value6 Value7.Value8.Val
获取规则依赖项的 Makefile 技巧
GNU Makefile 中是否有任何技巧来获取规则的所有依赖项？例子: rule1: dep1_1 dep1_2 dep1_3 rule2: dep2_1 dep2_2 rule1 dump_
debugging - 追踪单点触控程序泄漏的技巧/技巧？
人们使用什么来追踪内存泄漏？我已经通过代码检查设法解决了一些问题，但我不知道下一步该做什么/当我的程序变大时我将如何管理问题。我知道我在泄漏什么类型的对象，但我不知道是什么让它保持活力。在 Wind
C#技巧——通过字符串获取常量值
有什么好的方法可以将“xlSum”、“xlAverage”和“xlCount”等字符串转换为它们在 Microsoft.Office.Interop.Excel.XlConsolidationFunc
Javascript 地址栏黑客/技巧
我们都见过这个: javascript:document.body.contentEditable='true'; document.designMode='on';无效 0 但我的问题是，这实际上是
类插件架构的 Java 技巧
我的应用程序将输出一个图形，其布局由用户定义。自定义布局类应该实现我定义的接口(interface)。我应该怎么做？有一个特殊的文件夹，我可以在其中查找布局类？用户是否将类名作为参数传递给应用？如有
Javascript DOM 技巧
我在弄清楚如何在 Javascript 中自引用表行时遇到了一些麻烦。这是简化的代码: $( "#listitems tbody" ).append( "" + "" + id.va
css 技巧，元素后面的行
关闭。这个问题需要更多focused .它目前不接受答案。想改进这个问题吗？更新问题，使其只关注一个问题 editing this post . 关闭 6 年前。 Improve this q
避免指针比较的 C++ 技巧
我正在将代码库从一种编程风格转移到另一种编程风格。我们有一个名为 Operand 的类型，定义如下: class Operand {...}; 然后我们有 class OperandFactory
显示缩略图时避免额外类的 CSS 技巧
我使用以下缩略图类在我的内容包装器中显示 4x3 缩略图: .thumbnail { float:left; width:300px; height:200px; ma
algorithm - 如何识别相似的声音 - 技巧
按照目前的情况，这个问题不适合我们的问答形式。我们希望答案得到事实、引用或专业知识的支持，但这个问题可能会引发辩论、争论、投票或扩展讨论。如果您觉得这个问题可以改进并可能重新打开，visit the
c++ - constexpr 技巧
我认为这是不可能的，但我想在放弃之前问问你。我想要类似 constexpr 增量的东西。 #include constexpr int inc() { static int inc = 0;
新手 C++ 技巧
是否有任何适合 C++ 新手的技术和描述的好列表。我在想一个描述 RAII、RVO、左值的列表……这适用于目前不了解这些技术或来自不适用这些技术的其他语言的新手。最好是短小精悍的:-) 最佳答案是
在字符串中查找前导零的 Python 技巧
我有一个二进制字符串 '01110000'，我想在不编写 forloop 的情况下返回前面的前导零数。有谁知道如何做到这一点？如果字符串立即以“1”开头，最好也返回 0 最佳答案如果您真的确定它是一
内存优化的 Python 技巧
我需要优化我的应用程序的 RAM 使用率。请省去那些告诉我在编写 Python 代码时不应该关心内存的讲座。我有内存问题，因为我使用非常大的默认字典(是的，我也想快点)。我目前的内存消耗是 350M
CSS 黑客(技巧)
有时，当我看到一个我喜欢的网站或来自受人尊敬的人的网站时，我会查看源代码并尝试理解它们(就像我们所有人一样)。关于 Jeremy Keiths他使用以下代码的网站: [role="navigatio
7个管理家目录的 Git 技巧
这是我怎样设置 Git 来管理我的家目录的方法。我有好几台电脑。一台笔记本电脑用于工作，一台工作站放在家里，一台树莓派（或四台），一台 Pocket CHIP，一台运行
15、HBase shell 技巧
shell 技巧表变量 HBase 0.95 版本增加了为表提供 jruby 风格的面向对象引用的 shell 命令。以前，作用于表的所有 shell 命令都具有程序风格，该风格始终将表的名称作

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

使用数组指针的 C++ 简单技巧可提高性能