gpt4 book ai didi

common-lisp - 从 Common Lisp/SBCL 中获得更高的速度

转载 作者:行者123 更新时间:2023-12-02 22:28:04 24 4
gpt4 key购买 nike

这篇论文(This paper)声称可以让某个 Lisp 程序运行得比等效的 C 程序更快。我尝试重现该结果,能够做到比较接近(Lisp 比 C 慢约 50%),但想知道是否有人知道如何从 SBCL 1.3.1 中榨取更多性能。

目标问题是给一个 800 x 800 的单精度浮点(single-float)数组中的每个单元格加上一个单精度浮点常量。方法是分别用 C 和 Common Lisp 编写程序,然后比较运行时间。使用这个 portable timer,C 代码如下:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>

#include "./modules/tictoc/tictoc.h"

/* Image dimensions; main passes HORZ as the width and VERT as the height. */
const int HORZ = 800;
const int VERT = 800;

/* Number of add() passes main runs between tic() and toc(). */
#define PERF_REPS 1000

/* Element type stored in every image cell. */
typedef float DATA_T;

/*
 * A heap-allocated 2-D image stored as one flat buffer.
 * `image` is the typedef name for `struct image_s`.
 */
typedef struct image_s {
    size_t n;       /* total number of cells (make_image sets it to w * h) */
    size_t w;       /* width in cells */
    size_t h;       /* height in cells */
    DATA_T *data;   /* flat buffer holding the n cells */
} image;

image * make_image (size_t w, size_t h) {
size_t n = w * h;
DATA_T * data = (DATA_T *)malloc(sizeof(DATA_T) * n);
assert (NULL != data);
image * result = (image *)malloc(sizeof(image));
assert (NULL != result);
result->n = n;
result->w = w;
result->h = h;
result->data = data;
return result;
}

/*
 * Release a heap-allocated image: first its cell buffer, then the image
 * descriptor itself. Both `it` and `it->data` are asserted to be non-NULL.
 */
void free_image (image * it) {
    assert (NULL != it);
    assert (NULL != it->data);

    DATA_T *cells = it->data;
    free (cells);
    free (it);
}

/*
 * Set every cell of `it` to `val`.
 *
 * Both `it` and `it->data` are asserted to be non-NULL. Returns `it` so the
 * call can be chained around the expression that produced the image.
 */
image * init_to_value (image * it, DATA_T val) {
    assert (NULL != it);
    assert (NULL != it->data);

    /* Walk the flat buffer from the first cell to one past the last. */
    DATA_T *cell = it->data;
    DATA_T *const end = cell + it->n;
    while (cell != end) {
        *cell++ = val;
    }
    return it;
}

/*
 * For every cell index i of `to`, store from->data[i] + val in to->data[i].
 *
 * Preconditions (checked with assert): both images and their buffers are
 * non-NULL, and `from` holds at least as many cells as `to`. The number of
 * cells processed is taken from `to`.
 */
void add (image * to, image * from, DATA_T val) {
    assert (NULL != to);
    assert (NULL != to->data);
    assert (NULL != from);
    assert (NULL != from->data);
    /* The loop reads to->n cells from `from`; a shorter source would be
     * read out of bounds. */
    assert (from->n >= to->n);

    const size_t n = to->n;
    DATA_T *dst = to->data;
    const DATA_T *src = from->data;
    for (size_t i = 0; i < n; ++i) {
        dst[i] = src[i] + val;
    }
}

/*
 * Benchmark driver: allocate two HORZ x VERT images filled with 0.0f, run
 * add() PERF_REPS times between tic() and toc(), print the elapsed time, and
 * release both images.
 */
int main (int argc, char ** argv) {
    /* Command-line arguments are not used. */
    (void) argc;
    (void) argv;

    image *source = init_to_value (make_image (HORZ, VERT), 0.0f);
    image *target = init_to_value (make_image (HORZ, VERT), 0.0f);

    TicTocTimer timer = tic();
    for (size_t remaining = PERF_REPS; remaining > 0; --remaining) {
        add (target, source, 42.0f);
    }
    printf("Elapsed time %f seconds.\n",toc(&timer));

    free_image (target);
    free_image (source);
    return 0;
}

我编译并运行代码如下:

gcc -O3 image-add.c ./modules/tictoc/libtictoc.a && ./a.out

我的 mac book pro 上的典型时间约为 0.178 秒。相当不错。

等效的 Lisp 代码如下,其中用上了我在 Lisp HyperSpec、新书《Common Lisp Recipes》以及 SBCL user manual 中能找到的每一个选项。注释标出了我尝试过、但没有带来任何差别的一些做法。

;;; None of the commented-out declarations below made any difference in speed.

;;; Global optimization policy: optimize for speed and set safety to 0.
(declaim (optimize speed (safety 0)))

;;; Store FROM[i] + VAL into TO[i] for every index i of TO.
;;; TO and FROM are declared as one-dimensional simple arrays of single-float
;;; and VAL as a single-float. The iteration count comes from TO alone, so
;;; FROM is expected to be at least as long as TO.
(defun image-add (to from val)
(declare (type (simple-array single-float (*))
to from))
(declare (type single-float val))
(let ((size (array-dimension to 0)))
;; Tried without effect: declaring SIZE as a fixnum.
;(declare (type fixnum size))
(dotimes (i size)
;; Tried without effect: declaring the loop index as a fixnum.
;(declare (type fixnum i))
(setf (aref to i) (+ (aref from i) val)))))

;;; Image dimensions; the benchmark arrays hold (* HORZ VERT) elements.
(defparameter HORZ 800)
(defparameter VERT 800)

;;; Number of IMAGE-ADD calls executed inside the TIME form.
(defparameter PERF-REPS 1000)

;;; Benchmark driver: build two single-float vectors of (* HORZ VERT) elements
;;; and time PERF-REPS calls of IMAGE-ADD from FM into TO.
;;;
;;; :INITIAL-ELEMENT is given explicitly. Without it the initial contents of
;;; the arrays are undefined, and IMAGE-ADD reads every element of FM.
(let ((to (make-array (* HORZ VERT) :element-type 'single-float
                                    :initial-element 0f0))
      (fm (make-array (* HORZ VERT) :element-type 'single-float
                                    :initial-element 0f0)))
  ;; Tried without effect: the commented-out declarations below.
  ;(declare (type fixnum HORZ))
  ;(declare (type fixnum VERT))
  (time (dotimes (_ PERF-REPS)
          ;(declare (type fixnum PERF-REPS))
          ;(declare (type fixnum _))
          ;(declare (inline image-add))
          ;; 42f0 is read as a single-float whatever the value of
          ;; *READ-DEFAULT-FLOAT-FORMAT*, matching IMAGE-ADD's declared VAL type.
          (image-add to fm 42f0))))

我编译并运行它如下:

sbcl --script image-perf.lisp

典型的运行时间是 0.276 秒。不错,但我希望能更好。当然,这个练习的重点在于 Lisp 代码更短,但是有人知道怎样让它更快,或者至少同样快吗?

最佳答案

以下是对代码稍作修改后得到的一些结果,供参考。

C 版本

C 版本平均需要 0.197s

Lisp版本

;;; Global optimization policy: speed 3, debug 0, safety 0.
(declaim (optimize (speed 3) (debug 0) (safety 0)))

;;; The arrays in TEST hold (* HORZ VERT) elements; PERF-REPS is the number of
;;; MAP-INTO passes executed inside the TIME form.
(defconstant HORZ 800)
(defconstant VERT 800)
(defconstant PERF-REPS 1000)

;;; Benchmark: allocate two zero-filled single-float vectors of (* HORZ VERT)
;;; elements, then time PERF-REPS passes that write SOURCE[i] + 42f0 into
;;; TARGET[i] using MAP-INTO.
(defun test ()
;; #1= labels the MAKE-ARRAY form at read time and #1# inserts that same form
;; again, so the form is evaluated twice and TARGET and SOURCE are two
;; distinct arrays.
(let ((target #1=(make-array (* HORZ VERT)
:element-type 'single-float
:initial-element 0f0))
(source #1#))
(declare (type (simple-array single-float (*)) target source))
;; Only the repeated MAP-INTO passes are timed; allocation happens above.
(time
(dotimes (_ PERF-REPS)
(map-into target
(lambda (x)
(declare (single-float x))
(the single-float (+ x 42f0)))
source)))))

这是输出:

Evaluation took:                                                                                                 
0.372 seconds of real time
0.372024 seconds of total run time (0.368023 user, 0.004001 system)
100.00% CPU
965,075,988 processor cycles
0 bytes consed

用 lparallel:pmap-into 替换 map-into,在使用由 4 个工作线程组成的内核(kernel)时耗时最短,结果如下:

Evaluation took:                                                                                                 
0.122 seconds of real time
0.496031 seconds of total run time (0.492030 user, 0.004001 system)
406.56% CPU
316,445,789 processor cycles
753,280 bytes consed

注意内存使用情况的差异。

关于common-lisp - 从 Common Lisp/SBCL 中获得更高的速度,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/34999052/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com