gpt4 book ai didi

python - 使用 itertools.groupby 进行 NumPy 分组的性能

转载 作者:IT老高 更新时间:2023-10-28 22:17:31 31 4
gpt4 key购买 nike

我有许多包含重复项的大型(超过 35,000,000 个元素)整数列表,需要统计列表中每个整数出现的次数。以下代码可以工作,但似乎很慢。有人能用 Python 和 NumPy 改进这个基准测试的结果吗?

def group(size=35000000, high=1 << 32):
    """Count duplicates in a sorted random array with ``itertools.groupby``.

    Draws ``size`` random integers in ``[0, high)``, stores them as ``u4``,
    sorts them in place, and collapses every run of equal values into one
    ``(value, count)`` record.

    Args:
        size: Number of random integers to draw.
        high: Exclusive upper bound of the drawn integers. It must not
            exceed ``1 << 32`` because the values are stored as ``u4``.

    Returns:
        Structured array of dtype ``'u4,u2'``. Field ``f0`` holds each
        distinct value in ascending order and ``f1`` its number of
        occurrences. ``f1`` is 16-bit, so a count above 65535 does not fit.
    """
    import numpy as np
    from itertools import groupby

    # Request int64 explicitly: where NumPy's default integer is 32-bit,
    # an upper bound of 1 << 32 is rejected as out of range.
    values = np.random.randint(0, high, size=size, dtype=np.int64).astype('u4')
    values.sort()
    # groupby only merges adjacent equal items, hence the sort above.
    groups = ((k, len(list(g))) for k, g in groupby(values))
    return np.fromiter(groups, dtype='u4,u2')

if __name__ == '__main__':
    from timeit import Timer

    # Time one run of group(); the setup string imports it from this script.
    t = Timer("group()", "from __main__ import group")
    # A single-argument print(...) prints the same on Python 2 and Python 3.
    print(t.timeit(number=1))

返回:

$ python bench.py
111.377498865

根据回复:

def group_original(size=35000000, high=1 << 32):
    """Baseline: count duplicates with ``groupby`` and ``len(list(g))``.

    Draws ``size`` random integers in ``[0, high)``, stores them as ``u4``,
    sorts them in place, and collapses every run of equal values into one
    ``(value, count)`` record. Each run is materialised as a list just to
    take its length; that is the cost this baseline measures.

    Args:
        size: Number of random integers to draw.
        high: Exclusive upper bound of the drawn integers. It must not
            exceed ``1 << 32`` because the values are stored as ``u4``.

    Returns:
        Structured array of dtype ``'u4,u2'``. Field ``f0`` holds each
        distinct value in ascending order and ``f1`` its number of
        occurrences. ``f1`` is 16-bit, so a count above 65535 does not fit.
    """
    import numpy as np
    from itertools import groupby

    # Request int64 explicitly: where NumPy's default integer is 32-bit,
    # an upper bound of 1 << 32 is rejected as out of range.
    values = np.random.randint(0, high, size=size, dtype=np.int64).astype('u4')
    values.sort()
    # groupby only merges adjacent equal items, hence the sort above.
    groups = ((k, len(list(g))) for k, g in groupby(values))
    return np.fromiter(groups, dtype='u4,u2')

def group_gnibbler(size=35000000, high=1 << 32):
    """Count duplicates with ``groupby``, counting runs without a list.

    Same pipeline as the ``len(list(g))`` baseline, except that each run is
    counted with ``sum(1 for _ in g)`` so no per-run list is built.

    Args:
        size: Number of random integers to draw.
        high: Exclusive upper bound of the drawn integers. It must not
            exceed ``1 << 32`` because the values are stored as ``u4``.

    Returns:
        Structured array of dtype ``'u4,u2'``. Field ``f0`` holds each
        distinct value in ascending order and ``f1`` its number of
        occurrences. ``f1`` is 16-bit, so a count above 65535 does not fit.
    """
    import numpy as np
    from itertools import groupby

    # Request int64 explicitly: where NumPy's default integer is 32-bit,
    # an upper bound of 1 << 32 is rejected as out of range.
    values = np.random.randint(0, high, size=size, dtype=np.int64).astype('u4')
    values.sort()
    # groupby only merges adjacent equal items, hence the sort above.
    groups = ((k, sum(1 for _ in g)) for k, g in groupby(values))
    return np.fromiter(groups, dtype='u4,u2')

def group_christophe(size=35000000, high=1 << 32):
    """Count duplicates in a sorted random array with ``searchsorted``.

    Draws ``size`` random integers in ``[0, high)``, stores them as ``u4``,
    sorts them in place, and for every distinct value takes the distance
    between its right and left insertion points as its count.

    Args:
        size: Number of random integers to draw.
        high: Exclusive upper bound of the drawn integers. It must not
            exceed ``1 << 32`` because the values are stored as ``u4``.

    Returns:
        Structured array of dtype ``'u4,u2'`` with one row per distinct
        value. Field ``f0`` holds the value (ascending) and ``f1`` its number
        of occurrences. ``f1`` is 16-bit, so a count above 65535 does not
        fit.
    """
    import numpy as np

    # Request int64 explicitly: where NumPy's default integer is 32-bit,
    # an upper bound of 1 << 32 is rejected as out of range.
    values = np.random.randint(0, high, size=size, dtype=np.int64).astype('u4')
    values.sort()

    # Searching for every element produces one row per element, so repeated
    # values appear repeatedly in the result. Search only for the first
    # element of each run to get one row per distinct value.
    first = np.ones(len(values), dtype=bool)
    first[1:] = values[1:] != values[:-1]
    keys = values[first]

    counts = (values.searchsorted(keys, side='right')
              - values.searchsorted(keys, side='left'))
    index = np.zeros(len(keys), dtype='u4,u2')
    index['f0'] = keys
    index['f1'] = counts
    return index

def group_paul(size=35000000, high=1 << 32):
    """Count duplicates in a sorted random array from run boundaries.

    Draws ``size`` random integers in ``[0, high)``, stores them as ``u4``,
    sorts them in place, locates the start index of every run of equal
    values, and takes the gaps between consecutive starts as the counts.

    Args:
        size: Number of random integers to draw.
        high: Exclusive upper bound of the drawn integers. It must not
            exceed ``1 << 32`` because the values are stored as ``u4``.

    Returns:
        Structured array of dtype ``'u4,u2'`` with one row per distinct
        value. Field ``f0`` holds the value (ascending) and ``f1`` its number
        of occurrences. ``f1`` is 16-bit, so a count above 65535 does not
        fit.
    """
    import numpy as np

    # Request int64 explicitly: where NumPy's default integer is 32-bit,
    # an upper bound of 1 << 32 is rejected as out of range.
    values = np.random.randint(0, high, size=size, dtype=np.int64).astype('u4')
    values.sort()

    # Non-zero entries mark the first element of each run. The array is sized
    # to ``values`` (element 0 always starts a run), so empty input yields no
    # boundaries instead of a stray one that would index past the end.
    diff = np.ones(len(values), dtype=values.dtype)
    diff[1:] = np.diff(values)

    # Run starts, with len(values) appended to close the last run.
    idx = np.concatenate((np.flatnonzero(diff), [len(values)]))
    index = np.empty(len(idx) - 1, dtype='u4,u2')
    index['f0'] = values[idx[:-1]]
    index['f1'] = np.diff(idx)
    return index

if __name__ == '__main__':
    from timeit import Timer

    # (function name, label printed in the report) for each implementation.
    timings = [
        ("group_original", "Original"),
        ("group_gnibbler", "Gnibbler"),
        ("group_christophe", "Christophe"),
        ("group_paul", "Paul"),
    ]
    for method, title in timings:
        # Time one run; the setup string imports the function from this script.
        t = Timer("%s()" % method, "from __main__ import %s" % method)
        # A single-argument print(...) prints the same on Python 2 and 3.
        print("%s: %s secs" % (title, t.timeit(number=1)))

返回:

$ python bench.py
Original: 113.385262966 secs
Gnibbler: 71.7464978695 secs
Christophe: 27.1690568924 secs
Paul: 9.06268405914 secs

不过,Christophe 的方法目前给出的结果不正确。

最佳答案

通过这样的操作,我得到了三倍的改进:

def group(size=35000000, high=3298):
    """Count duplicates in a sorted random array from run boundaries.

    Draws ``size`` random integers in ``[0, high)``, stores them as ``u4``,
    sorts them in place, marks the first element of every run of equal
    values, and derives each run's length from the gaps between those marks.

    Args:
        size: Number of random integers to draw.
        high: Exclusive upper bound of the drawn integers. It must not
            exceed ``1 << 32`` because the values are stored as ``u4``.

    Returns:
        Tuple ``(vals, count)`` of 1-D arrays of equal length: ``vals`` holds
        the distinct values in ascending order and ``count`` the number of
        occurrences of each.
    """
    import numpy as np

    # Request int64 explicitly: where NumPy's default integer is 32-bit,
    # a large upper bound may be rejected as out of range.
    values = np.random.randint(0, high, size=size, dtype=np.int64).astype('u4')
    values.sort()

    # dif is non-zero exactly where a new run starts (element 0 always does).
    dif = np.ones(values.shape, values.dtype)
    dif[1:] = np.diff(values)

    # flatnonzero gives a plain 1-D index array. np.where would return a
    # tuple, which np.diff turns into a 2-D result.
    idx = np.flatnonzero(dif)
    vals = values[idx]
    # Append len(values) so the final run gets a count as well.
    count = np.diff(np.append(idx, len(values)))
    return vals, count

关于 python - 使用 itertools.groupby 进行 NumPy 分组的性能,我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/4651683/

31 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com