gpt4 book ai didi

python - 如何加快 Pandas rolling_sum() 的速度?

转载 作者:太空宇宙 更新时间:2023-11-04 10:48:24 25 4
gpt4 key购买 nike

我有一个应用程序需要计算 Pandas 多级数据帧的滚动总和,我想找到一种方法来缩短处理时间。

mul_df() 是创建演示多级数据帧的函数。

import itertools
import numpy as np
import pandas as pd

def mul_df(level1_rownum, level2_rownum, col_num):
    """Create a demo DataFrame with a two-level row MultiIndex.

    The index has levels named 'IDX_1' (values 'A0000'..'A<level1_rownum-1>',
    zero-padded to 4 digits) and 'IDX_2' (values 'B000'..'B<level2_rownum-1>',
    zero-padded to 3 digits, repeated once per IDX_1 value). Column labels are
    'COL000'..'COL<col_num-1>'. Cell values are standard-normal random numbers.

    Parameters
    ----------
    level1_rownum : int
        Number of distinct values in the first (outer) index level.
    level2_rownum : int
        Number of distinct values in the second (inner) index level.
    col_num : int
        Number of data columns.

    Returns
    -------
    pandas.DataFrame
        Frame of shape (level1_rownum * level2_rownum, col_num) indexed by
        the ('IDX_1', 'IDX_2') MultiIndex.
    """
    index_name = ['IDX_1', 'IDX_2']
    col_name = ['COL' + str(x).zfill(3) for x in range(col_num)]

    # Outer level: each 'Axxxx' label is repeated once per inner-level row,
    # then the per-label sublists are flattened into one long list.
    first_level_dt = [['A' + str(x).zfill(4)] * level2_rownum
                      for x in range(level1_rownum)]
    first_level_dt = list(itertools.chain(*first_level_dt))
    # Inner level: the full 'Bxxx' cycle is tiled once per outer label.
    second_level_dt = ['B' + str(x).zfill(3)
                       for x in range(level2_rownum)] * level1_rownum

    dt = pd.DataFrame(np.random.randn(level1_rownum * level2_rownum, col_num),
                      columns=col_name)
    dt[index_name[0]] = first_level_dt
    dt[index_name[1]] = second_level_dt

    # Promote the two label columns to a MultiIndex; drop them from the data.
    rst = dt.set_index(index_name, drop=True, inplace=False)
    return rst

例如:

>>> df = mul_df(4,5,3)
COL000 COL001 COL002
IDX_1 IDX_2
A0000 B000 0.2317 -0.6122 0.2289
B001 -0.9218 -0.2918 1.7295
B002 0.1368 0.6659 -1.9193
B003 0.3839 -0.8542 -0.3065
B004 2.0361 -0.4601 1.1246
A0001 B000 0.3039 -0.6761 1.3762
B001 1.1767 0.8465 -0.1745
B002 0.4937 1.6774 -0.3038
B003 -0.3627 -1.6413 -0.7373
B004 -0.0149 1.5900 0.3385
A0002 B000 0.0326 0.2637 1.7990
B001 -0.1071 0.6097 -0.2812
B002 -0.2199 0.7360 1.9425
B003 -1.0423 0.6763 -0.2479
B004 -0.9024 0.3016 -2.7585
A0003 B000 0.2550 0.0470 0.6849
B001 0.5986 0.3283 1.6327
B002 0.8929 -1.1128 -0.9495
B003 -0.5633 1.7935 0.1652
B004 1.0417 -0.4833 0.3413

并使用下面的命令计算每个列数据组的滚动总和(窗口大小为 4)。(注:`pd.rolling_sum` 在新版 pandas 中已被移除,等价写法为 `df.groupby(level='IDX_1').apply(lambda x: x.rolling(4).sum())`。)

>>> df.groupby(level='IDX_1').apply(lambda x: pd.rolling_sum(x,4))
COL000 COL001 COL002
IDX_1 IDX_2
A0000 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 -0.1694 -1.0923 -0.2675
B004 1.6350 -0.9402 0.6282
A0001 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 1.6116 0.2064 0.1606
B004 1.2928 2.4726 -0.8771
A0002 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 -1.3367 2.2857 3.2125
B004 -2.2717 2.3236 -1.3451
A0003 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 1.1832 1.0559 1.5334
B004 1.9699 0.5256 1.1898
>>>

然后我尝试计算一个大数据帧的 rolling_sum()

In [1]: df = mul_df(1000,25,1000)
In [2]: timeit df.groupby(level='IDX_1').apply(lambda x: pd.rolling_sum(x,4))
1 loops, best of 3: 52.1 s per loop

(1000*25, 1000) 数据帧花费 52.1 秒。如何加速rolling_sum(我的意思是有没有其他方法可以达到相同的计算结果但花费更少的时间)?

编辑(为waitingkuo的解决方案添加内存错误信息)

In [1]: df = mul_df(1000,25,1000)

In [2]: k2 = df.frs(4)
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-2-1b54b2662162> in <module>()
----> 1 k2 = df.frs(4)

F:\STK Analysis\Kits\Dev_Tools\FinReporter\FM_CORE.pyc in wrapped(*args, **kwargs)
149 from datetime import datetime
150 t1 = datetime.now()
--> 151 rst = fn(*args, **kwargs)
152 t2 = datetime.now()
153 print "Time: %0.3f"%((t2-t1).seconds + (t2-t1).microseconds/1000000.0)

F:\STK Analysis\Kits\Dev_Tools\FinReporter\FM_CORE.pyc in _frs(df, n)
864 ''' fast_rolling_sum , http://stackoverflow.com/questions/15652343/how-to-speed-up-pandas-rolling-sum '''
865 grp = df.groupby(level='STK_ID')
--> 866 return np.sum([grp.shift(i) for i in range(n)])
867 DataFrame.frs = _frs
868

D:\Python\lib\site-packages\pandas\core\groupby.pyc in wrapper(*args, **kwargs)
259 return self.apply(curried_with_axis)
260 except Exception:
--> 261 return self.apply(curried)
262
263 return wrapper

D:\Python\lib\site-packages\pandas\core\groupby.pyc in apply(self, func, *args, **kwargs)
320 func = _intercept_function(func)
321 f = lambda g: func(g, *args, **kwargs)
--> 322 return self._python_apply_general(f)
323
324 def _python_apply_general(self, f):

D:\Python\lib\site-packages\pandas\core\groupby.pyc in _python_apply_general(self, f)
323
324 def _python_apply_general(self, f):
--> 325 keys, values, mutated = self.grouper.apply(f, self.obj, self.axis)
326
327 return self._wrap_applied_output(keys, values,

D:\Python\lib\site-packages\pandas\core\groupby.pyc in apply(self, f, data, axis, keep_internal)
583 if hasattr(splitter, 'fast_apply') and axis == 0:
584 try:
--> 585 values, mutated = splitter.fast_apply(f, group_keys)
586 return group_keys, values, mutated
587 except lib.InvalidApply:

D:\Python\lib\site-packages\pandas\core\groupby.pyc in fast_apply(self, f, names)
2136 return [], True
2137
-> 2138 sdata = self._get_sorted_data()
2139 results, mutated = lib.apply_frame_axis0(sdata, f, names, starts, ends)
2140

D:\Python\lib\site-packages\pandas\core\groupby.pyc in _get_sorted_data(self)
2103
2104 def _get_sorted_data(self):
-> 2105 return self.data.take(self.sort_idx, axis=self.axis)
2106
2107 def _chop(self, sdata, slice_obj):

D:\Python\lib\site-packages\pandas\core\frame.pyc in take(self, indices, axis)
2900 new_values = com.take_2d(self.values,
2901 com._ensure_int64(indices),
-> 2902 axis=axis)
2903 if axis == 0:
2904 new_columns = self.columns

D:\Python\lib\site-packages\pandas\core\common.pyc in take_2d(arr, indexer, out, mask, needs_masking, axis, fill_value)
426 elif dtype_str in ('float64', 'object', 'datetime64[ns]'):
427 if out is None:
--> 428 out = np.empty(out_shape, dtype=arr.dtype)
429 take_f = _get_take2d_function(dtype_str, axis=axis)
430 take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value)

MemoryError:

In [3]:

最佳答案

先移位再相加如何?

In [223]: def my_rolling_sum(d, n):
   .....:     g = d.groupby(level='IDX_1')
   .....:     return np.sum([g.shift(i) for i in range(n)])
   .....:

让我们看看性能:

In [224]: df = mul_df(1000,25,1000)

In [225]: timeit df.groupby(level='IDX_1').apply(lambda x: pd.rolling_sum(x,4))
1 loops, best of 3: 32.4 s per loop

In [230]: timeit my_rolling_sum(df, 4)
1 loops, best of 3: 7.15 s per loop

编辑

由于上面的写法太耗内存(需要同时保留 n 份移位副本),我对它做了一点修改,改为逐步累加以降低峰值内存:

In [5]: def my_rolling_sum(d, n):
   ...:     g = d.groupby(level='IDX_1')
   ...:     result = g.shift(0)
   ...:     for i in range(1, n):
   ...:         result = result + g.shift(i)
   ...:     return result
   ...:

希望对你有帮助。

关于python - 如何加快 Pandas rolling_sum() 的速度?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/15652343/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com