gpt4 book ai didi

python - 如何根据前一行通过逐行计算改进 DataFrame 上的 for 循环?

转载 作者:行者123 更新时间:2023-12-05 04:36:39 24 4
gpt4 key购买 nike

我有一个带有 for 循环的工作代码,我想在速度方面对其进行优化(该项目已被拒绝,因为它很慢)。

这是我的情况:我有一个 Excel 文件,我从中将一个包含数千行和几列的表导入到 Pandas DataFrame 中。第一列是一系列单调递增的时间戳,频率为 15 分钟。

我必须使用此表来计算一些额外的列,将它们附加到原始表并将生成的 DataFrame 保存到新的 Excel 文件中。

使代码变慢的是额外列的核心计算。这是工作代码的片段:

import pandas as pd
from datetime import timedelta as td

cons_prod = pd.read_csv("sample.csv", index_col=0, parse_dates=True)

soc_dct = {} # State of charge (kW)
charge_dct = {} # Charge (kW)
discharge_dct = {} # Discharge (kW)
acc_dct = {} # Auto-consumption NEW (kW)
lst_dct = {} # Lost injection due to battery efficiency (kW)
inj_dct = {} # Injection NEW (kW)
gridcons_dct = {} # Grid Consumption NEW (kW)
agg_dct = {} # Additional Auto-consumption through battery (kW)

battery_parameters = {
"power": 50,
"energy": 130,
"efficiency": 0.9,
"minsoc": 0.1,
"soct0": 65.0,
}

bp_energy = battery_parameters["energy"]
bp_power = battery_parameters["power"]
soct0 = 0.5 * bp_energy

for t in cons_prod.index:
L = cons_prod.loc[t, "Injection (kW)"]
m = cons_prod.loc[t, "Grid Consumption (kW)"]
k = cons_prod.loc[t, "Auto-consumption (kW)"]
f = cons_prod.loc[t, "Consumption (kW)"]
if t == cons_prod.index[0]:
# State of charge (kW)
soc_dct.setdefault(t, soct0)

# Charge (kW)
charge_dct.setdefault(
t,
min(
L,
(bp_energy - soc_dct[t]) * 4,
bp_power,
)
* battery_parameters["efficiency"]
if L >= 0
else 0,
)

# Discharge (kW)
discharge_dct.setdefault(
t,
-min(
m,
4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
bp_power,
)
if m >= 0
else 0,
)

# Auto-consumption NEW (kW)
acc_dct.setdefault(t, k - discharge_dct[t])

# Lost injection due to battery efficiency (kW)
lst_dct.setdefault(
t,
(charge_dct[t] / battery_parameters["efficiency"]) - charge_dct[t],
)

# Injection NEW (kW)
inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])

# Grid Consumption NEW (kW)
gridcons_dct.setdefault(t, f - acc_dct[t])

# Additional Auto-consumption through battery (kW)
agg_dct.setdefault(t, acc_dct[t] - k)
else:
# State of charge (kW)
soc_dct.setdefault(
t,
soc_dct[t - td(minutes=15)]
+ (charge_dct[t - td(minutes=15)] + discharge_dct[t - td(minutes=15)]) / 4,
)

# Charge (kW)
charge_dct.setdefault(
t,
min(
L,
(bp_energy - soc_dct[t]) * 4,
bp_power,
)
* battery_parameters["efficiency"]
if L >= 0
else 0,
)

# Discharge (kW)
discharge_dct.setdefault(
t,
-min(
m,
4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
bp_power,
)
if m >= 0
else 0,
)

# Auto-consumption NEW (kW)
acc_dct.setdefault(t, k - discharge_dct[t])

# Lost injection due to battery efficiency (kW)
lst_dct.setdefault(
t, charge_dct[t] / battery_parameters["efficiency"] - charge_dct[t]
)

# Injection NEW (kW)
inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])

# Grid Consumption NEW (kW)
gridcons_dct.setdefault(t, f - acc_dct[t])

# Additional Auto-consumption through battery (kW)
agg_dct.setdefault(t, acc_dct[t] - k)
# Creating a DataFrame with all the values
output_df = pd.DataFrame(
data=[
soc_dct,
charge_dct,
discharge_dct,
acc_dct,
lst_dct,
inj_dct,
gridcons_dct,
agg_dct,
]
).T
output_df.columns = [
"State of charge (kW)",
"Charge (kW)",
"Discharge (kW)",
"Auto-consumption NEW (kW)",
"Lost injection due to battery efficiency (kW)",
"Injection NEW (kW)",
"Grid Consumption NEW (kW)",
"Additional Auto-consumption through battery (kW)",
]
charge_dct = {} # Charge (kW)
discharge_dct = {} # Discharge (kW)
acc_dct = {} # Auto-consumption NEW (kW)
lst_dct = {} # Lost injection due to battery efficiency (kW)
inj_dct = {} # Injection NEW (kW)
gridcons_dct = {} # Grid Consumption NEW (kW)
agg_dct = {} # Additional Auto-consumption through battery (kW)

for t in cons_prod.index:
L = cons_prod.loc[t, "Injection (kW)"]
m = cons_prod.loc[t, "Grid Consumption (kW)"]
k = cons_prod.loc[t, "Auto-consumption (kW)"]
f = cons_prod.loc[t, "Consumption (kW)"]
if t == cons_prod.index[0]:
# State of charge (kW)
soc_dct.setdefault(t, soct0)

# Charge (kW)
charge_dct.setdefault(
t,
min(
L,
(bp_energy - soc_dct[t]) * 4,
bp_power,
)
* battery_parameters["efficiency"]
if L >= 0
else 0,
)

# Discharge (kW)
discharge_dct.setdefault(
t,
-min(
m,
4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
bp_power,
)
if m >= 0
else 0,
)

# Auto-consumption NEW (kW)
acc_dct.setdefault(t, k - discharge_dct[t])

# Lost injection due to battery efficiency (kW)
lst_dct.setdefault(
t,
(charge_dct[t] / battery_parameters["efficiency"]) - charge_dct[t],
)

# Injection NEW (kW)
inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])

# Grid Consumption NEW (kW)
gridcons_dct.setdefault(t, f - acc_dct[t])

# Additional Auto-consumption through battery (kW)
agg_dct.setdefault(t, acc_dct[t] - k)
else:
# State of charge (kW)
soc_dct.setdefault(
t,
soc_dct[t - td(minutes=15)]
+ (charge_dct[t - td(minutes=15)] + discharge_dct[t - td(minutes=15)]) / 4,
)

# Charge (kW)
charge_dct.setdefault(
t,
min(
L,
(bp_energy - soc_dct[t]) * 4,
bp_power,
)
* battery_parameters["efficiency"]
if L >= 0
else 0,
)

# Discharge (kW)
discharge_dct.setdefault(
t,
-min(
m,
4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
bp_power,
)
if m >= 0
else 0,
)

# Auto-consumption NEW (kW)
acc_dct.setdefault(t, k - discharge_dct[t])

# Lost injection due to battery efficiency (kW)
lst_dct.setdefault(
t, charge_dct[t] / battery_parameters["efficiency"] - charge_dct[t]
)

# Injection NEW (kW)
inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])

# Grid Consumption NEW (kW)
gridcons_dct.setdefault(t, f - acc_dct[t])

# Additional Auto-consumption through battery (kW)
agg_dct.setdefault(t, acc_dct[t] - k)
# Creating a DataFrame with all the values
output_df = pd.DataFrame(
data=[
soc_dct,
charge_dct,
discharge_dct,
acc_dct,
lst_dct,
inj_dct,
gridcons_dct,
agg_dct,
]
).T
output_df.columns = [
"State of charge (kW)",
"Charge (kW)",
"Discharge (kW)",
"Auto-consumption NEW (kW)",
"Lost injection due to battery efficiency (kW)",
"Injection NEW (kW)",
"Grid Consumption NEW (kW)",
"Additional Auto-consumption through battery (kW)",
]

cons_prod 是导入到 DataFrame 中的表。

如您所见,我们有两种情况:当 t == cons_prod.index[0](即时间戳的第一项)时,计算使用相同的值 t。但是,从第二个时间戳开始,一些计算引用以前的值(这里指的是使用索引 t - td(minutes=15) 之前的 15 分钟)。

这就是我努力摆脱 for 循环的原因。

对可能出现的问题的一些解释

  • 问:为什么要使用字典?答:因为我认为它们比其他数据类型填充得更快,而且我可以稍后使用它们来创建 DataFrame。
  • 问:时间戳是否连贯,例如他们有缺失值吗?答:没有缺失值,因为我之前写了一个函数来确保时间戳被完全填充。
  • 问:计算目前不引用 DataFrame 中的前一行,而是引用字典中的前一行!为什么标题具有误导性?答:这是迄今为止我能想出的最好的解决方案,但我想知道我对 Pandas 不那么完整的了解是否隐藏了一个更简单、更快的解决方案。

希望框架清晰。

提前致谢!

编辑:根据要求,添加了一个 100-lines sample cons_prod 并修改了以前的代码以满足 MRE 的要求。

编辑 2:我尝试从字典转移到 Pandas 查找,尝试尽可能优化。这是我想出的代码:

from time import time as tt

cp = cons_prod.copy(deep=True)

# Initialise the columns filling them with zeroes
cp["State of charge (kW)"] = 0
cp["Charge (kW)"] = 0
cp["Discharge (kW)"] = 0

# Storing the position of the columns in variables
cp_soc = cp.columns.get_loc("State of charge (kW)")
cp_charge = cp.columns.get_loc("Charge (kW)")
cp_discharge = cp.columns.get_loc("Discharge (kW)")
cp_inj = cp.columns.get_loc("Injection (kW)")
cp_gridcons = cp.columns.get_loc("Grid Consumption (kW)")

# Storing the values of the battery dictionary lookups in variables
bp_energy = dct_bp["energy"]
bp_power = dct_bp["power"]
bp_efficiency = dct_bp["efficiency"]
bp_soct0 = dct_bp["soct0"]
bp_minsoc = dct_bp["minsoc"]

start1 = tt() # Measuring time
for row in cp.itertuples(name=None): # Using itertuples to gain some speed
L = cp.loc[row[0], "Injection (kW)"]
m = cp.loc[row[0], "Grid Consumption (kW)"]
k = cp.loc[row[0], "Auto-consumption (kW)"]
f = cp.loc[row[0], "Consumption (kW)"]
if row[0] == cp.index[0]:
cp.iloc[0, cp_soc] = bp_soct0
cp.iloc[0, cp_charge] = float(
min(L, (bp_energy - bp_soct0) * 4, bp_power) * bp_efficiency
if L >= 0
else 0,
)
cp.iloc[0, cp_discharge] = float(
-min(
m,
4 * bp_soct0 - 4 * (bp_minsoc * bp_energy),
bp_power,
)
if m >= 0
else 0
)
else:
t = pd.Index(cp.index).get_loc(row[0])
cp.iloc[t, cp_soc] = float(
cp.iloc[t - 1, cp_soc]
+ (cp.iloc[t - 1, cp_charge] + cp.iloc[t - 1, cp_discharge]) / 4
)
cp.iloc[t, cp_charge] = float(
min(L, (bp_energy - cp.iloc[t, cp_soc]) * 4, bp_power) * bp_efficiency
if L >= 0
else 0,
)
cp.iloc[t, cp_discharge] = float(
-min(
m,
4 * cp.iloc[t, cp_soc] - 4 * (dct_bp["minsoc"] * bp_energy),
bp_power,
)
if m >= 0
else 0
)
end1 = tt() - start1
print(f"Pandas lookup took {end1:.2f} seconds")

使用此代码,我完成每个任务的平均时间为 42 秒,而我过去使用字典的时间为 <20 秒。

最佳答案

从 Python 的角度来看,访问 dict 是很快的,但是,从 Pandas 的角度来看就不是这样了。 Pandas 的最佳特性之一是矢量化,这使得 Pandas 在大型数据集上表现非常高效。

什么是矢量化,为什么快? => What is "vectorization"?

另外,我想留下这个引用。当您在 Pandas 上工作时,请尝试按照链接中提到的顺序查找操作以优化流程。

https://stackoverflow.com/a/55557758/2956135

回到您的案例,您对大多数参数的计算仅取决于同一行的数据。这是您可以轻松进行矢量化的完美案例。

这三个参数(“State of charge”、“Charge”和“Discharge”相互依赖,取决于上一行的计算结果。这部分我没有找到矢量化的方法。希望如此,有人可以进一步优化它。

首先我取出常量字典并将其作为基本常量。这可能没有太大区别,但简单的引用比通过额外的步骤访问字典中的常量要好。如果您必须将多个常量传递给一个函数,将其放入字典并将其作为字典传递是有意义的,但这里不是这种情况。

power = 50
efficiency = 0.9
minsoc = 0.1
soct0 = 65.0
bp_energy = 130
bp_power = 50
soct0 = 0.5 * bp_energy

然后定义计算3个参数的函数,去掉其他参数的计算。

def _calc_soc_discharge(prev, L, m):
if prev is None:
soc = 65.0

# Charge (kW)
charge = min(
L,
(bp_energy - soc) * 4,
bp_power,
) * efficiency if L >= 0 else 0

# Discharge (kW)
discharge = -min(
m,
4 * soc - 4 * (minsoc * bp_energy),
bp_power,
) if m >= 0 else 0
return [soc, charge, discharge]
else:
soc = prev[0] + (prev[1] + prev[2]) / 4

# Charge (kW)
charge = min(
L,
(bp_energy - soc) * 4,
bp_power,
) * efficiency if L >= 0 else 0

# Discharge (kW)
discharge = -min(
m,
4 * soc - 4 * (minsoc * bp_energy),
bp_power,
) if m >= 0 else 0

return [soc, charge, discharge]

迭代调用此函数并将结果追加回数据框。

scd = []
for i, row in cons_prod[['Injection (kW)', 'Grid Consumption (kW)']].iterrows():
scd.append(_calc_soc_discharge(None if len(scd) == 0 else scd[-1], row['Injection (kW)'], row['Grid Consumption (kW)']))

cons_prod = pd.concat([cons_prod, pd.DataFrame(scd, columns=["State of charge (kW)", "Charge (kW)", "Discharge (kW)"], index=cons_prod.index)], axis=1)

现在,此数据框具有计算其他参数所需的所有数据。这里我们使用矢量化。这部分应该真正优化。使用 Pandas Series 调用的数学运算是矢量化的一部分。

cons_prod['Auto-consumption NEW (kW)'] = cons_prod['Auto-consumption (kW)'] - cons_prod['Discharge (kW)']
cons_prod['Lost injection due to battery efficiency (kW)'] = cons_prod['Charge (kW)'] / efficiency - cons_prod['Charge (kW)']
cons_prod['Injection NEW (kW)'] = cons_prod['Injection (kW)'] - cons_prod['Charge (kW)'] - cons_prod['Lost injection due to battery efficiency (kW)']
cons_prod['Grid Consumption NEW (kW)'] = cons_prod['Consumption (kW)'] - cons_prod['Auto-consumption NEW (kW)']
cons_prod['Additional Auto-consumption through battery (kW)'] = cons_prod['Auto-consumption NEW (kW)'] - cons_prod['Auto-consumption (kW)']

基准

在我的笔记本电脑上使用示例数据。

Original solution: 48.2 ms ± 6.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
With vectorization: 13.8 ms ± 2.13 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

关于python - 如何根据前一行通过逐行计算改进 DataFrame 上的 for 循环?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/70784452/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com