This looks like a partial powerset
:
这看起来像是一个部分动力装置:
from itertools import chain, combinations
# modified powerset recipe
def powerset(iterable, min=0):
s = list(iterable)
return chain.from_iterable(combinations(s, r) for r in range(min, len(s)+1))
group = ['county', 'co']
tmp = df.groupby(['tech', *group])['loc'].agg(set)
out = []
for k, g in tmp.groupby(group):
for x in map(list, powerset(g.index.get_level_values('tech'), min=2)):
out.append(dict(zip(group, k))
|{'tech': ', '.join(x),
'log': (s:=set().union(*g.loc[x])),
'log-nunique': len(s),
})
out = pd.DataFrame(out)
NB. this uses the walrus operator (:=
, python ≥3.8) and dictionary union ({}|{}
, python ≥3.9), none of which are strictly required, the code can be adapted for older python versions*.
注意:这使用了海象运算符(:=,PYTHON≥3.8)和字典并集({}|{},PYTHON≥3.9),这些都不是严格要求的,代码可以适应较旧的PYTHON版本*。
Output:
产出:
county co tech log log-nunique
0 Albany LU Fiber, cable {o, n, m, r, q, w, p} 7
1 Albany LU Fiber, copper {o, n, s, t, w} 5
2 Albany LU cable, copper {o, n, m, s, r, q, t, p} 8
3 Albany LU Fiber, cable, copper {o, n, m, s, r, q, t, w, p} 9
4 Laramie LU cable, copper {b, a, c, e, f, g, d} 7
* code for older python:
*针对较旧的Python的代码:
group = ['county', 'co']
tmp = df.groupby(['tech', 'county', 'co'])['loc'].agg(set)
out = []
for k, g in tmp.groupby(['county', 'co']):
for x in map(list, powerset(g.index.get_level_values('tech'), min=2)):
out.append({'county': k[0],
'co': k[1],
'tech': ', '.join(x),
'log': set().union(*g.loc[x]),
})
out[-1]['log-nunique'] = len(out[-1]['log'])
out = pd.DataFrame(out)
This is possible in a nearly-fully-vectorised way. Vectorisation breaks down when you want lists in cells, which is generally a bad idea in Pandas; nevertheless:
这可以以几乎完全矢量化的方式实现。当你想在单元格中列出列表时,矢量化会崩溃,这在Pandas中通常是一个坏主意;然而:
import cProfile
import pstats
from timeit import timeit
import seaborn as sns
import numpy as np
import pandas as pd
import itertools
from matplotlib import pyplot as plt
def op_data() -> pd.DataFrame:
"""Sample data more or less the same as OP"""
return pd.DataFrame({
'county': ['Laramie']*10 + ['Albany']*12,
'co': ['LU']*22,
'tech': ['cable']*6 + ['copper']*4 + ['cable']*6 + ['copper']*4 + ['fibre']*2,
'loc': list('abcdefdefgmnopqrnostow'),
})
def big_data(
rand: np.random.Generator,
n: int = 2_000,
group_frac: int = 0.1,
n_locs: int = 26,
tech_dist: float = 0.33,
) -> pd.DataFrame:
techs = np.full(shape=n, fill_value='copper')
tech_asn = rand.random(size=n)
techs[tech_asn > tech_dist] = 'cable'
techs[tech_asn > tech_dist*0.5 + 0.5] = 'fibre'
group_size = group_frac*n
return pd.DataFrame({
'county': np.linspace(start=0, stop=n/10, num=n, dtype=int),
'co': np.linspace(start=0, stop=n/group_size, num=n, dtype=int),
'tech': techs,
'loc': rand.integers(low=0, high=n_locs, size=n),
})
def notna_tuple(col: pd.Series) -> tuple:
return tuple(col.dropna())
def agg_tech(df: pd.DataFrame) -> pd.DataFrame:
# One-hot boolean columns for tech assignments
tech_vals = df['tech'].unique()
df[tech_vals] = pd.DataFrame(
data=np.equal.outer(df['tech'].values, tech_vals),
columns=tech_vals, index=df.index,
)
# Cartesian product of all tech values
tech_combos = pd.MultiIndex.from_product((
pd.Index(name=tech, data=(False, True))
for tech in tech_vals
)).to_frame()
# each group must have at least two tech values in the combination
tech_combos = tech_combos[tech_combos[tech_vals].sum(axis=1) > 1]
# Make tech name tuples
names = pd.DataFrame(index=tech_combos.index)
names[tech_vals] = tech_vals
names[~tech_combos[tech_vals].values] = np.nan
tech_combos['name'] = names.agg(notna_tuple, axis=1)
# Cartesian product of all tech values and all county-co groups,
# where each group must have at least two tech values in the combination
tech_combos = pd.merge(
# For each county-co group, for each tech: is there at least one of that tech?
left=df.groupby(['county', 'co'])[tech_vals]
.any().reset_index(),
right=tech_combos,
how='cross', suffixes=['_any', '']
)
# Eliminate county-co/tech pairs where the tech combo uses a missing tech
tech_combos = tech_combos.loc[
(
tech_combos[tech_vals] <= tech_combos[tech_vals + '_any'].values
).all(axis=1),
['county', 'co', 'name', *tech_vals],
]
# loc values for every tech combination
tech_combos = pd.merge(
left=tech_combos, right=df,
on=['county', 'co'], suffixes=['_group', ''],
)
tech_combos = (
tech_combos.loc[
(
# eliminate rows for which the tech combination does not match
tech_combos[tech_vals] <= tech_combos[tech_vals + '_group'].values
).all(axis=1),
['county', 'co', 'name', 'loc'],
]
.drop_duplicates()
.groupby(['county', 'co', 'name'])
.agg([tuple, 'count'])
.reset_index()
)
return tech_combos
def powerset(iterable, min=0):
"""modified powerset recipe"""
s = list(iterable)
return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(min, len(s)+1))
def mozway(df: pd.DataFrame) -> pd.DataFrame:
group = ['county', 'co']
tmp = df.groupby(['tech', *group])['loc'].agg(set)
out = []
for k, g in tmp.groupby(group):
for x in map(list, powerset(g.index.get_level_values('tech'), min=2)):
out.append(dict(zip(group, k))
|{'tech': ', '.join(x),
'log': (s:=set().union(*g.loc[x])),
'log-nunique': len(s),
})
return pd.DataFrame(out)
def test() -> None:
print(agg_tech(op_data()))
print(mozway(op_data()))
def plot() -> None:
rand = np.random.default_rng(seed=0)
methods = (agg_tech, mozway)
fig, axes = plt.subplots(nrows=2, ncols=2)
print('By n...')
by_n = []
for n in (10**(np.linspace(1.5, 4, num=20))).astype(int):
inp_orig = big_data(rand, n=n)
for method in methods:
inp = inp_orig.copy()
def run():
return method(inp)
dur = timeit(stmt=run, number=1)
by_n.append((n, method.__name__, dur))
sns.lineplot(pd.DataFrame(by_n, columns=('n', 'method', 'dur')),
x='n', y='dur', hue='method', ax=axes[0][0])
axes[0][0].set(xscale='log', yscale='log')
print('By group_frac...')
by_frac = []
for frac in np.linspace(0, 0.5, num=20):
inp_orig = big_data(rand, group_frac=frac)
for method in methods:
inp = inp_orig.copy()
def run():
return method(inp)
dur = timeit(stmt=run, number=1)
by_frac.append((frac, method.__name__, dur))
sns.lineplot(pd.DataFrame(by_frac, columns=('group_frac', 'method', 'dur')),
x='group_frac', y='dur', hue='method', ax=axes[0][1])
print('By n_locs...')
by_locs = []
for n_locs in np.linspace(1, 200, num=20, dtype=int):
inp_orig = big_data(rand, n_locs=n_locs)
for method in methods:
inp = inp_orig.copy()
def run():
return method(inp)
dur = timeit(stmt=run, number=1)
by_locs.append((n_locs, method.__name__, dur))
sns.lineplot(pd.DataFrame(by_locs, columns=('n_locs', 'method', 'dur')),
x='n_locs', y='dur', hue='method', ax=axes[1][0])
print('By tech dist...')
by_tech = []
for tech_dist in np.linspace(0.05, 0.95, num=20):
inp_orig = big_data(rand, tech_dist=tech_dist)
for method in methods:
inp = inp_orig.copy()
def run():
return method(inp)
dur = timeit(stmt=run, number=1)
by_tech.append((tech_dist, method.__name__, dur))
sns.lineplot(pd.DataFrame(by_tech, columns=('tech_dist', 'method', 'dur')),
x='tech_dist', y='dur', hue='method', ax=axes[1][1])
axes[1][1].set(yscale='log')
plt.show()
def profile():
rand = np.random.default_rng(seed=0)
df = big_data(rand, n=100_000)
pr = cProfile.Profile()
pr.enable()
agg_tech(df)
pr.disable()
ps = pstats.Stats(pr).sort_stats('cumulative')
ps.print_stats(10)
if __name__ == '__main__':
test()
profile()
plot()
county co name loc
tuple count
0 Albany LU (cable, copper) (m, n, o, p, q, r, s, t) 8
1 Albany LU (cable, copper, fibre) (m, n, o, p, q, r, s, t, w) 9
2 Albany LU (cable, fibre) (m, n, o, p, q, r, w) 7
3 Albany LU (copper, fibre) (n, o, s, t, w) 5
4 Laramie LU (cable, copper) (a, b, c, d, e, f, g) 7
This always outperforms the nonvectorised version for any input greater than about 100 elements; by how much depends mostly on the distribution of tech values:
对于任何大于100个元素的输入,这总是优于非向量化版本;具体表现如何主要取决于tech值的分布:
Note that by far the longest operation is agg(tuple)
. If you can tolerate a different output, such as
请注意,到目前为止最长的操作是agg(元组)。如果您可以容忍不同的输出,例如
- a long-form table with one entry per
loc
, or
- a short-form table that only has the unique count, or
- a wide-form table with
loc
-or-NaN in columns,
this will be much, much faster. For instance, for the last option you could use this routine:
这将会快得多。例如,对于最后一个选项,您可以使用以下例程:
def agg_tech(df: pd.DataFrame) -> pd.DataFrame:
# One-hot boolean columns for tech assignments
tech_vals = df['tech'].unique()
df[tech_vals] = pd.DataFrame(
data=np.equal.outer(df['tech'].values, tech_vals),
columns=tech_vals, index=df.index,
)
# Cartesian product of all tech values
tech_combos = pd.MultiIndex.from_product((
pd.Index(name=tech, data=(False, True))
for tech in tech_vals
)).to_frame()
# each group must have at least two tech values in the combination
tech_combos = tech_combos[tech_combos[tech_vals].sum(axis=1) > 1]
# Make tech name tuples
names = pd.DataFrame(index=tech_combos.index)
names[tech_vals] = tech_vals
names[~tech_combos[tech_vals].values] = np.nan
tech_combos['name'] = names.agg(notna_tuple, axis=1)
grouped = df.groupby(['county', 'co'])
df['group_idx'] = grouped.cumcount()
# Cartesian product of all tech values and all county-co groups,
# where each group must have at least two tech values in the combination
tech_combos = pd.merge(
# For each county-co group, for each tech: is there at least one of that tech?
left=grouped[tech_vals]
.any().reset_index(),
right=tech_combos,
how='cross', suffixes=['_any', '']
)
# Eliminate county-co/tech pairs where the tech combo uses a missing tech
tech_combos = tech_combos.loc[
(
tech_combos[tech_vals] <= tech_combos[tech_vals + '_any'].values
).all(axis=1),
['county', 'co', 'name', *tech_vals],
]
# loc values for every tech combination
tech_combos = pd.merge(
left=tech_combos, right=df,
on=['county', 'co'], suffixes=['_group', ''],
)
tech_combos = (
tech_combos.loc[
(
# eliminate rows for which the tech combination does not match
tech_combos[tech_vals] <= tech_combos[tech_vals + '_group'].values
).all(axis=1),
['county', 'co', 'name', 'group_idx', 'loc'],
]
.set_index(['county', 'co', 'name', 'group_idx'])
.unstack('group_idx')
)
return tech_combos
for about a 10x speedup.
加速了约10倍。
更多回答
Great solution, @mozway. +1
“很好的解决方案,”莫兹韦。+1
Would this code be efficient to work on large dataset?
此代码对于处理大型数据集是否有效?
@Rac in terms of generating the data, I believe it's as efficient as it can be with python. However, keep in mind that generating combinations might give a much larger output that the input. In particular if you have many "tech" within a group, and no matter the algorithm or programming language. What if the number of groups and the max number of unique "tech" per group?
@RAC在生成数据方面,我相信它与使用Python一样高效。但是,请记住,生成组合可能会产生比输入大得多的输出。尤其是如果你在一个组中有很多“技术”,无论是算法还是编程语言。如果群组的数量和每个群组的最大独一无二的“技术”数呢?
I'm doing initial performance tests on a frame of 100,000, and this method is about an order of magnitude slower than the vectorised one. I'll post more detail once I have it.
我正在对100,000帧进行初步性能测试,这种方法比矢量化方法慢一个数量级。一旦我有了它,我会发布更多的细节。
This solution is very efficient. Thanks @mozway.
这个解决方案非常有效。谢谢@mozway。
Using a cross-merge to then filter is most often inefficient, especially since you used broadcasting. Have you tested it on a large dataset?
使用交叉合并来过滤通常效率很低,特别是在您使用广播的情况下。你在大型数据集上测试过它吗?
Depends on a lot of things OP has not specified, including data size, proportion of tech values per group, etc. If I were to attempt a large dataset I'd be flying blind and it would be a waste of time without more information.
取决于许多OP没有指定的东西,包括数据大小、每组技术价值的比例等。如果我尝试一个大型数据集,我将盲目飞行,在没有更多信息的情况下将是浪费时间。
Its an extremely large dataset consisting of around 100 million + rows, not sure about the proportion of the tech values per group.
这是一个非常大的数据集,包含大约1亿多行,不确定每组技术值的比例。
@Rac I don't have the patience to test out 100M rows, so I've demonstrated with up to 10,000.
@RAC我没有耐心测试100M行,所以我已经演示了多达10000行。
@mozway So I have tried, now; and for a few reasons - the inefficiency of non-vectorised operations in Python, as well as the relatively low rejection ratio of the post-cross-join filter, this is not inefficient.
@mozway,所以我现在已经尝试了;出于几个原因--Python中非矢量化操作的低效,以及后交叉联接过滤器的相对较低的拒绝率,这并不是低效的。
我是一名优秀的程序员,十分优秀!