在panda之中,我们经常需要使用到agg函数来将pandas之中的某几列聚合起来
from tqdm import tqdm
import pandas as pd
import numpy as np
df = pd.DataFrame({"A":[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0],
"B":[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0],
"C":[1.0, 1.5, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0],
"D":[2.0, 5.0, 3.0, 6.0, 4.0, 2.0, 5.0, 1.0, 2.0],
"E":['a', 'a', 'b', 'a', 'b', 'b', 'b', 'a', 'a']})
df2 = df.groupby('B').agg({
'C': 'mean',
'D': 'sum',
'E': lambda x: x.mode()
})
print(df2)
这里可以自定义list并加入进度条
agger = {
'C': 'mean',
'D': 'sum'}
#'E': lambda x: x.mode()}
#加上'E'会报错:cannot perform both aggregation and transformation operations simultaneously
#这里的'E'为transformation opertions操作
gcols = ['B'] # columns defining the groups
groupby = df.groupby(gcols)
ngroups = len(groupby)
gfrac = 0.3 # fraction of groups for which you want to print progress
gfrac_size = max((1, int(ngroups*gfrac)))
#gfrac为比例,这里为0.3,ngroups为总的长度
#下面也就是将agg函数进行实现了一下
groups = []
rows = []
for i,g in tqdm(enumerate(groupby),total=len(groupby)):
r"""
i = 0,
g =
(1.0, A B C D E
0 1.0 1.0 1.0 2.0 a
1 2.0 1.0 1.5 5.0 a
2 3.0 1.0 2.0 3.0 b)
"""
gstats = g[1].agg(agger)
r"""
g[1] =
A B C D E
0 1.0 1.0 1.0 2.0 a
1 2.0 1.0 1.5 5.0 a
2 3.0 1.0 2.0 3.0 b
gstats =
C 1.5
D 10.0
newcols =
['C', 'D']
groups =
[1.0]
rows =
[<numpy.flatiter object at 0x21d18a0>]
g[1]取出了上面整体数据的前三个对应值
g[1] =
A B C D E
3 1.0 2.0 2.0 6.0 a
4 2.0 2.0 3.0 4.0 b
5 3.0 2.0 4.0 2.0 b
gstats =
C 3.0
D 12.0
newcols =
['C', 'D']
groups =
[1.0, 2.0]
rows =
[<numpy.flatiter object at 0x21d18a0>,<numpy.flatiter object at ox21cc610>,<numpy.flatiter object at 0x21cd060>]
同理g[1]取出了上面的三个对应值
g[1] =
A B C D E
6 1.0 3.0 5.0 5.0 b
7 2.0 3.0 6.0 1.0 a
8 3.0 3.0 10.0 2.0 a
gstats =
C 7.0
D 8.0
newcols =
['C','D']
groups =
[1.0,2.0,3.0]
rows =
[<numpy.flatiter object at 0x21d18a0>, <numpy.flatiter object at 0x21cc610>, <numpy.flatiter object at 0x21cd060>]
"""
if i==0:
if gstats.ndim==2:
newcols = gstats.columns.tolist()
else:
newcols = gstats.index.tolist()
r"""
newcols = ['C','D']
newcols = ['C','D']
newcols = ['C','D']
"""
groups.append(g[0])
r"""
groups = [1.0]
groups = [1.0,2.0]
groups = [1.0,2.0,3.0]
"""
rows.append(gstats.values.flat)
r"""
rows = [<numpy.flatiter object at 0x21d18a0>]
rows = [<numpy.flatiter object at 0x21d18a0>, <numpy.flatiter object at 0x21cc610>]
rows = [<numpy.flatiter object at 0x21d18a0>, <numpy.flatiter object at 0x21cc610>, <numpy.flatiter object at 0x21cd060>]
"""
df3 = pd.DataFrame(np.vstack(rows), columns=newcols)
if len(gcols) == 1:
df3.index = groups
else:
df3.index = pd.MultiIndex.from_tuples(groups, names=gcols)
df3 = df3.astype(df[newcols].dtypes)
之前每一次得到结果[1.5,10.0],[3.0,12.0]之类的,都是将数值展平之后压入rows
rows.append(gstats.values.flat)
因此这里得到新的结果之后,应该将新的结果进行展开得到对应的DataFrame
df3 = pd.DataFrame(np.vstack(rows),columns=newcols)
并且这里每次得到的列名0.0,1.0,2.0依次压入groups之中
groups.append(g[0])
因此这里最后对df3进行操作
if len(gcols) == 1:
df3.index = groups
else:
df3.index = pd.MultiIndex.from_tuples(groups,names=gcols)
这里将对应的group赋值到df3上面,得到结果
C D
1.0 1.5 10.0
2.0 3.0 12.0
3.0 7.0 8.0
方法2
import pandas as pd
import numpy as np
df = pd.DataFrame({"A":[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0],
"B":[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0],
"C":[1.0, 1.5, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0],
"D":[2.0, 5.0, 3.0, 6.0, 4.0, 2.0, 5.0, 1.0, 2.0],
"E":['a', 'a', 'b', 'a', 'b', 'b', 'b', 'a', 'a']})
df2 = df.groupby('B').agg({
'C': 'mean',
'D': 'sum',
'E': lambda x: x.mode()
})
print(df2)
class ModeHack:
def __init__(self, size=5, N=10):
self.ix = 0
self.K = 1
self.size = size
self.N = N
def mode(self, x):
self.ix = self.ix + x.shape[0]
if self.K*self.size <= self.ix:
print('{:.0f}% complete'.format(100*self.ix/self.N))
self.K += 1
return x.mode()
def reset(self):
self.ix = 0
self.K = 1
mymode = ModeHack(size=int(.1*df.shape[0]), N=df.shape[0])
mymode.reset()
agger = {
'C': 'mean',
'D': 'sum',
'E': lambda x: mymode.mode(x)}
df3 = df.groupby('B').agg(agger)