Option 1
Verwendung von defaultdict
from collections import defaultdict
# Collect the data1 values under their (name1, name2) key.
d = defaultdict(list)
# A plain loop instead of a list comprehension: the comprehension was used
# only for its side effect and built a throwaway list of None values.
# Each row of df.values is unpacked into four fields; the fourth is unused.
for n1, n2, d1, _ in df.values:
    d[(n1, n2)].append(d1)

# Each key becomes a column with the row labels 1 and 2 (so two values per
# key are expected -- TODO confirm against the input data). Transposing
# turns the keys into the row index, add_prefix renames the columns to
# 'data1'/'data2', and the key levels are named and moved back into columns.
pd.DataFrame(
    d, [1, 2]
).T.add_prefix('data').rename_axis(['name1', 'name2']).reset_index()
name1 name2 data1 data2
0 a x 1 5
1 a y 2 6
2 b x 3 7
3 b y 4 8
Option 2
Verwendung von numba
from numba import njit
@njit
def plc(f, v):
    """Lay out the values ``v`` in one row per group code found in ``f``.

    Parameters
    ----------
    f : array of integer group codes
        One code per value. ``np.bincount`` is applied to it and its
        entries are used as row indices of the result.
    v : array of values
        Iterated in lockstep with ``f``; each value is written into an
        integer array.

    Returns
    -------
    2-D integer array with ``f.max() + 1`` rows and as many columns as the
    largest group has members. Row ``g`` holds the values of group ``g`` in
    order of appearance; unused trailing cells stay 0.
    """
    m = np.bincount(f).max()  # size of the largest group -> column count
    n = f.max() + 1  # one row per group code 0..f.max()
    # Explicit zero-filled integer buffers instead of ``np.arange(...) * 0``.
    a = np.zeros((n, m), np.int64)
    j = np.zeros(n, np.int64)  # next free column for each row
    for x, y in zip(f, v):
        a[x, j[x]] = y
        j[x] += 1
    return a
# Factorize the (name1, name2) pairs: f holds one integer code per row of
# df, u holds the distinct pairs.
f, u = pd.Series(zip(df.name1.values, df.name2.values)).factorize()
# Build the key columns and the value columns as separate frames and join
# them column-wise. A single NumPy array has only one dtype, so stacking
# the string keys and the integer data into one array does not keep the
# numbers numeric.
keys = pd.DataFrame(u.tolist(), columns=['name1', 'name2'])
vals = pd.DataFrame(plc(f, df.data1.values), columns=['data1', 'data2'])
pd.concat([keys, vals], axis=1)
name1 name2 data1 data2
0 a x 1 5
1 a y 2 6
2 b x 3 7
3 b y 4 8
TIMING
kleine Daten
%%timeit
# Timed cell for the numba variant: factorize the (name1, name2) pairs,
# lay out data1 per group with plc, then build the result frame.
f, u = pd.Series(zip(df.name1.values, df.name2.values)).factorize()
new = np.column_stack([np.array(u.tolist()), plc(f, df.data1.values)])
pd.DataFrame(new, columns='name1 name2 data1 data2'.split())
###############################################################
%%timeit
# Timed cell for the defaultdict variant: collect data1 per (name1, name2)
# key, then reshape the dict into the result frame.
d = defaultdict(list)
[d[(n1, n2)].append(d1) for n1, n2, d1, d2 in df.values];
pd.DataFrame(d, [1, 2]).T.add_prefix('data').rename_axis(['name1', 'name2']).reset_index()
###############################################################
%%timeit
# Timed cell for the groupby variant: gather data1 into one list per
# (name1, name2) group, expand the lists into columns, and rename the
# positional columns 0, 1, ... to 'data1', 'data2', ...
df1 = df.groupby(['name1','name2'])['data1'].apply(list)
df2 = pd.DataFrame(df1.values.tolist(), index=df1.index)
df2.rename(columns = lambda x: 'data' + str(x + 1)).reset_index()
#--------------------------------------------------------------
1000 loops, best of 3: 400 µs per loop
100 loops, best of 3: 2.23 ms per loop
100 loops, best of 3: 2.82 ms per loop
Ich finde keine einfache Möglichkeit, die Listen aufzuteilen ... ich gebe auf ... +1 –
@ cᴏʟᴅsᴘᴇᴇᴅ - Danke. – jezrael
@ cᴏʟᴅsᴘᴇᴇᴅ gib nicht auf! – piRSquared