>>> import pandas as pd
>>> obj = pd.DataFrame([[0, 1], [2, 3]], index=['cat', 'dog'], columns=['weight', 'height'])
>>> obj
     weight  height
cat       0       1
dog       2       3
>>>
>>> obj.stack()
cat  weight    0
     height    1
dog  weight    2
     height    3
dtype: int64
1
2
3
4
5
6
7
8
9
10
11
12
13
多层列(Multi level columns):
>>> import pandas as pd
>>> multicol = pd.MultiIndex.from_tuples([('weight', 'kg'), ('weight', 'pounds')])
>>> obj = pd.DataFrame([[1, 2], [2, 4]], index=['cat', 'dog'], columns=multicol)
>>> obj
    weight
        kg pounds
cat      1      2
dog      2      4
>>>
>>> obj.stack()
            weight
cat kg           1
    pounds       2
dog kg           2
    pounds       4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
缺失值填充:
>>> import pandas as pd
>>> multicol = pd.MultiIndex.from_tuples([('weight', 'kg'), ('height', 'm')])
>>> obj = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], index=['cat', 'dog'], columns=multicol)
>>> obj
    weight height
        kg      m
cat    1.0    2.0
dog    3.0    4.0
>>>
>>> obj.stack()
        height  weight
cat kg     NaN     1.0
    m      2.0     NaN
dog kg     NaN     3.0
    m      4.0     NaN
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
通过 level 参数指定不同层级的轴进行重塑:
>>> import pandas as pd
>>> multicol = pd.MultiIndex.from_tuples([('weight', 'kg'), ('height', 'm')])
>>> obj = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], index=['cat', 'dog'], columns=multicol)
>>> obj
    weight height
        kg      m
cat    1.0    2.0
dog    3.0    4.0
>>>
>>> obj.stack(level=0)
             kg    m
cat height  NaN  2.0
    weight  1.0  NaN
dog height  NaN  4.0
    weight  3.0  NaN
>>>
>>> obj.stack(level=1)
        height  weight
cat kg     NaN     1.0
    m      2.0     NaN
dog kg     NaN     3.0
    m      4.0     NaN
>>>
>>> obj.stack(level=[0, 1])
cat  height  m     2.0
     weight  kg    1.0
dog  height  m     4.0
     weight  kg    3.0
dtype: float64
>>> import pandas as pd
>>> multicol = pd.MultiIndex.from_tuples([('weight', 'kg'), ('height', 'm')])
>>> obj = pd.DataFrame([[None, 1.0], [2.0, 3.0]], index=['cat', 'dog'], columns=multicol)
>>> obj
    weight height
        kg      m
cat    NaN    1.0
dog    2.0    3.0
>>>
>>> obj.stack(dropna=False)
        height  weight
cat kg     NaN     NaN
    m      1.0     NaN
dog kg     NaN     2.0
    m      3.0     NaN
>>>
>>> obj.stack(dropna=True)
        height  weight
cat m      1.0     NaN
dog kg     NaN     2.0
    m      3.0     NaN
>>> import pandas as pd
>>> obj = pd.Series([1, 2, 3, 4], index=pd.MultiIndex.from_product([['one', 'two'], ['a', 'b']]))
>>> obj
one  a    1
     b    2
two  a    3
     b    4
dtype: int64
>>>
>>> obj.unstack()
     a  b
one  1  2
two  3  4
>>>
>>> obj.unstack(level=0)
   one  two
a    1    3
b    2    4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
和 stack 方法类似,如果值不存在将会引入缺失值(NaN):
>>> import pandas as pd
>>> obj1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
>>> obj2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
>>> obj3 = pd.concat([obj1, obj2], keys=['one', 'two'])
>>> obj3
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
>>>
>>> obj3.unstack()
       a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
在 DataFrame 对象中的应用:
>>> import pandas as pd
>>> import numpy as np
>>> obj = pd.DataFrame(np.arange(6).reshape((2, 3)),
...                    index=pd.Index(['Ohio', 'Colorado'], name='state'),
...                    columns=pd.Index(['one', 'two', 'three'], name='number'))
>>> obj
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5
>>>
>>> obj2 = obj.stack()
>>> obj2
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
>>>
>>> obj3 = pd.DataFrame({'left': obj2, 'right': obj2 + 5},
...                     columns=pd.Index(['left', 'right'], name='side'))
>>> obj3
side             left  right
state    number
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10
>>>
>>> obj3.unstack('state')
side   left          right
state  Ohio Colorado  Ohio Colorado
number
one       0        3     5        8
two       1        4     6        9
three     2        5     7       10
>>>
>>> obj3.unstack('state').stack('side')
state         Colorado  Ohio
number side
one    left          3     0
       right         8     5
two    left          4     1
       right         9     6
three  left          5     2
       right        10     7
>>> import pandas as pd
>>> import numpy as np
>>> obj = pd.DataFrame({'data1': ['a'] * 4 + ['b'] * 4, 'data2': np.random.randint(0, 4, 8)})
>>> obj
  data1  data2
0     a      0
1     a      0
2     a      0
3     a      3
4     b      3
5     b      3
6     b      0
7     b      2
>>>
>>> obj.duplicated()
0    False
1     True
2     True
3    False
4    False
5     True
6    False
7    False
dtype: bool
>>>
>>> obj.duplicated(subset='data1')
0    False
1     True
2     True
3     True
4    False
5     True
6     True
7     True
dtype: bool
>>>
>>> obj.duplicated(subset='data2', keep='last')
0     True
1     True
2     True
3     True
4     True
5    False
6    False
7    False
dtype: bool
>>> import pandas as pd
>>> obj = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], name='animal')
>>> obj
0      lama
1       cow
2      lama
3    beetle
4      lama
5     hippo
Name: animal, dtype: object
>>>
>>> obj.drop_duplicates()
0      lama
1       cow
3    beetle
5     hippo
Name: animal, dtype: object
>>>
>>> obj.drop_duplicates(keep='last')
1       cow
3    beetle
4      lama
5     hippo
Name: animal, dtype: object
>>>
>>> obj.drop_duplicates(keep=False)
1       cow
3    beetle
5     hippo
Name: animal, dtype: object
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
如果设置 inplace=True,则不会返回任何值,但原对象的值已被改变:
>>> import pandas as pd
>>> obj1 = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], name='animal')
>>> obj1
0      lama
1       cow
2      lama
3    beetle
4      lama
5     hippo
Name: animal, dtype: object
>>>
>>> obj2 = obj1.drop_duplicates()
>>> obj2  # 有返回值
0      lama
1       cow
3    beetle
5     hippo
Name: animal, dtype: object
>>>
>>> obj3 = obj1.drop_duplicates(inplace=True)
>>> obj3  # 无返回值
>>>
>>> obj1  # 原对象的值已改变
0      lama
1       cow
3    beetle
5     hippo
Name: animal, dtype: object
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
在 DataFrame 对象中的使用:
>>> import numpy as np
>>> import pandas as pd
>>> obj = pd.DataFrame({'data1': ['a'] * 4 + ['b'] * 4, 'data2': np.random.randint(0, 4, 8)})
>>> obj
  data1  data2
0     a      2
1     a      1
2     a      1
3     a      2
4     b      1
5     b      2
6     b      0
7     b      0
>>>
>>> obj.drop_duplicates()
  data1  data2
0     a      2
1     a      1
4     b      1
5     b      2
6     b      0
>>>
>>> obj.drop_duplicates(subset='data2')
  data1  data2
0     a      2
1     a      1
6     b      0
>>>
>>> obj.drop_duplicates(subset='data2', ignore_index=True)
  data1  data2
0     a      2
1     a      1
2     b      0
>>> import pandas as pd
>>> obj = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [5, 6, 7, 8, 9], 'C': ['a', 'b', 'c', 'd', 'e']})
>>> obj
   A  B  C
0  0  5  a
1  1  6  b
2  2  7  c
3  3  8  d
4  4  9  e
>>>
>>> obj.replace(0, 5)
   A  B  C
0  5  5  a
1  1  6  b
2  2  7  c
3  3  8  d
4  4  9  e
>>>
>>> obj.replace({0: 10, 1: 100})
     A  B  C
0   10  5  a
1  100  6  b
2    2  7  c
3    3  8  d
4    4  9  e
>>>
>>> obj.replace({'A': 0, 'B': 5}, 100)
     A    B  C
0  100  100  a
1    1    6  b
2    2    7  c
3    3    8  d
4    4    9  e
>>>
>>> obj.replace({'A': {0: 100, 4: 400}})
     A  B  C
0  100  5  a
1    1  6  b
2    2  7  c
3    3  8  d
4  400  9  e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
to_replace 传入正则表达式:
>>> import pandas as pd
>>> obj = pd.DataFrame({'A': ['bat', 'foo', 'bait'], 'B': ['abc', 'bar', 'xyz']})
>>> obj
      A    B
0   bat  abc
1   foo  bar
2  bait  xyz
>>>
>>> obj.replace(to_replace=r'^ba.$', value='new', regex=True)
      A    B
0   new  abc
1   foo  new
2  bait  xyz
>>>
>>> obj.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True)
      A    B
0   new  abc
1   foo  bar
2  bait  xyz
>>>
>>> obj.replace(regex=r'^ba.$', value='new')
      A    B
0   new  abc
1   foo  new
2  bait  xyz
>>>
>>> obj.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})
      A    B
0   new  abc
1   xyz  new
2  bait  xyz
>>>
>>> obj.replace(regex=[r'^ba.$', 'foo'], value='new')
      A    B
0   new  abc
1   new  new
2  bait  xyz
>>> import numpy as np
>>> import pandas as pd
>>> obj = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
>>> obj
   A  B
0  0  1
1  2  3
2  4  5
3  6  7
4  8  9
>>>
>>> m = obj % 3 == 0
>>> obj.where(m, -obj)
   A  B
0  0 -1
1 -2  3
2 -4 -5
3  6 -7
4 -8  9
>>>
>>> obj.where(m, -obj) == np.where(m, obj, -obj)
      A     B
0  True  True
1  True  True
2  True  True
3  True  True
4  True  True
cond
替换条件,如果 cond 为 False,则保留原始值。如果为 True,则替换为来自 other 的相应值
other
替换值,如果 cond 为 True,则替换为来自该参数的相应值
inplace
bool 类型,是否直接改变原数据且不返回值,默认 False
在 Series 中的应用:
>>> import pandas as pd
>>> obj = pd.Series(range(5))
>>> obj
0    0
1    1
2    2
3    3
4    4
dtype: int64
>>>
>>> obj.mask(obj > 0)
0    0.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64
>>>
>>> obj.mask(obj > 1, 10)
0     0
1     1
2    10
3    10
4    10
dtype: int64
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
在 DataFrame 中的应用:
>>> import numpy as np
>>> import pandas as pd
>>> obj = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
>>> obj
   A  B
0  0  1
1  2  3
2  4  5
3  6  7
4  8  9
>>>
>>> m = obj % 3 == 0
>>>
>>> obj.mask(m, -obj)
   A  B
0  0  1
1  2 -3
2  4  5
3 -6  7
4  8 -9
>>>
>>> obj.where(m, -obj) == obj.mask(~m, -obj)
      A     B
0  True  True
1  True  True
2  True  True
3  True  True
4  True  True