本文实例总结了Python常见的pandas用法。分享给大家供大家参考,具体如下:
'''
想要学习Python?Python学习交流群:984632579满足你的需求,资料都已经上传群文件,可以自行下载!
'''
import numpy as np
import pandas as pd
# A Series is a single labelled column: every value is paired with an index entry.
raw_values = [1, 3, 6, np.nan, 44, 1]
s = pd.Series(data=raw_values)
print(s)
print(s.index)
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
RangeIndex(start=0, stop=6, step=1)
# Six consecutive periods starting at 2018-01-01 (the default frequency is daily).
dates = pd.date_range(start='20180101', periods=6)
print(dates)
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06'], dtype='datetime64[ns]', freq='D')
# Build a DataFrame from a 3x4 array. A DataFrame can be viewed as a matrix
# that carries an index (row labels) and columns (column labels).
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
# The original printed `df`, which is not defined at this point; the frame
# created here is `df1`.
print(df1)
0 1 2 3 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11
?
1
2
# np.random.randn(6, 4) produces a 6-row, 4-column matrix of random values.
# Rows are labelled with the `dates` index created earlier, columns with a-d.
df2 = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
# The original printed `df` here; the frame just built is `df2`.
print(df2)
a b c d 2018-01-01 0.300675 1.769383 1.244406 -1.058294 2018-01-02 0.832666 2.216755 0.178716 -0.156828 2018-01-03 1.314190 -0.866199 0.836150 1.001026 2018-01-04 -1.671724 1.147406 -0.148676 -0.272555 2018-01-05 1.146664 2.022861 -1.833995 -0.627568 2018-01-06 -0.192242 1.517676 0.756707 0.058869
?
1
2
3
4
5
6
7
8
9
# Define a DataFrame column by column. Scalars ('A', 'B', 'F') are broadcast
# to the length set by the list-like columns ('C', 'D', 'E'), which all have
# four entries.
df = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20180101'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
print(df)
print(df.dtypes)
A B C D E F 0 1.0 2018-01-01 1.0 3 test foo 1 1.0 2018-01-01 1.0 3 train foo 2 1.0 2018-01-01 1.0 3 test foo 3 1.0 2018-01-01 1.0 3 train foo A float64 B datetime64[ns] C float32 D int32 E category F object dtype: object
?
1
2
3
4
# Row labels, column labels and the underlying values of df.
print(df.index)
print(df.columns)
print(df.values)
Int64Index([0, 1, 2, 3], dtype='int64') Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object') [[1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'test' 'foo'] [1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'train' 'foo'] [1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'test' 'foo'] [1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'train' 'foo']]
?
1
2
print(df.describe())  # summary statistics (count, mean, std, min, quartiles, max)
print(df.T)  # transpose: rows become columns and vice versa
A C D count 4.0 4.0 4.0 mean 1.0 1.0 3.0 std 0.0 0.0 0.0 min 1.0 1.0 3.0 25% 1.0 1.0 3.0 50% 1.0 1.0 3.0 75% 1.0 1.0 3.0 max 1.0 1.0 3.0 0 1 2 \ A 1 1 1 B 2018-01-01 00:00:00 2018-01-01 00:00:00 2018-01-01 00:00:00 C 1 1 1 D 3 3 3 E test train test F foo foo foo 3 A 1 B 2018-01-01 00:00:00 C 1 D 3 E train F foo
?
1
2
3
# Sorting df.
# axis=1 sorts along the column labels, so this reorders the columns
# (descending: F, E, D, C, B, A) and leaves the rows where they are.
print(df.sort_index(axis=1, ascending=False))
# Reorder the rows by the values found in column 'E'.
print(df.sort_values(by='E'))
F E D C B A 0 foo test 3 1.0 2018-01-01 1.0 1 foo train 3 1.0 2018-01-01 1.0 2 foo test 3 1.0 2018-01-01 1.0 3 foo train 3 1.0 2018-01-01 1.0 A B C D E F 0 1.0 2018-01-01 1.0 3 test foo 2 1.0 2018-01-01 1.0 3 test foo 1 1.0 2018-01-01 1.0 3 train foo 3 1.0 2018-01-01 1.0 3 train foo
?
1
2
3
4
5
6
7
8
# A 6x4 frame of the integers 0..23, indexed by six consecutive dates.
indexes = pd.date_range('20180101', periods=6)
df3 = pd.DataFrame(np.arange(24).reshape(6, 4), index=indexes, columns=['A', 'B', 'C', 'D'])
print(df3)
print()

# Select a column: bracket access and attribute access return the same Series.
print(df3['A'])
print()
print(df3.A)
A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23 2018-01-01 0 2018-01-02 4 2018-01-03 8 2018-01-04 12 2018-01-05 16 2018-01-06 20 Freq: D, Name: A, dtype: int32 2018-01-01 0 2018-01-02 4 2018-01-03 8 2018-01-04 12 2018-01-05 16 2018-01-06 20 Freq: D, Name: A, dtype: int32 A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11
?
1
2
3
4
5
6
# Select rows by slicing, comparable to a SQL LIMIT clause.
print(df3[0:0])  # empty positional slice: no rows, only the column header
print()
print(df3[0:3])  # first three rows by position
print()
print(df3['20180103':'20180105'])  # label slice on the date index; both endpoints are included
Empty DataFrame Columns: [A, B, C, D] Index: [] A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 A B C D 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19
?
1
print(df3.loc['20180102'])  # the selected row, returned as a Series
A 4 B 5 C 6 D 7 Name: 2018-01-02 00:00:00, dtype: int32
?
1
2
3
4
5
print(df3.loc['20180103', ['A', 'C']])  # one row, restricted to columns A and C
print()
# A sub-frame, comparable to SELECT A, C FROM df LIMIT ...
print(df3.loc['20180103':'20180105', ['A', 'C']])
print()
print(df3.loc[:, ['A', 'B']])  # every row, columns A and B only
A 8 C 10 Name: 2018-01-03 00:00:00, dtype: int32 A C 2018-01-03 8 10 2018-01-04 12 14 2018-01-05 16 18 A B 2018-01-01 0 1 2018-01-02 4 5 2018-01-03 8 9 2018-01-04 12 13 2018-01-05 16 17 2018-01-06 20 21
?
1
2
3
4
5
6
# Position-based selection with iloc.
print(df3)
print()
print(df3.iloc[1])  # second row
print()
print(df3.iloc[1, 1])  # single cell: second row, second column
print()
print(df3.iloc[:, 1])  # second column
print()
print(df3.iloc[0:3, 1:3])  # rows 0-2, columns 1-2 (end positions excluded)
print()
# The selected rows need not be contiguous, which a LIMIT clause cannot express.
print(df3.iloc[[1, 3, 5], [0, 2]])
A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23 A 4 B 5 C 6 D 7 Name: 2018-01-02 00:00:00, dtype: int32 5 2018-01-01 1 2018-01-02 5 2018-01-03 9 2018-01-04 13 2018-01-05 17 2018-01-06 21 Freq: D, Name: B, dtype: int32 B C 2018-01-01 1 2 2018-01-02 5 6 2018-01-03 9 10 A C 2018-01-02 4 6 2018-01-04 12 14 2018-01-06 20 22
?
1
2
3
# The original kept a disabled `df3.ix[:3, ['A', 'C']]` example here. The `.ix`
# indexer no longer exists in current pandas; mixed positional/label selection
# can be written as `df3.iloc[:3][['A', 'C']]`.
print(df3)
print()
# Filter rows by a condition on the values, comparable to WHERE A >= 8.
print(df3[df3.A >= 8])
A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23 A B C D 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
indexes1 = pd.date_range('20180101', periods=6)
df4 = pd.DataFrame(np.arange(24).reshape(6, 4), index=indexes1, columns=['A', 'B', 'C', 'D'])
print(df4)
print()

# Assign to individual cells.
# The original used chained indexing (`df4.A[1] = ...`, `df4.B['20180103'] = ...`),
# which assigns through an intermediate Series and is not guaranteed to update
# df4 itself. A single .iloc/.loc call on the frame writes to it directly.
df4.iloc[1, df4.columns.get_loc('A')] = 1111  # second row (by position), column A
df4.loc['20180103', 'B'] = 2222
df4.iloc[3, 2] = 3333
df4.loc['20180105', 'D'] = 4444
print(df4)
print()

# Assign to a range selected by a condition.
df4.loc[df4.A < 10, 'B'] = -1  # column B only, in rows where A < 10
print(df4)
print()
df4[df4.A < 10] = 0  # every column, in rows where A < 10
print(df4)
print()
A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23 A B C D 2018-01-01 0 1 2 3 2018-01-02 1111 5 6 7 2018-01-03 8 2222 10 11 2018-01-04 12 13 3333 15 2018-01-05 16 17 18 4444 2018-01-06 20 21 22 23 A B C D 2018-01-01 0 -1 2 3 2018-01-02 1111 5 6 7 2018-01-03 8 -1 10 11 2018-01-04 12 13 3333 15 2018-01-05 16 17 18 4444 2018-01-06 20 21 22 23 A B C D 2018-01-01 0 0 0 0 2018-01-02 1111 5 6 7 2018-01-03 0 0 0 0 2018-01-04 12 13 3333 15 2018-01-05 16 17 18 4444 2018-01-06 20 21 22 23
?
1
2
3
4
5
6
7
8
9
10
indexes1 = pd.date_range('20180101', periods=6)
df4 = pd.DataFrame(np.arange(24).reshape(6, 4), index=indexes1, columns=['A', 'B', 'C', 'D'])
print(df4)
print()

# Add a column filled with NaN. `np.nan` is the canonical spelling; the
# `np.NaN` alias used originally was removed in NumPy 2.0.
df4['E'] = np.nan
print(df4)
print()

# The new Series starts one day later than df4's index, so the two are not
# aligned: rows of df4 that are missing from the Series become NaN (making the
# column float64), and Series entries with no matching row are dropped.
df4['F'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20180102', periods=6))
print(df4)
print()
print(df4.dtypes)
A B C D 2018-01-01 0 1 2 3 2018-01-02 4 5 6 7 2018-01-03 8 9 10 11 2018-01-04 12 13 14 15 2018-01-05 16 17 18 19 2018-01-06 20 21 22 23 A B C D E 2018-01-01 0 1 2 3 NaN 2018-01-02 4 5 6 7 NaN 2018-01-03 8 9 10 11 NaN 2018-01-04 12 13 14 15 NaN 2018-01-05 16 17 18 19 NaN 2018-01-06 20 21 22 23 NaN A B C D E F 2018-01-01 0 1 2 3 NaN NaN 2018-01-02 4 5 6 7 NaN 1.0 2018-01-03 8 9 10 11 NaN 2.0 2018-01-04 12 13 14 15 NaN 3.0 2018-01-05 16 17 18 19 NaN 4.0 2018-01-06 20 21 22 23 NaN 5.0 A int32 B int32 C int32 D int32 E float64 F float64 dtype: object
?
1
2
3
4
5
6
7
8
9
10
11
12
# Template frame with two missing values, reused by the NaN-handling examples.
df_t = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=[1, 2, 3, 4, 5, 6],
    columns=['A', 'B', 'C', 'D'],
)
# B and C are about to receive NaN, which an integer column cannot hold. Cast
# them to float explicitly instead of relying on silent upcasting during
# assignment; A and D stay integer, matching the output shown in the article.
df_t = df_t.astype({'B': 'float64', 'C': 'float64'})
df_t.iloc[0, 1] = np.nan  # np.nan replaces the np.NaN alias removed in NumPy 2.0
df_t.iloc[1, 2] = np.nan

df = df_t.copy()
print(df)
print()
print(df.dropna(axis=0, how='any'))  # drop every row that contains at least one NaN
print()

df = df_t.copy()
print(df.dropna(axis=1, how='any'))  # drop every column that contains at least one NaN
print()

df = df_t.copy()
df['C'] = np.nan  # make column C entirely NaN
print(df)
print()
print(df.dropna(axis=1, how='all'))  # drop only the columns in which every value is NaN
print()
A B C D 1 0 NaN 2.0 3 2 4 5.0 NaN 7 3 8 9.0 10.0 11 4 12 13.0 14.0 15 5 16 17.0 18.0 19 6 20 21.0 22.0 23 A B C D 3 8 9.0 10.0 11 4 12 13.0 14.0 15 5 16 17.0 18.0 19 6 20 21.0 22.0 23 A D 1 0 3 2 4 7 3 8 11 4 12 15 5 16 19 6 20 23 A B C D 1 0 NaN NaN 3 2 4 5.0 NaN 7 3 8 9.0 NaN 11 4 12 13.0 NaN 15 5 16 17.0 NaN 19 6 20 21.0 NaN 23 A B D 1 0 NaN 3 2 4 5.0 7 3 8 9.0 11 4 12 13.0 15 5 16 17.0 19 6 20 21.0 23
?
1
2
3
4
5
6
7
df = df_t.copy()
print(df)
print()
print(df.isna())  # element-wise mask: True where the value is NaN
print()
print(df.isnull().any())  # per column: does it contain any NaN?
print()
# isnull is an alias of isna; both behave the same.
print(df.isnull().any(axis=1))  # per row: does it contain any NaN?
print()
# A single bool for the whole frame. The mask is already boolean, so the
# original's comparison against True was redundant.
print(df.isna().to_numpy().any())
print()
print(df.fillna(value=0))  # replace NaN with 0 in the returned frame
A B C D 1 0 NaN 2.0 3 2 4 5.0 NaN 7 3 8 9.0 10.0 11 4 12 13.0 14.0 15 5 16 17.0 18.0 19 6 20 21.0 22.0 23 A B C D 1 False True False False 2 False False True False 3 False False False False 4 False False False False 5 False False False False 6 False False False False A False B True C True D False dtype: bool 1 True 2 True 3 False 4 False 5 False 6 False dtype: bool True A B C D 1 0 0.0 2.0 3 2 4 5.0 0.0 7 3 8 9.0 10.0 11 4 12 13.0 14.0 15 5 16 17.0 18.0 19 6 20 21.0 22.0 23
?
1
2
3
# Load a CSV file into a DataFrame, show it, then save it in pickle format.
# NOTE(review): both paths are hard-coded to the author's machine; adjust them
# before running.
data = pd.read_csv('D:/pythonwp/test/student.csv')
print(data)
data.to_pickle('D:/pythonwp/test/student.pickle')
id name age gender 0 1 牛帅 23 Male 1 2 gyb 89 Male 2 3 xxs 27 Male 3 4 hey 24 Female 4 5 奥莱利赫本 66 Female 5 6 Jackson 61 Male 6 7 牛帅 23 Male
?
1
2
3
4
5
6
7
8
9
10
# Three 3x4 frames filled with 0.0, 1.0 and 2.0, sharing the same columns.
df0 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['A', 'B', 'C', 'D'])
df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['A', 'B', 'C', 'D'])
print(df0)
print()
print(df1)
print()
print(df2)
print()

# Stack the frames vertically; each keeps its own row labels (0, 1, 2 repeated).
res = pd.concat([df0, df1, df2], axis=0)
print(res)
print()

# Same stacking, but the row labels are renumbered 0..8.
res = pd.concat([df0, df1, df2], axis=0, ignore_index=True)
print(res)
A B C D 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 A B C D 0 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 A B C D 0 2.0 2.0 2.0 2.0 1 2.0 2.0 2.0 2.0 2 2.0 2.0 2.0 2.0 A B C D 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 0 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 0 2.0 2.0 2.0 2.0 1 2.0 2.0 2.0 2.0 2 2.0 2.0 2.0 2.0 A B C D 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 5 1.0 1.0 1.0 1.0 6 2.0 2.0 2.0 2.0 7 2.0 2.0 2.0 2.0 8 2.0 2.0 2.0 2.0
?
1
2
3
4
5
6
7
8
# Two frames that share only columns C and D.
df0 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['A', 'B', 'C', 'D'])
df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['E', 'F', 'C', 'D'])

# Default behaviour: keep the union of the columns and fill the gaps with NaN.
res = pd.concat([df0, df1], ignore_index=True)
print(res)
print()

# join='outer' spells out that default explicitly.
res = pd.concat([df0, df1], join='outer', ignore_index=True)
print(res)
print()

# join='inner' keeps only the columns present in both frames.
res = pd.concat([df0, df1], join='inner', ignore_index=True)
print(res)
print()
A B C D E F 0 0.0 0.0 0.0 0.0 NaN NaN 1 0.0 0.0 0.0 0.0 NaN NaN 2 0.0 0.0 0.0 0.0 NaN NaN 3 NaN NaN 1.0 1.0 1.0 1.0 4 NaN NaN 1.0 1.0 1.0 1.0 5 NaN NaN 1.0 1.0 1.0 1.0 A B C D E F 0 0.0 0.0 0.0 0.0 NaN NaN 1 0.0 0.0 0.0 0.0 NaN NaN 2 0.0 0.0 0.0 0.0 NaN NaN 3 NaN NaN 1.0 1.0 1.0 1.0 4 NaN NaN 1.0 1.0 1.0 1.0 5 NaN NaN 1.0 1.0 1.0 1.0 C D 0 0.0 0.0 1 0.0 0.0 2 0.0 0.0 3 1.0 1.0 4 1.0 1.0 5 1.0 1.0
?
1
2
3
4
5
6
7
8
9
10
11
# Horizontal concatenation (side by side). The two frames overlap on row
# labels '2' and '3' only.
df0 = pd.DataFrame(np.ones((3, 4)) * 0, index=['1', '2', '3'], columns=['A', 'B', 'C', 'D'])
df1 = pd.DataFrame(np.ones((3, 4)) * 1, index=['2', '3', '4'], columns=['A', 'B', 'C', 'D'])
print(df0)
print()
print(df1)
print()

# Union of the row labels; rows missing from one side are filled with NaN.
res = pd.concat([df0, df1], axis=1)
print(res)
print()

# Only the shared row labels; ignore_index renumbers the concatenated axis,
# so the columns become 0..7.
res = pd.concat([df0, df1], axis=1, join='inner', ignore_index=True)
print(res)
print()

# Keep exactly df0's row labels. The `join_axes` argument used originally was
# removed from pd.concat in pandas 1.0; reindexing the result is the
# documented replacement.
res = pd.concat([df0, df1], axis=1).reindex(df0.index)
print(res)
print()
A B C D 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 A B C D 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 A B C D A B C D 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0 0 1 2 3 4 5 6 7 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 A B C D A B C D 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
?
1
2
3
4
5
6
7
8
df0 = pd.DataFrame(np.ones((3, 4)) * 0, index=['1', '2', '3'], columns=['A', 'B', 'C', 'D'])
df1 = pd.DataFrame(np.ones((3, 4)) * 1, index=['2', '3', '4'], columns=['A', 'B', 'C', 'D'])
print(df0)
print()
print(df1)
print()

# Append several frames below df0, keeping the original row labels.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
res = pd.concat([df0, df1, df1], ignore_index=False)
print(res)
print()

# Append a single Series as one new row. Its labels are matched against the
# column names: 'D' is absent from the Series and 'E' is absent from df0, so
# both gaps are filled with NaN.
s = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'E'])
appended = pd.concat([df0, s.to_frame().T], ignore_index=True)
print(appended)
A B C D 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 A B C D 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 A B C D 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 A B C D E 0 0.0 0.0 0.0 0.0 NaN 1 0.0 0.0 0.0 0.0 NaN 2 0.0 0.0 0.0 0.0 NaN 3 1.0 2.0 3.0 NaN 4.0
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Two frames sharing a 'key' column; K1 and K2 occur in both, K0 only on the
# left and K3 only on the right.
df1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2'],
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2'],
})
df2 = pd.DataFrame({
    'key': ['K3', 'K1', 'K2'],
    'C': ['C3', 'C1', 'C2'],
    'D': ['D3', 'D1', 'D2'],
})
print(df1)
print()
print(df2)
print()

res = pd.merge(df1, df2, on='key')  # default (inner): keys present in both frames
print(res)
print()
res = pd.merge(df1, df2, on='key', how='outer')  # keys from either frame
print(res)
print()
res = pd.merge(df1, df2, on='key', how='left')  # every key of df1
print(res)
print()
res = pd.merge(df1, df2, on='key', how='right')  # every key of df2
print(res)
print()
A B key 0 A0 B0 K0 1 A1 B1 K1 2 A2 B2 K2 C D key 0 C3 D3 K3 1 C1 D1 K1 2 C2 D2 K2 A B key C D 0 A1 B1 K1 C1 D1 1 A2 B2 K2 C2 D2 A B key C D 0 A0 B0 K0 NaN NaN 1 A1 B1 K1 C1 D1 2 A2 B2 K2 C2 D2 3 NaN NaN K3 C3 D3 A B key C D 0 A0 B0 K0 NaN NaN 1 A1 B1 K1 C1 D1 2 A2 B2 K2 C2 D2 A B key C D 0 A1 B1 K1 C1 D1 1 A2 B2 K2 C2 D2 2 NaN NaN K3 C3 D3
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Merge on a pair of key columns.
df1 = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1'],
    'key2': ['K0', 'K1', 'K1'],
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2'],
})
df2 = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K1', 'K2'],
    'C': ['C3', 'C1', 'C2', 'C4'],
    'D': ['D3', 'D1', 'D2', 'D4'],
})
print(df1)
print()
print(df2)
print()

# Inner merge: a row is produced for every matching (key1, key2) combination,
# so the duplicated (K0, K0) pair in df2 yields two rows.
res = pd.merge(df1, df2, on=['key1', 'key2'])
print(res)
print()

# Outer merge; the extra 'indi' column records where each row came from
# (both / left_only / right_only).
res = pd.merge(df1, df2, on=['key1', 'key2'], how='outer', indicator='indi')
print(res)
print()
A B key1 key2 0 A0 B0 K0 K0 1 A1 B1 K0 K1 2 A2 B2 K1 K1 C D key1 key2 0 C3 D3 K0 K0 1 C1 D1 K0 K0 2 C2 D2 K1 K1 3 C4 D4 K2 K2 A B key1 key2 C D 0 A0 B0 K0 K0 C3 D3 1 A0 B0 K0 K0 C1 D1 2 A2 B2 K1 K1 C2 D2 A B key1 key2 C D indi 0 A0 B0 K0 K0 C3 D3 both 1 A0 B0 K0 K0 C1 D1 both 2 A1 B1 K0 K1 NaN NaN left_only 3 A2 B2 K1 K1 C2 D2 both 4 NaN NaN K2 K2 C4 D4 right_only
?
1
2
3
4
5
6
7
8
9
10
11
12
13
# The merges above match on column values. The ones below match on the index.
df1 = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['index0', 'index1', 'index2'],
)
df2 = pd.DataFrame(
    {'A': ['C3', 'C1', 'C2'], 'D': ['D3', 'D1', 'D2']},
    index=['index3', 'index1', 'index2'],
)
print(df1)
print()
print(df2)
print()

# Inner merge on the row labels. Both frames have a column 'A', so the default
# suffixes produce A_x and A_y.
res = pd.merge(df1, df2, left_index=True, right_index=True)
print(res)
print()

# Outer merge on the row labels with custom suffixes for the clashing column.
res = pd.merge(df1, df2, left_index=True, right_index=True, how='outer', suffixes=['_b', '_g'])
print(res)
print()
A B index0 A0 B0 index1 A1 B1 index2 A2 B2 A D index3 C3 D3 index1 C1 D1 index2 C2 D2 A_x B A_y D index1 A1 B1 C1 D1 index2 A2 B2 C2 D2 A_b B A_g D index0 A0 B0 NaN NaN index1 A1 B1 C1 D1 index2 A2 B2 C2 D2 index3 NaN NaN C3 D3
?
1
2
3
4
# Without `on`, join matches the two frames on their indexes.
res = df1.join(df2, how='outer', lsuffix='_left', rsuffix='_right')
print(res)
print()

# With `on`, the named column of df1 is matched against the index of df2.
res = df1.join(df2, on='B', how='outer', lsuffix='_left', rsuffix='_right')
print(res)
print()
A_left B A_right D index0 A0 B0 NaN NaN index1 A1 B1 C1 D1 index2 A2 B2 C2 D2 index3 NaN NaN C3 D3 A_left B A_right D index0 A0 B0 NaN NaN index1 A1 B1 NaN NaN index2 A2 B2 NaN NaN index2 NaN index3 C3 D3 index2 NaN index1 C1 D1 index2 NaN index2 C2 D2
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # plotting module
# Running sum of 1000 random values.
s = pd.Series(np.random.randn(1000), index=np.arange(1000))
s = s.cumsum()
# Plotting is left disabled, as in the original; the author notes it has to be
# run from a command line and reported errors under Jupyter.
# s.plot()
# plt.show()

# Three such running sums, one per column.
df = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'])
df = df.cumsum()
print(df.head())  # head() shows the first 5 rows by default
print()

# Disabled plotting examples (run from a command line, see the note above).
# NOTE(review): the original repeated `s.plot()` here; `df.plot()` was probably intended.
# s.plot()
# plt.show()

# Other plot kinds: 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie', ...
# Scatter example. The second call draws onto the axes returned by the first,
# which is passed as `ax=` (the original passed it as the invalid keyword `class_B=`).
# class_B = df.plot.scatter(x='A', y='B', color='DarkBlue', label='Class B')
# df.plot.scatter(x='A', y='C', color='DarkRed', label='Class C', ax=class_B)
# plt.show()
A B C 0 -0.399363 -1.004210 0.641141 1 -1.970009 -0.608482 -0.758504 2 -3.081640 -0.617352 -1.143872 3 -2.174627 -1.383785 -1.011411 4 -1.415515 -1.892226 -2.511739