Numpy

创建数组

1
2
3
4
import numpy as np
# 1-D array holding the integers 0..4
a = np.arange(5)
# 2-D array of shape (2, 5) built from two identical rows 0..4
b = np.array([np.arange(5), np.arange(5)])
# Arithmetic on an ndarray already returns a new ndarray, so no extra
# np.array(...) wrapper (which would only make a second copy) is needed.
c = np.arange(5) * 4

运算

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Each arithmetic operation can be written either as a NumPy function call
# or with the equivalent operator; both forms work element-wise (with
# broadcasting when the shapes differ).
np.add(a, b)
a + b
np.subtract(a, b)
a - b
np.multiply(a, b)  # element-wise product (NOT a cross product or matrix product)
a * b
np.divide(a, b)
a / b
np.mod(a, b)
a % b

# Matrix operations
# Dot product / matrix product; the inner dimensions of a and b must match.
np.dot(a, b)
# Transpose (has no effect on a 1-D array)
a.T
# Matrix inverse; the argument must be a square, non-singular 2-D array.
import numpy.linalg as lg
lg.inv(a)


切片与索引

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
a = np.arange(9)
In: a[3:7]
Out:array([3,4,5,6])

In: a[:7:2]
Out:array([0,2,4,6])

In: a[::-1]
Out: array([8,7,6,5,4,3,2,1,0])

In: a=np.array([[1,2,3],[4,3,2]])
In: a[1][0]
Out:4
In:a[1,:2] #获取第1行(行索引为1)的前2个元素
Out:array([4, 3])

形状,堆叠,拆分

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
reshape()
ravel() //多维变一维
flatten() //多维变一维 flatten()返回的是副本(拷贝),ravel()尽可能只返回原数组的视图
shape //属性而不是方法,例如 a.shape
transpose() //转置

# Stacking: each function takes ONE sequence (tuple or list) of arrays,
# not the arrays as separate positional arguments.
np.hstack((a, b))  # horizontal (column-wise) stacking
np.vstack((a, b))  # vertical (row-wise) stacking
np.dstack((a, b))  # depth stacking (along the third axis)

# Splitting into 3 equal parts
np.hsplit(a, 3)  # horizontal split
np.vsplit(a, 3)  # vertical split; needs an array with at least 2 dimensions
np.dsplit(a, 3)  # depth split; needs an array with at least 3 dimensions


# Example: reshape, flatten and transpose an array.
# The reshaped array must be bound to `a`; otherwise the result is discarded
# and the following lines would operate on whatever `a` was before.
a = np.arange(24).reshape(2, 3, 4)  # values 0..23 with shape (2, 3, 4)
a.ravel()          # 1-D result; a view of `a` when possible
a.flatten()        # 1-D result; always a copy
a.shape = (6, 4)   # reshape in place by assigning to the shape attribute
a.transpose()      # returns the transpose, shape (4, 6); `a` itself is unchanged


Pandas

Series

Series是一维的,自带索引,索引默认是数字,索引也可以自己改,可以和字典相互转化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from pandas import Series,DataFrame
import pandas as pd

# Build a Series from a list; with no index given, labels default to 0..n-1.
obj = Series([4, 7, -5, 3])
obj.values  # the underlying data
obj.index   # the index labels
# Supply custom index labels explicitly.
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])


# A plain dictionary
data = {'a': 1, 'b': 2, 'c': 3}
# Convert the dictionary to a Series (the keys become the index labels)
series = pd.Series(data)
# Convert the Series back to a dictionary.
# Bound to `data_dict` rather than `dict` so the builtin is not shadowed.
data_dict = series.to_dict()


DataFrame

DataFrame是二维的,其实就是Series的集合

Series只有行索引;DataFrame的每一列就是一个Series,列索引就是各个Series的名称

DataFrame中的各个Series共用同一个行索引

1
2
3
4
# Column-oriented data: each key becomes a column and each list holds that
# column's values (all three lists have length 5).
# 'Ohio' is spelled with the letter O (the digit-zero spelling was a typo).
dictionary = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(dictionary)

SKlearn

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# -*-encoding:utf-8 -*-
# import package.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

import warnings
warnings.filterwarnings('ignore')
# Load the bank-loan data set and take a first look at it.
# NOTE: the bare expressions below (head(), describe(), value_counts(), ...)
# only display their result in an interactive / notebook session.
data_ori = pd.read_csv('/data/bigfiles/bankloan.csv')
data_ori.head()

data_ori.info()

data_ori.describe()

# Count missing values per column and print only the columns that have any.
t = data_ori.isnull().sum()
print(t[t > 0])

data_ori['A1'].value_counts()
data_ori['A1'].unique()
# Fill missing A1 values with the most frequent category.
# NOTE(review): the original comment mentioned 'a' while the code fills 'b';
# confirm against the value_counts() output above.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify data_ori when Copy-on-Write is enabled.
data_ori['A1'] = data_ori['A1'].fillna('b')


# Collect the names of all non-numeric (object dtype) columns.
cols = list(data_ori.select_dtypes(include=['object']).columns)
cols

# Replace each non-numeric column's categories with ordinal codes.
enc = OrdinalEncoder()
newx = enc.fit_transform(data_ori[cols])
newx[:5]
data_ori[cols] = newx
# Summary statistics of the first six columns after encoding.
print(data_ori.iloc[:, :6].describe())