Numpy
创建数组
1 2 3 4
| import numpy as np a = np.arange(5) b = np.array([np.arange(5),np.arange(5)]) c = np.array(np.arange(5)*4)
|
运算
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
| //每种运算的两种表示都可以 np.add(a,b) a+b np.subtract(a,b) a-b np.multiply(a,b) //逐元素相乘(不是叉乘,叉乘用 np.cross) a*b np.divide(a,b) a/b np.mod(a,b) a%b
//矩阵的运算 //点乘 np.dot(a,b) //转置 a.T //矩阵的逆 import numpy.linalg as lg lg.inv(a)
|
切片与索引
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
| a = np.arange(9) In: a[3:7] Out: array([3,4,5,6])
In: a[:7:2] Out:array([0,2,4,6])
In: a[::-1] Out: array([8,7,6,5,4,3,2,1,0])
In: a=np.array([[1,2,3],[4,3,2]]) In: a[1][0] Out: 4 In:a[1,:2] Out:array([4, 3])
|
形状,堆叠,拆分
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
| reshape() ravel() //多维变一维 flatten() //多维变一维 flatten()返回的是副本(拷贝),ravel()尽可能只返回原数组的视图 shape //形状,是属性而不是方法 transpose() //转置
np.hstack((a,b)) np.vstack((a,b)) np.dstack((a,b)) //深度堆叠,注意参数是由数组组成的元组
np.hsplit(a,3) np.vsplit(a,3) np.dsplit(a,3)
//example a = np.arange(24).reshape(2,3,4) a.ravel() a.flatten() a.shape=(6,4) a.transpose()
|
Pandas
Series
Series是一维的,自带索引,索引默认是数字,索引也可以自己改,可以和字典相互转化
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
| from pandas import Series,DataFrame import pandas as pd
obj=Series([4,7,-5,3]) obj.values obj.index obj2=Series([4,7,-5,3],index=['d','b','a','c'])
data = {'a': 1, 'b': 2, 'c': 3}
series = pd.Series(data)
d = series.to_dict()  # 不要用 dict 作变量名,会覆盖内置的 dict
|
DataFrame
DataFrame是二维的,其实就是Series的集合
Series只有行索引;DataFrame的每一列就是一个Series,列索引就是这些Series的名字
DataFrame中的Series共用行索引
1 2 3 4
| dictionary = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'], 'year':[2000,2001,2002,2001,2002], 'pop':[1.5,1.7,3.6,2.4,2.9]} frame = DataFrame(dictionary)
|
SKlearn
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
| # -*-encoding:utf-8 -*- # import package. import numpy as np import pandas as pd from sklearn.preprocessing import OrdinalEncoder
import warnings warnings.filterwarnings('ignore') data_ori = pd.read_csv('/data/bigfiles/bankloan.csv') data_ori.head()
data_ori.info()
data_ori.describe()
t = data_ori.isnull().sum() print(t[t>0])
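data_ori['A1'].value_counts() data_ori['A1'].unique() #用出现频率最高的取值填充A1的缺失值(这里假定 value_counts() 的结果中最多的是 'b');用赋值而不是对列做 inplace 修改,后者在新版 pandas(Copy-on-Write)下不会更新原 DataFrame data_ori['A1'] = data_ori['A1'].fillna('b')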
#筛选所有非数值型的字段 cols = [col for col in data_ori.select_dtypes(include=['object']).columns] cols
enc=OrdinalEncoder() newx=enc.fit_transform(data_ori[cols]) newx[:5] data_ori[cols]=newx print(data_ori.iloc[:,:6].describe())
|