Scikit-Learn#

Scikit-learn is an open-source Python library that implements machine learning, preprocessing, cross-validation, and visualization algorithms behind a single, consistent interface.

Quick Example#

# Import the required libraries
from sklearn import neighbors, datasets, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the data
iris = datasets.load_iris()
X, y = iris.data[:, :2], iris.target  # use only the first two features

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# Preprocess the data
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train and predict
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Evaluate
accuracy_score(y_test, y_pred)
0.631578947368421

Loading Data#

Scikit-learn works with numeric data stored as NumPy arrays or SciPy sparse matrices; it also accepts other data types that can be converted to numeric arrays, such as pandas DataFrames.

import numpy as np
X = np.random.random((10, 5))  # 10 samples with 5 random features
y = np.array(["M", "M", "F", "F", "M", "F", "M", "M", "F", "F"])  # string class labels
X[X < 0.7] = 0  # zero out small values (these zeros are treated as missing later)
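
The paragraph above also mentions pandas DataFrames and SciPy sparse matrices; both can be passed to estimators directly. A minimal sketch (the column names are made up for illustration):

import pandas as pd
from scipy import sparse

X_df = pd.DataFrame(X, columns=[f"f{i}" for i in range(5)])  # DataFrame view of the same data
X_sparse = sparse.csr_matrix(X)  # the same data as a SciPy sparse matrix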

Train/Test Split#

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

Data Preprocessing#

Standardization#

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)  # fit on the training set only
standardized_X = scaler.transform(X_train)  # transform the training set
standardized_X_test = scaler.transform(X_test)  # transform the test set

Normalization#

from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X_train)  # fit (kept for a uniform API)
normalized_X = scaler.transform(X_train)  # transform the training set
normalized_X_test = scaler.transform(X_test)  # transform the test set

Binarization#

from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.0).fit(X)  # fit
binary_X = binarizer.transform(X)  # values above the threshold become 1, the rest 0

Encoding Categorical Features#

from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
y = enc.fit_transform(y)  # maps "F"/"M" to 0/1
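
LabelEncoder is meant for target labels. For categorical input features, OneHotEncoder is the usual choice; a minimal sketch with a made-up single-column feature:

from sklearn.preprocessing import OneHotEncoder

city = np.array([["London"], ["Paris"], ["London"]])  # hypothetical categorical feature
onehot = OneHotEncoder().fit(city)
city_encoded = onehot.transform(city).toarray()  # one indicator column per category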

Imputing Missing Values#

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=0, strategy="mean")  # treat 0 as missing, fill with the column mean
imp.fit_transform(X_train)  # fit on the training data and return the imputed array
array([[0.94405829, 0.96245828, 0.97310823, 0.79917586, 0.8052525 ],
       [0.94887482, 0.83087041, 0.82984755, 0.79917586, 0.8052525 ],
       [0.95369135, 0.83087041, 0.90147789, 0.72662936, 0.75121241],
       [0.94887482, 0.79301132, 0.90147789, 0.79917586, 0.93847022],
       [0.94887482, 0.83087041, 0.90147789, 0.93928643, 0.8052525 ],
       [0.94887482, 0.75120976, 0.90147789, 0.73161178, 0.72607487],
       [0.94887482, 0.81680226, 0.90147789, 0.79917586, 0.8052525 ]])

Generating Polynomial Features#

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(5)  # polynomial and interaction features up to degree 5
poly.fit_transform(X)
array([[1.        , 0.        , 0.75120976, ..., 0.20488284, 0.20333227,
        0.20179343],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.3357622 ],
       ...,
       [1.        , 0.        , 0.79301132, ..., 0.        , 0.        ,
        0.72795155],
       [1.        , 0.91239431, 0.        , ..., 0.44226442, 0.55753912,
        0.70285977],
       [1.        , 0.94405829, 0.96245828, ..., 0.        , 0.        ,
        0.        ]])

Creating Models#

Supervised Learning Estimators#

Linear Regression

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

Support Vector Machines (SVM)

from sklearn.svm import SVC
svc = SVC(kernel="linear")

Naive Bayes

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

K-Nearest Neighbors (KNN)

from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

Unsupervised Learning Estimators#

Principal Component Analysis (PCA)

from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)  # keep enough components to explain 95% of the variance

K-Means Clustering

from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=0)
Model Fitting#

Supervised Learning#

lr.fit(X, y)  # fit the model to the data
LinearRegression()
knn.fit(X_train, y_train)
KNeighborsClassifier()
svc.fit(X_train, y_train)
SVC(kernel='linear')

Unsupervised Learning#

k_means.fit(X_train)  # fit the model to the data
KMeans(n_clusters=3, random_state=0)
pca_model = pca.fit_transform(X_train)  # fit and transform in one step
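
Because n_components was given as a float (0.95), PCA keeps just enough components to explain 95% of the variance. A quick check on the fitted model (a sketch, not part of the original):

print(pca.explained_variance_ratio_)  # variance explained by each retained component
print(pca.explained_variance_ratio_.sum())  # cumulative share, at least 0.95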

Prediction#

Supervised Estimators#

y_pred = svc.predict(np.random.random((2, 5)))  # predict labels for new samples
y_pred = lr.predict(X_test)  # predict target values
y_pred = knn.predict_proba(X_test)  # estimate class probabilities

Unsupervised Estimators#

y_pred = k_means.predict(X_test)  # predict cluster labels

Evaluating Model Performance#

Classification Metrics#

Accuracy

svc.fit(X_train, y_train)
svc.score(X_test, y_test)  # estimator's built-in score method
0.3333333333333333
from sklearn.metrics import accuracy_score  # metric scoring function
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)  # compute accuracy from the predictions
0.3333333333333333

Classification Report

from sklearn.metrics import classification_report  # precision, recall, F1-score and support
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           F       0.00      0.00      0.00         2
           M       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3

Confusion Matrix

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))
[[0 2]
 [0 1]]

Regression Metrics#

Mean Absolute Error

from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np
data_url = "http://lib.stat.cmu.edu/datasets/boston"  # Boston housing data (no longer bundled with scikit-learn)
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])  # each record spans two rows in the raw file
target = raw_df.values[1::2, 2]
X, y = data, target
house_X_train, house_X_test, house_y_train, house_y_test = train_test_split(
    X, y, random_state=0
)
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor().fit(house_X_train, house_y_train)
house_y_pred = dt.predict(house_X_test)
mean_absolute_error(house_y_test, house_y_pred)
3.3425196850393695

Mean Squared Error

from sklearn.metrics import mean_squared_error
mean_squared_error(house_y_test, house_y_pred)
27.695826771653547

R² Score

from sklearn.metrics import r2_score
r2_score(house_y_test, house_y_pred)
0.6610017070198235

Clustering Metrics#

Adjusted Rand Index

from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y_test, y_pred)
0.0

Homogeneity

from sklearn.metrics import homogeneity_score
homogeneity_score(y_test, y_pred)
0.0

V-measure

import sklearn.metrics as metrics

metrics.v_measure_score(y_test, y_pred)
0.0

Cross-Validation#

from sklearn.model_selection import cross_val_score
print(cross_val_score(knn, X_train, y_train, cv=4))
[0.5 0.5 0.5 1. ]
print(cross_val_score(lr, X, y, cv=2))
[ 0.6069688  -2.25273434]
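
The per-fold scores are usually summarized by their mean and spread; a minimal sketch reusing knn and the training split from above:

scores = cross_val_score(knn, X_train, y_train, cv=4)
print(scores.mean(), scores.std())  # average score across folds and its variability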

Model Tuning and Optimization#

Randomized Hyperparameter Search#

from sklearn.model_selection import RandomizedSearchCV

params = {"n_neighbors": range(1, 5), "weights": ["uniform", "distance"]}

rsearch = RandomizedSearchCV(
    estimator=knn, param_distributions=params, cv=4, n_iter=8, random_state=5
)

rsearch.fit(X_train, y_train)
print(rsearch.best_score_)
0.625
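
For a small, fully enumerable grid like the one above, an exhaustive grid search is a common alternative; a minimal sketch with the same parameter grid (not part of the original):

from sklearn.model_selection import GridSearchCV

gsearch = GridSearchCV(estimator=knn, param_grid=params, cv=4)
gsearch.fit(X_train, y_train)
print(gsearch.best_score_, gsearch.best_params_)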