Validation 和 Cross Validation
1 2
import numpy as np
from sklearn import datasets
|
1 2 3
# Load the digits dataset bundled with scikit-learn and split it into
# the feature matrix (X) and the target labels (y).
digits = datasets.load_digits()
X = digits.data
y = digits.target
|
测试train_test_split
1 2
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples as the test set; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=666
)
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14
from sklearn.neighbors import KNeighborsClassifier

# Exhaustive search over n_neighbors (2..10) and the Minkowski exponent p
# (1..5), keeping the combination with the highest score.
# NOTE: each candidate is scored on (X_test, y_test), so the test set is
# what selects the hyperparameters; best_score is therefore an optimistic
# estimate of generalization performance.
best_k, best_p, best_score = 0, 0, 0
for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=k, p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        # Strict ">" keeps the first combination seen when scores tie.
        if score > best_score:
            best_k, best_p, best_score = k, p, score

print("Best K =", best_k)
print("Best P =", best_p)
print("Best Score =", best_score)
|
Best K = 3
Best P = 4
Best Score = 0.986091794159
使用交叉验证
1 2 3 4
from sklearn.model_selection import cross_val_score

# Cross-validate a default KNN classifier on the training data only.
# The output shown below has three scores, i.e. this run used 3 folds.
# NOTE(review): the default fold count depends on the installed
# scikit-learn version (newer releases default to 5) -- confirm.
knn_clf = KNeighborsClassifier()
cross_val_score(knn_clf, X_train, y_train)
|
array([ 0.98895028, 0.97777778, 0.96629213])
1 2 3 4 5 6 7 8 9 10 11 12
# Same search over n_neighbors (2..10) and p (1..5) as before, but each
# candidate is ranked by its mean cross-validation score on the training
# data, so the test set plays no part in choosing the hyperparameters.
best_k, best_p, best_score = 0, 0, 0
for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=k, p=p)
        scores = cross_val_score(knn_clf, X_train, y_train)
        score = np.mean(scores)
        # Strict ">" keeps the first combination seen when scores tie.
        if score > best_score:
            best_k, best_p, best_score = k, p, score

print("Best K =", best_k)
print("Best P =", best_p)
print("Best Score =", best_score)
|
Best K = 2
Best P = 2
Best Score = 0.982359987401
1 2 3
# Refit on the full training set with the hyperparameters chosen by
# cross-validation (k=2, p=2), then score once on the held-out test set.
best_knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=2, p=2)
best_knn_clf.fit(X_train, y_train)
best_knn_clf.score(X_test, y_test)
|
0.98052851182197498
回顾网格搜索
1 2 3 4 5 6 7 8 9 10 11 12
from sklearn.model_selection import GridSearchCV

# The same search space as the manual loops: 9 values of n_neighbors
# times 5 values of p, with distance weighting (45 candidates).
param_grid = [
    {
        'weights': ['distance'],
        'n_neighbors': list(range(2, 11)),
        'p': list(range(1, 6)),
    }
]

# knn_clf is only used as the estimator template; GridSearchCV applies
# each parameter combination from param_grid and cross-validates it on
# the training data.
grid_search = GridSearchCV(knn_clf, param_grid, verbose=1)
grid_search.fit(X_train, y_train)
|
Fitting 3 folds for each of 45 candidates, totalling 135 fits
[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 1.9min finished
GridSearchCV(cv=None, error_score='raise',
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=10, p=5,
weights='distance'),
fit_params={}, iid=True, n_jobs=1,
param_grid=[{'weights': ['distance'], 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring=None, verbose=1)
1
| grid_search.best_score_
|
0.98237476808905377
1
# Parameter combination that achieved the best cross-validation score.
grid_search.best_params_
|
{'n_neighbors': 2, 'p': 2, 'weights': 'distance'}
1 2
# Take the estimator configured with the best parameters found by the
# grid search and evaluate it on the held-out test set.
best_knn_clf = grid_search.best_estimator_
best_knn_clf.score(X_test, y_test)
|
0.98052851182197498
cv参数
1
# cv=5 sets the number of folds explicitly; the result holds one score
# per fold.
cross_val_score(knn_clf, X_train, y_train, cv=5)
|
array([ 0.99543379, 0.96803653, 0.98148148, 0.96261682, 0.97619048])
1
# GridSearchCV accepts the same cv argument; this only constructs the
# search object -- nothing is fitted until grid_search.fit(...) is called.
grid_search = GridSearchCV(knn_clf, param_grid, verbose=1, cv=5)
|