將資料切分
在數據科學中,訓練數據和測試數據扮演著兩個主要的角色。評估已構建模型的性能與訓練和構建模型同樣重要,因為未經評估性能的模型可能會產生錯誤的預測並導致嚴重的並發症。為了防止這種情況發生並確保預測的準確性,您必須足夠好地測試和驗證模型。
為了構建和評估機器學習模型的性能,我們通常將數據集分成兩個不同的數據集。這兩個數據集是訓練數據和測試數據。
訓練數據 | 測試數據 |
用於構建模型 | 用於評估構建的模型 |
分配更大的數據部分 | 分配較小的數據部分 |
可進一步劃分以進行驗證 | 不會進一步分割 |
什麼是驗證數據
驗證數據是從訓練數據中分離出來的子數據集,用於在訓練過程中驗證模型。來自驗證過程的信息幫助我們改變模型的參數、分類器以獲得更好的結果。所以基本上,驗證數據可以幫助我們優化模型。
使用 Scikit-learn 的 train_test_split來切割數據
使用下面這段程式碼可以將訓練及分割成訓練及驗證集
from sklearn.model_selection import train_test_split
# 載入資料集
....
# 分離訓練和測試數據
X_train, X_val, y_train, y_val = train_test_split(train_images_fold, train_labels_fold, test_size=0.1, random_state=42)
model = create_model()
keras_classifier.fit(X_train, y_train, validation_data=(X_val, y_val))
使用測試資料作驗證
下面這段程式可以使用X_test、y_test來使用model做測試,並且可以用accuracy_score來取得準確率,並將準確率存入一個陣列裡
predictions = keras_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
accuracy_scores.append(accuracy)
若是想要取得錯誤的資料集,則可以用np.where來取得與正確答案不一樣的資料,並將錯誤的資料存入incorrect陣列裡面
incorrect_indices = np.where(predictions != y_test)[0]
incorrect_images = X_test[incorrect_indices]
incorrect_labels = y_test[incorrect_indices]
incorrect_prediction = predictions[incorrect_indices]
for i in range(len(incorrect_indices)):
incorrect.append({"image": incorrect_images[i] ,"label": incorrect_labels[i], "pred": incorrect_prediction[i], "idx": fold_index})
完整範例
下面的範例為結合K-Fold概念,將資料及分成五份,並做五次的訓練以判別模型的訓練狀況是否有過擬合的狀況。其中會每次會取其4分來做訓練、1份做測試,再將訓練集中的1/10拆做驗證集。最後使用matplotlib.pyplot來顯示這五次之中,測試集中錯誤結果的圖片
import numpy as np
import matplotlib.pyplot as plt
import pathlib
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow import keras
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
img_path = 'dice3'
def class_names(path):
return np.array(sorted([item.name for item in pathlib.Path(path).glob('*') if
"LICENSE.txt" != item.name]))
def create_model():
model = tf.keras.Sequential()
model.add(tf.keras.layers.Rescaling(1. / 255))
model.add(tf.keras.layers.Conv2D(32, kernel_size=7, activation='relu'))
model.add(tf.keras.layers.MaxPooling2D())
model.add(tf.keras.layers.Conv2D(64, kernel_size=5, activation='relu'))
model.add(tf.keras.layers.MaxPooling2D())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(len(class_names(img_path))))
model.compile(
optimizer='adam',
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=['accuracy'])
return model
train_ds = tf.keras.utils.image_dataset_from_directory(
img_path,
seed=7,
batch_size=32)
train_images = []
train_labels = []
for images, labels in train_ds:
train_images.append(images.numpy())
train_labels.append(labels.numpy())
train_images = np.concatenate(train_images, axis=0)
train_labels = np.concatenate(train_labels, axis=0)
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
# 創建KerasClassifier
keras_classifier = KerasClassifier(build_fn=create_model, epochs=5, batch_size=16)
# 定義StratifiedKFold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
fold_index = 1
# 進行交叉驗證
accuracy_scores = []
incorrect = []
for train_indices, val_indices in kfold.split(train_images, train_labels_encoded):
print("fold_index="+str(fold_index))
train_images_fold = train_images[train_indices]
train_labels_fold = train_labels_encoded[train_indices]
X_test = train_images[val_indices]
y_test = train_labels_encoded[val_indices]
# 分離訓練和測試數據
X_train, X_val, y_train, y_val = train_test_split(train_images_fold, train_labels_fold, test_size=0.1, random_state=42)
model = create_model()
keras_classifier.fit(X_train, y_train, validation_data=(X_val, y_val))
predictions = keras_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
accuracy_scores.append(accuracy)
incorrect_indices = np.where(predictions != y_test)[0]
incorrect_images = X_test[incorrect_indices]
incorrect_labels = y_test[incorrect_indices]
incorrect_prediction = predictions[incorrect_indices]
for i in range(len(incorrect_indices)):
incorrect.append({"image": incorrect_images[i] ,"label": incorrect_labels[i], "pred": incorrect_prediction[i], "idx": fold_index})
# 印出準確率
print("Accuracy scores:", accuracy_scores)
print("Mean accuracy:", np.mean(accuracy_scores))
fold_index += 1
# 顯示出錯誤的答案
images_per_page = 15
num_images_per_row = 5
num_images_per_col = 3
num_pages = (len(incorrect) - 1) // images_per_page + 1
for page in range(num_pages):
start_idx = page * images_per_page
end_idx = (page + 1) * images_per_page
page_detail = incorrect[start_idx:end_idx]
fig, axes = plt.subplots(num_images_per_col, num_images_per_row, figsize=(num_images_per_col*2, num_images_per_row*2))
i=0
for data in page_detail:
image = data["image"]
label = data["label"]
pred = data["pred"]
idx = data["idx"]
row = i // num_images_per_row
col = i % num_images_per_row
ax = axes[row, col]
image = image.astype(np.uint8)
ax.imshow(image)
ax.set_title(f"{label}->{pred}({idx})")
i = i+1
ax.axis("off")
plt.tight_layout()
#plt.suptitle("fold_index:"+str(fold_index)+"-"+str(page+1), x=0, y=1, ha='left', va='top')
plt.show()