介紹#
這次的 HW 是圖像分類任務,對食物數據集 food11 進行分類,具體的任務要求如下:
需要達到的目標和 Hints 分別為:
簡單#
只需要 run 它的原始代碼即可,結果如下:
中等#
進行 Training Augmentation 並將訓練時間延長(用更大的 n_epoch)。
Training Augmentation 具體操作如下:
# Training-time augmentation pipeline.
# Every image is resized to 128x128, then exactly ONE augmentation is
# drawn from the RandomChoice list per sample, and finally the image is
# converted to a tensor (ToTensor must stay last in the pipeline).
train_tfm = transforms.Compose([
    # Resize the image into a fixed shape (height = width = 128).
    transforms.Resize((128, 128)),
    # Pick one of the following augmentations, uniformly at random.
    transforms.RandomChoice([
        transforms.RandomRotation((-30, 30)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
        transforms.ColorJitter(
            brightness=(0.5, 1.5),
            contrast=(0.5, 1.5),
            saturation=(0.5, 1.5),
            hue=(-0.25, 0.25),
        ),
        transforms.RandomInvert(p=0.5),
        transforms.RandomAffine(
            degrees=(-30, 30),
            translate=(0.1, 0.1),
            scale=(0.8, 1.2),
            shear=(-30, 30),
        ),
        transforms.Grayscale(num_output_channels=3),
    ]),
    # ToTensor() should be the last transform.
    transforms.ToTensor(),
])
具體介紹如下:
- RandomRotation ((-30,30)) 隨機旋轉圖像。(-30, 30):旋轉角度範圍(-30 度到 + 30 度之間隨機選擇)。
- RandomHorizontalFlip (p=0.5),以 50% 概率水平翻轉圖像。p=0.5:執行概率(0.5 表示 50%)。
- RandomVerticalFlip (p=0.5),以 50% 概率垂直翻轉圖像。p 含義同上。
- ColorJitter (brightness=(0.5,1.5), contrast=(0.5, 1.5), saturation=(0.5,1.5), hue=(-0.25, 0.25)),隨機調整顏色屬性(亮度、對比度、飽和度、色調)。brightness=(0.5, 1.5):亮度縮放範圍(0.5 倍~1.5 倍)。contrast=(0.5, 1.5):對比度調整範圍(0.5 倍~1.5 倍)。saturation=(0.5, 1.5):飽和度調整範圍(0.5 倍~1.5 倍)。hue=(-0.25, 0.25):色調偏移範圍(-0.25~+0.25,對應色相環的 -90 度~+90 度)。
- RandomInvert (p=0.5),以 50% 概率反色(顏色取反,如黑變白、紅變青)。p 含義同上。
- RandomAffine (degrees=(-30,30), translate=(0.1, 0.1), scale=(0.8, 1.2), shear=(-30, 30)),隨機仿射變換(旋轉、平移、縮放、剪切)。degrees=(-30, 30):旋轉角度範圍。translate=(0.1, 0.1):水平和垂直方向最大平移比例(10% 圖像尺寸)。scale=(0.8, 1.2):縮放範圍(0.8 倍~1.2 倍)。shear=(-30, 30):剪切角度範圍(-30 度~+30 度)。剪切(Shear)是一種線性幾何變換,通過傾斜圖像的一部分來模擬 “傾斜變形” 效果。
- Grayscale (num_output_channels=3),將圖像轉為灰度圖,但保留 3 通道(RGB 格式,每通道值相同)。num_output_channels=3:輸出通道數(3 表示生成 3 通道灰度圖,兼容模型輸入)。將彩色圖像(RGB)轉換為灰度圖,本質是通過加權平均合併三個通道的亮度信息,生成單通道圖像。
Training Time 延長到 90 個 epoch,結果如下:
需要注意的是,查看 GPU 利用率時發現利用率很低,問題應該出在每次讀取數據時的 transform 增強和磁碟 IO 耗時太長,導致訓練速度很慢,需要調一下 DataLoader 的並發。
Train 此處使用了 12 個並發(取決於圖像增強的複雜度),取 persistent_workers=True,避免重複創建 / 銷毀進程,減少開銷。Test 使用了 8 個並發。改善後訓練一個 epoch 的時間大大縮短,此處是在本地電腦上跑的,設備為 RTX 4070 Laptop。
同時還將 batchsize 變成了 128。
強#
首先先將模型結構進行了修改,學習了一下 resnet18 和 resnet34,自己在 resnet34 的基礎上小改了一下:
class BasicBlock(nn.Module):
    """Residual basic block in the ResNet-18/34 style.

    Two 3x3 conv + BatchNorm layers with a skip connection. When the
    spatial resolution or channel count changes (stride != 1 or
    in_channels != out_channels), a 1x1 conv + BatchNorm projects the
    identity branch to the matching shape.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        stride: stride of the first conv (2 halves the resolution).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        # bias=False on convs feeding into BatchNorm: BN's affine shift
        # makes a conv bias redundant.
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                # Fix: bias=False here as well — the original kept the
                # default bias=True, which is redundant before BatchNorm
                # and inconsistent with conv1/conv2.
                nn.Conv2d(in_channels, out_channels, 1, stride, 0, bias=False),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        """Return relu(F(x) + identity), projecting identity if needed."""
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            identity = self.downsample(identity)
        out += identity
        out = self.relu(out)
        return out
class Classifier(nn.Module):
    """ResNet-34-flavoured CNN for 11-way food classification.

    Input is a [3, 128, 128] image. A strided 5x5 stem halves the
    resolution, then 16 BasicBlocks (widths 64/128/256/512) feed a global
    average pool and a linear classification head.
    """

    def __init__(self):
        super(Classifier, self).__init__()
        # Stem: 5x5 conv, stride 2 -> feature map [64, 64, 64].
        self.conv1 = nn.Conv2d(3, 64, kernel_size=5, stride=2, padding=2, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        # Stage 1: 64 channels at 64x64.
        self.block1 = BasicBlock(64, 64, 1)
        self.block2 = BasicBlock(64, 64)
        self.block3 = BasicBlock(64, 64)
        # Stage 2: downsample to [128, 32, 32].
        self.block4 = BasicBlock(64, 128, 2)
        self.block5 = BasicBlock(128, 128)
        self.block6 = BasicBlock(128, 128)
        self.block7 = BasicBlock(128, 128)
        # Stage 3: downsample to [256, 16, 16].
        self.block8 = BasicBlock(128, 256, 2)
        self.block9 = BasicBlock(256, 256)
        self.block10 = BasicBlock(256, 256)
        self.block11 = BasicBlock(256, 256)
        self.block12 = BasicBlock(256, 256)
        self.block13 = BasicBlock(256, 256)
        # Stage 4: downsample to [512, 8, 8].
        self.block14 = BasicBlock(256, 512, 2)
        self.block15 = BasicBlock(512, 512)
        self.block16 = BasicBlock(512, 512)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, 11)

    def forward(self, x):
        """Map a batch of [3, 128, 128] images to 11 class logits."""
        out = self.relu(self.bn1(self.conv1(x)))
        # Run the 16 residual blocks in declaration order.
        for idx in range(1, 17):
            out = getattr(self, f"block{idx}")(out)
        out = self.avgpool(out)
        # Collapse [B, 512, 1, 1] to [B, 512] before the linear head.
        return self.fc(out.flatten(1))
然後進行了 Cross-Validation 和 Ensemble:
Cross-Validation 採用了五折交叉驗證,將原來的訓練集和驗證集合併後再進行的五折交叉驗證:
# "cuda" only when GPUs are available.
device = "cuda" if torch.cuda.is_available() else "cpu"
# 訓練的epoch數量和耐心值。
n_epochs = 200
patience = 50 # 如果在'patience'個epoch內沒有改進,則提前停止
import numpy as np
from sklearn.model_selection import KFold
from torch.utils.tensorboard import SummaryWriter
import datetime
# 初始化5折交叉驗證
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
# 加載完整訓練集(用於交叉驗證)
train_set = FoodDataset(os.path.join(_dataset_dir, "training"), tfm=train_tfm)
valid_set = FoodDataset(os.path.join(_dataset_dir, "validation"), tfm=train_tfm)
# 合併數據集
combined_files = train_set.files + valid_set.files
full_dataset = FoodDataset(path="", tfm=train_tfm, files=combined_files)
oof_preds = np.zeros(len(full_dataset)) # 存儲OOF預測結果
oof_labels = np.zeros(len(full_dataset)) # 存儲真實標籤
# 存儲所有基模型(用於後續集成)
base_models = []
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
log_dir = f"runs/food_classification_{timestamp}"
writer = SummaryWriter()
for fold, (train_idx, val_idx) in enumerate(kf.split(train_set)):
print(f"\n====== Fold {fold+1}/{n_folds} ======")
# 劃分訓練集和驗證子集
train_subset = Subset(train_set, train_idx)
val_subset = Subset(train_set, val_idx)
# DataLoader
train_loader = DataLoader(
train_subset,
batch_size=batch_size,
shuffle=True,
num_workers=12,
pin_memory=True,
persistent_workers=True
)
val_loader = DataLoader(
val_subset,
batch_size=batch_size,
shuffle=False,
num_workers=8,
pin_memory=True,
persistent_workers=True
)
# 每折獨立初始化模型和優化器
model = Classifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()
# 早停相關變量(每折獨立)
fold_best_acc = 0
stale = 0
# 訓練循環(保持原有邏輯)
for epoch in range(n_epochs):
# ---------- 訓練 ----------
model.train()
train_loss, train_accs = [], []
for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
imgs, labels = batch
imgs, labels = imgs.to(device), labels.to(device)
logits = model(imgs)
loss = criterion(logits, labels)
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
optimizer.step()
acc = (logits.argmax(dim=-1) == labels).float().mean()
train_loss.append(loss.item())
train_accs.append(acc.item())
# 打印訓練信息
avg_loss = np.mean(train_loss)
avg_acc = np.mean(train_accs)
# 寫入TensorBoard
writer.add_scalar(f'Fold_{fold}/Train/Loss', avg_loss, epoch)
writer.add_scalar(f'Fold_{fold}/Train/Accuracy', avg_acc, epoch)
print(f"[ 訓練 | {epoch+1:03d}/{n_epochs:03d} ] loss = {avg_loss:.5f}, acc = {avg_acc:.5f}")
# ---------- 驗證 ----------
model.eval()
val_loss, val_accs, val_preds = [], [], []
val_labels = [] # 累積所有驗證批次的標籤
for batch in tqdm(val_loader, desc="驗證中"):
imgs, labels = batch
imgs = imgs.to(device)
labels_np = labels.numpy()
val_labels.extend(labels_np) # 累積標籤
with torch.no_grad():
logits = model(imgs)
preds = logits.argmax(dim=-1).cpu().numpy()
loss = criterion(logits, labels.to(device))
val_loss.append(loss.item())
val_accs.append((preds == labels_np).mean())
val_preds.extend(preds)
# 記錄OOF預測和標籤
oof_preds[val_idx] = np.array(val_preds)
oof_labels[val_idx] = np.array(val_labels)
# 打印驗證信息
avg_val_loss = np.mean(val_loss)
avg_val_acc = np.mean(val_accs)
# 寫入TensorBoard
writer.add_scalar(f'Fold_{fold}/Val/Loss', avg_val_loss, epoch)
writer.add_scalar(f'Fold_{fold}/Val/Accuracy', avg_val_acc, epoch)
print(f"[ 驗證 | {epoch+1:03d}/{n_epochs:03d} ] loss = {avg_val_loss:.5f}, acc = {avg_val_acc:.5f}")
# 早停邏輯(每折獨立)
if avg_val_acc > fold_best_acc:
print(f"Fold {fold} best model at epoch {epoch}")
torch.save(model.state_dict(), f"fold{fold}_best.ckpt")
fold_best_acc = avg_val_acc
stale = 0
else:
stale += 1
if stale > patience:
print(f"在epoch {epoch}提前停止")
break
# 保存當前折的模型
base_models.append(model)
# 關閉TensorBoard的writer
writer.close()
# ---------- 後處理 ----------
# 計算OOF準確率
oof_acc = (oof_preds == oof_labels).mean()
print(f"\n[OOF 準確率] {oof_acc:.4f}")
保存五個基模型後在 test 部分使用 ensemble:
# Ensemble prediction (soft voting): average the raw logits of every fold
# model over the test set, then take the argmax over classes.
all_preds = []
for model in base_models:
    model.eval()
    fold_preds = []
    # Reuse the same test_loader as the single-model pipeline.
    for data, _ in test_loader:
        with torch.no_grad():
            logits = model(data.to(device))
        # Keep each model's raw logits instead of hard argmax labels.
        fold_preds.append(logits.cpu().numpy())
    # Stitch this model's per-batch outputs into one (n_samples, n_classes).
    all_preds.append(np.concatenate(fold_preds, axis=0))

# (n_models, n_samples, n_classes) -> mean over models -> class per sample.
all_preds = np.stack(all_preds)
prediction = all_preds.mean(axis=0).argmax(axis=1)  # shape: (n_samples,)
最後結果如下:
已經非常接近 boss 了
由於時間問題就沒有做 boss,以後有時間再回來補吧。
兩個 report_problem 分別是數據增強和設計殘差網絡,在 medium 和 strong 完成的過程裡有,就沒有額外再說了。