Do a tutorial - Train the last layer

Ulf Hamster 8 min.
python image classification alexnet pytorch pretrained model transfer learning feature extractor

Load Packages

Boilerplate for loading pretrained models from torchvision.

%%capture 
!pip install torchvision==0.4.2
# load packages
import torch
import numpy as np
import torchvision as tv

# check version
print(f"torch version: {torch.__version__}")

# set GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"device type: {device}")

# reproducibility
np.random.seed(42)  # numpy seed
torch.manual_seed(42)  # pytorch seed
if torch.backends.cudnn.enabled:  # CuDNN deterministic mode
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
torch version: 1.3.1
device type: cuda:0
# image processing
import PIL  # Pillow
import requests
from io import BytesIO

# text processing
import json

# visualization
import matplotlib.pyplot as plt
%matplotlib inline

Approach

Only the final layer of the pretrained AlexNet is retrained; all other weights stay frozen. The workflow follows the usual steps:

  1. data collection
  2. data understanding (EDA)
  3. data preparation (Feature Engineering)
  4. modeling
  5. numerical optimization (training)
  6. model evaluation
  7. inference (prediction)

Data Collection

The hymenoptera toy dataset from PyTorch's tutorials is used. It is tiny, so it can be downloaded and unzipped directly from the CLI.

# specify files and folders (python)
ZIP_FILE = "hymenoptera_data.zip"
URL_PATH = f"https://download.pytorch.org/tutorial/{ZIP_FILE}"
DATA_DIR = "tmp/"

# Create folders, download and extract ZIP 
#!rm -rf {DATA_DIR}
!mkdir -p {DATA_DIR}  # create download directory
!wget -q -nc {URL_PATH}   # download ZIP
!unzip -q -n {ZIP_FILE} -d {DATA_DIR}
!rm -f {ZIP_FILE}
# show number of images in each folder
import os
for path, dirs, files in os.walk(DATA_DIR):
    if len(files)>0:
        print("{:>4d} {:<s}".format(len(files), path))
  70 tmp/hymenoptera_data/val/ants
  83 tmp/hymenoptera_data/val/bees
 124 tmp/hymenoptera_data/train/ants
 121 tmp/hymenoptera_data/train/bees

Data Understanding

# get the filename of the first image
!ls tmp/hymenoptera_data/train/bees | head -1
1092977343_cb42b38d62.jpg
im = PIL.Image.open("tmp/hymenoptera_data/train/bees/1092977343_cb42b38d62.jpg", "r")
print(im.size)
im
(500, 333)

[image output: the sample bee photo is displayed]

The image is 500x333 pixels. Scaling the shorter side to 256 and cropping to the 224x224 network input leaves enough headroom for cropping, rotating and other data augmentation (a quick sanity check follows in the next section).

Data Preparation

Image Transformations

# image transformation pipeline for test set
from torchvision.transforms import (
    Compose, Resize, CenterCrop, ToTensor, Normalize)

trans_infer = Compose([
    Resize(256),  # scale the shorter side to 256 pixels (aspect ratio is kept)
    CenterCrop(224),  # crop the central 224x224 pixels
    ToTensor(),  # convert photo to tensor data type
    Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225])    
])
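
A quick sanity check (a small sketch, reusing the sample image im from above): Resize(256) scales the shorter side to 256 pixels while keeping the aspect ratio, so the 500x333 photo becomes 384x256 before the 224x224 center crop.

# sanity check (sketch): run the sample image through the inference pipeline
x = trans_infer(im)
print(x.shape)  # torch.Size([3, 224, 224])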
# image transformation pipeline for training set
from torchvision.transforms import (
    Compose, RandomRotation, RandomPerspective, RandomResizedCrop,
    RandomHorizontalFlip, RandomVerticalFlip, ColorJitter, RandomGrayscale,
    ToTensor, Normalize)

trans_train = Compose([
    # Before Crop (Black Pixels)
    RandomRotation(degrees=(-12, 12), expand=True, resample=PIL.Image.BILINEAR),
    RandomPerspective(p=0.6, distortion_scale=0.1, interpolation=PIL.Image.BICUBIC),
    # Crop (incl. translate, scaling)
    RandomResizedCrop((224, 224), scale=(.2, .9), ratio=(3/4, 4/3)),
    # Other edits
    RandomHorizontalFlip(p=0.5),
    RandomVerticalFlip(p=0.5),
    ColorJitter(brightness=.025, contrast=.075, hue=0.010, saturation=0.5),
    RandomGrayscale(p=.025),
    # pixels to normalized values
    ToTensor(),  # convert photo to tensor data type
    Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225])    
])
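
To eyeball what the augmentation actually does, we can apply trans_train to the sample image and undo the normalization for plotting (a small sketch, not part of the pipeline itself):

# visualize one (random) augmented sample by inverting Normalize
img_t = trans_train(im)  # tensor of shape (3, 224, 224)
mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
plt.imshow((img_t * std + mean).clamp(0, 1).permute(1, 2, 0).numpy())
plt.axis("off")
plt.show()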

Data Loader

from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# Training set
dataset_train = ImageFolder(
    "tmp/hymenoptera_data/train/",
    transform=trans_train)

loader_train = DataLoader(
    dataset_train, batch_size=61,  # Usually 256 is good but we only have 245 images
    shuffle=True, num_workers=2)

# Validation set (a shortcut, not best practice: the raw,
# un-augmented training images are reused as validation set)
dataset_valid = ImageFolder(
    "tmp/hymenoptera_data/train/",
    transform=trans_infer)

loader_valid = DataLoader(
    dataset_valid, batch_size=4,
    shuffle=False, num_workers=2)
    shuffle=False, num_workers=2)

# Test set
dataset_infer = ImageFolder(
    "tmp/hymenoptera_data/val/",
    transform=trans_infer)

loader_infer = DataLoader(
    dataset_infer, batch_size=4, 
    shuffle=False, num_workers=2)
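
Peeking at one batch confirms the tensor shapes the model will see (a quick check, assuming the loaders above):

# fetch a single training batch and inspect it
inputs, labels = next(iter(loader_train))
print(inputs.shape)  # torch.Size([61, 3, 224, 224])
print(labels[:8])    # e.g. tensor([0, 1, 1, 0, ...])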

Class Labels

print(dataset_train.classes)
print(dataset_infer.classes)
['ants', 'bees']
['ants', 'bees']
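
ImageFolder assigns the integer labels in alphabetical order of the folder names; the mapping is stored in class_to_idx:

print(dataset_train.class_to_idx)  # {'ants': 0, 'bees': 1}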

Modeling

AlexNet as Feature Extractor

# load pretrained model
model = tv.models.alexnet(pretrained=True, progress=True)
# freeze all model weights
for param in model.parameters():
    param.requires_grad = False
# Inspect the architecture: which module is the final layer?
model
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=4096, out_features=1000, bias=True)
  )
)
# How many input neurons does the final layer of AlexNet have?
n_inputs_final = model.classifier[6].in_features
print(n_inputs_final)
4096
# Create a new final layer with 4096 input neurons and 2 output neurons (ants, bees)
# and overwrite the last layer
model.classifier[6] = torch.nn.Linear(n_inputs_final, 2, bias=True)
# check that requires_grad=True holds only for the new final layer
# (its weight and bias are the last two entries)
for param in model.parameters():
    print(param.requires_grad)
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
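
Only the weight and bias of the new layer are trainable. Counting the parameters (a small sketch) makes the effect of the freezing explicit:

# count trainable vs. total parameters
n_train = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print(f"{n_train} of {n_total} parameters are trainable")
# the new head has 4096*2 weights + 2 biases = 8194 trainable parameters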
# move model to device
model.to(device)
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=4096, out_features=2, bias=True)
  )
)

Loss Function

criterion = torch.nn.CrossEntropyLoss()
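
CrossEntropyLoss expects raw logits and combines LogSoftmax with the negative log-likelihood. A minimal example with a batch of two (the values are made up):

logits = torch.tensor([[2.0, 0.5], [0.1, 1.5]])  # fake model outputs
targets = torch.tensor([0, 1])  # true labels: 0=ants, 1=bees
print(criterion(logits, targets))  # mean loss over the batch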

Optimizer

# Adam is a viable alternative:
#optimizer = torch.optim.Adam(
#    model.classifier[6].parameters(), 
#    lr=1e-3, weight_decay=1e-5)

# only the parameters of the new final layer are optimized
optimizer = torch.optim.SGD(
    model.classifier[6].parameters(),
    lr=1e-3, weight_decay=1e-6)

LR Scheduler

#scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
#    optimizer, len(loader_train), 
#    eta_min=0, last_epoch=-1)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', patience=3, 
    threshold=0.0001, threshold_mode='rel',
    cooldown=0, min_lr=0, eps=1e-08)
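
ReduceLROnPlateau multiplies the learning rate by factor (default 0.1) once the monitored metric has not improved for more than patience epochs. A toy demonstration (separate from the actual training below):

# toy demo: feed a stagnating metric and watch the LR drop
demo_opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=1e-3)
demo_sched = torch.optim.lr_scheduler.ReduceLROnPlateau(
    demo_opt, mode='min', patience=3)
for step in range(6):
    demo_sched.step(1.0)  # the "validation loss" never improves
    print(step, demo_opt.param_groups[0]['lr'])  # drops to 1e-4 eventually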

Numerical Optimization

%%time
verbose = 1
n_epoch = 25

model.train()  # training mode
validation_loss = None
for e in range(n_epoch):
    # Scheduler: step with the previous epoch's validation loss
    #scheduler.step()  # per-epoch schedulers, e.g. CosineAnnealingLR
    if validation_loss:
        scheduler.step(validation_loss)  # ReduceLROnPlateau expects the metric
    
    # Training
    training_loss = 0.0
    for i, batch in enumerate(loader_train, 0):
        # read data
        inputs, labels = batch[0].to(device), batch[1].to(device)
        # run optimization
        optimizer.zero_grad()  # reset gradient values
        outputs = model(inputs)  # forward pass
        loss = criterion(outputs, labels)  # loss
        loss.backward()  # Backprop
        optimizer.step()  # update weights
        #scheduler.step()  # update LR at batch
        # information
        training_loss += loss.item() * inputs.size(0)
    training_loss = training_loss / len(dataset_train)

    # Validation loss (no gradients needed; strictly,
    # model.eval() should also be used here to disable dropout)
    validation_loss = 0.0
    with torch.no_grad():
        for i, batch in enumerate(loader_valid, 0):
            inputs, labels = batch[0].to(device), batch[1].to(device)  # read images
            outputs = model(inputs)  # forward pass only
            loss = criterion(outputs, labels)  # loss
            validation_loss += loss.item() * inputs.size(0)
    validation_loss /= len(dataset_valid)

    # Display stats
    if verbose >= 1:
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {e+1}/{n_epoch} | "
              f"Train.Loss: {round(training_loss*100,2)}% | "
              f"Valid.Loss: {round(validation_loss*100,2)}% | "
              f"LR: {current_lr}")

print("Finito")
Epoch 1/25 | Train.Loss: 66.85% | Valid.Loss: 57.66% | LR: 0.001
Epoch 2/25 | Train.Loss: 53.72% | Valid.Loss: 52.27% | LR: 0.001
Epoch 3/25 | Train.Loss: 47.81% | Valid.Loss: 43.32% | LR: 0.001
Epoch 4/25 | Train.Loss: 43.78% | Valid.Loss: 42.86% | LR: 0.001
Epoch 5/25 | Train.Loss: 43.97% | Valid.Loss: 39.93% | LR: 0.001
Epoch 6/25 | Train.Loss: 38.47% | Valid.Loss: 39.43% | LR: 0.001
Epoch 7/25 | Train.Loss: 36.52% | Valid.Loss: 36.32% | LR: 0.001
Epoch 8/25 | Train.Loss: 36.04% | Valid.Loss: 33.52% | LR: 0.001
Epoch 9/25 | Train.Loss: 35.04% | Valid.Loss: 32.92% | LR: 0.001
Epoch 10/25 | Train.Loss: 32.96% | Valid.Loss: 29.66% | LR: 0.001
Epoch 11/25 | Train.Loss: 32.13% | Valid.Loss: 32.14% | LR: 0.001
Epoch 12/25 | Train.Loss: 35.11% | Valid.Loss: 29.34% | LR: 0.001
Epoch 13/25 | Train.Loss: 33.67% | Valid.Loss: 32.71% | LR: 0.001
Epoch 14/25 | Train.Loss: 31.11% | Valid.Loss: 30.53% | LR: 0.001
Epoch 15/25 | Train.Loss: 32.33% | Valid.Loss: 31.02% | LR: 0.001
Epoch 16/25 | Train.Loss: 29.23% | Valid.Loss: 30.2% | LR: 0.001
Epoch 17/25 | Train.Loss: 31.48% | Valid.Loss: 29.3% | LR: 0.0001
Epoch 18/25 | Train.Loss: 28.38% | Valid.Loss: 27.47% | LR: 0.0001
Epoch 19/25 | Train.Loss: 32.24% | Valid.Loss: 28.04% | LR: 0.0001
Epoch 20/25 | Train.Loss: 28.29% | Valid.Loss: 25.33% | LR: 0.0001
Epoch 21/25 | Train.Loss: 29.94% | Valid.Loss: 30.4% | LR: 0.0001
Epoch 22/25 | Train.Loss: 28.31% | Valid.Loss: 27.7% | LR: 0.0001
Epoch 23/25 | Train.Loss: 26.89% | Valid.Loss: 31.21% | LR: 0.0001
Epoch 24/25 | Train.Loss: 27.12% | Valid.Loss: 27.5% | LR: 0.0001
Epoch 25/25 | Train.Loss: 30.91% | Valid.Loss: 25.64% | LR: 1e-05
Finito
CPU times: user 15.8 s, sys: 10.4 s, total: 26.2 s
Wall time: 6min 56s

Inference

# enable evaluation mode
model.eval()
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=4096, out_features=2, bias=True)
  )
)
# Loss on the test set (no gradients needed)
test_loss = 0.0
with torch.no_grad():
    for i, batch in enumerate(loader_infer, 0):
        inputs, labels = batch[0].to(device), batch[1].to(device)  # read images
        outputs = model(inputs)  # infer/predict
        loss = criterion(outputs, labels)  # loss
        test_loss += loss.item() * inputs.size(0)
test_loss /= len(dataset_infer)
print(test_loss)
0.2735382152928246
# to turn the logits of the last batch into per-class probabilities,
# apply a softmax over the class dimension:
#proba = torch.nn.functional.softmax(outputs, dim=1)
#dataset_infer.classes
# adapted (and debugged) from https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#test-the-network-on-the-test-data
def print_class_accuracy(model, loader, classes, device='cpu'):
    n_classes = len(classes)
    class_correct = [0] * n_classes
    class_total = [0] * n_classes
    with torch.no_grad():
        for batch in loader:
            inputs, labels = batch[0].to(device), batch[1].to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct = (predicted == labels)
            # loop over the actual batch length so the last,
            # smaller batch is counted as well
            for i in range(labels.size(0)):
                label = labels[i].item()
                class_correct[label] += correct[i].item()
                class_total[label] += 1
    for i in range(n_classes):
        print("Accuracy of {:>5s}: {:5.2f}%".format(
            classes[i], 100 * class_correct[i] / class_total[i]))
print_class_accuracy(model, loader_infer, dataset_infer.classes, device)
Accuracy of  ants: 91.43%
Accuracy of  bees: 91.46%

Links