Do a tutorial - Effect of Multiple Objects

Ulf Hamster 4 min.
python image classification alexnet pytorch pretrained model baseline model

Load Packages

Boilerplate for loading pretrained models from torchvision.

%%capture 
!pip install torchvision==0.4.2
# load packages
import torch
import numpy as np
import torchvision as tv

# check version
print(f"torch version: {torch.__version__}")

# set GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"device type: {device}")

# reproducibility
np.random.seed(42)  # numpy seed
torch.manual_seed(42)  # pytorch seed
if torch.backends.cudnn.enabled:  # CuDNN deterministic mode
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# show the function wrappers that will return pretrained CV models
#print([m for m in dir(tv.models) if (m[0] != '_' and m[0].islower())])
torch version: 1.3.1+cu100
device type: cuda:0
# image processing
from PIL import Image  # Pillow
import requests
from io import BytesIO

# text processing
import json

# visualization
import matplotlib.pyplot as plt
%matplotlib inline

Approach

We will use the pretrained model directly, i.e. no feature engineering, no training.

  1. data collection
  2. data understanding (EDA)
  3. data preparation (Feature Engg.)
  4. modeling
  5. numerical optimization (training)
  6. model evaluation
  7. inference (prediction)

In other words: load the fitted model and predict something.
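In code, the whole shortcut boils down to a few lines. A condensed sketch of what follows (here `batch` stands for the preprocessed image tensor we build in the data preparation step):

model = tv.models.alexnet(pretrained=True)  # step 4: load the fitted model
model.eval()  # switch off dropout for inference
output = model(batch)  # step 7: predict
probs = torch.nn.functional.softmax(output, dim=1)  # scores -> probabilities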

Data Preparation

# image transformation pipeline
from torchvision.transforms import (Compose, Resize, CenterCrop, ToTensor, Normalize)

trans = Compose([
    Resize(256),  # scale the shorter side to 256 pixels
    CenterCrop(224),  # crop the central 224x224 pixels
    ToTensor(),  # convert photo to tensor data type
    Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225])    
])
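To see what the pipeline produces, we can push a dummy image through it (a quick sanity check; the gray test image is purely illustrative):

# sanity check: any RGB image comes out as a 3x224x224 float tensor
dummy = Image.new("RGB", (640, 480), color=(128, 128, 128))
x = trans(dummy)
print(x.shape)  # torch.Size([3, 224, 224])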
# load class labels for ImageNet
url = "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json"
response = requests.get(url)
label_dict = json.loads(response.content)
# "label_dict['0']" will returen "['n01440764', 'tench']"
labels = [label_dict[key][1] for key in label_dict]
print(labels[:10])
['tench', 'goldfish', 'great_white_shark', 'tiger_shark', 'hammerhead', 'electric_ray', 'stingray', 'cock', 'hen', 'ostrich']
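Because the JSON keys are the string indices '0' to '999' in order, the list comprehension above preserves the model's output order, i.e. output index i corresponds to labels[i]:

# sanity check: output index i maps to labels[i]
print(label_dict['1'])  # ['n01443537', 'goldfish']
print(labels[1])  # goldfish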

Modeling

The modeling part is just about loading the fitted model.

model = tv.models.alexnet(pretrained=True, progress=True)
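The model stays on the CPU in this notebook. If you want to use the GPU selected earlier, both the model and the data have to be moved over (optional; everything below runs fine on the CPU):

# optional: run inference on the GPU selected above
# model = model.to(device)
# batch = batch.to(device)  # after the batch is built below
# note: GPU tensors must come back via .cpu() before calling .numpy()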

Load Test Data (for Inference)

Load the test image

url = "https://upload.wikimedia.org/wikipedia/commons/b/bc/Manfred_kielnhofer_contemporary_art_design_paper_tube_chair.jpg"
response = requests.get(url)  # download image
img1 = Image.open(BytesIO(response.content), "r")  # read image
# eyeballing the image reveals two objects
print(img1.size)
img1
(680, 453)

(image: the full photo, showing the two chairs)
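For flaky connections the download can be made a bit more defensive (an optional variant; the timeout value is arbitrary):

# optional: fail loudly on network/HTTP errors and force 3 channels
response = requests.get(url, timeout=10)
response.raise_for_status()
img1 = Image.open(BytesIO(response.content)).convert("RGB")  # Normalize expects RGB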

Crop the 2 Objects as Separate Images

img2 = img1.crop((0, 120, 380, 450))
print(img2.size)
img2
(380, 330)

(image: crop of the left chair)

img3 = img1.crop((355, 170, 635, 450))
print(img3.size)
img3
(280, 280)

(image: crop of the right chair)
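PIL's crop expects a (left, upper, right, lower) box in pixel coordinates, so the output size follows directly from the box (checking the numbers for img3):

# crop box semantics: (left, upper, right, lower)
left, upper, right, lower = (355, 170, 635, 450)
print((right - left, lower - upper))  # (280, 280), matches img3.size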

Apply transformations

img1 = trans(img1)
img2 = trans(img2)
img3 = trans(img3)
# visually check the transformed images;
# pixels are min-max rescaled for display purposes
for im in [img1, img2, img3]:
    lo = im.min()
    up = im.max()
    tmp = (im - lo) / (up - lo)
    tmp = tmp.permute(1, 2, 0).numpy()  # CxHxW -> HxWxC for imshow
    plt.imshow(tmp)
    plt.show()

(images: the three transformed 224x224 inputs, min-max rescaled for display)

Combine to Test Set

batch = torch.stack([img1, img2, img3])
print(batch.size())
torch.Size([3, 3, 224, 224])
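torch.stack adds the batch dimension in front. For a single image, the same effect comes from unsqueeze (equivalent one-image variant):

# single-image alternative: add a batch dimension of size 1
single = img1.unsqueeze(0)
print(single.size())  # torch.Size([1, 3, 224, 224])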

Inference

# switch to evaluation mode (disables the dropout layers)
model.eval()
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=4096, out_features=1000, bias=True)
  )
)
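eval() matters here because AlexNet contains Dropout layers: in training mode the same input gives a different output on every forward pass (a quick demonstration; it switches back to eval mode at the end):

# dropout makes train-mode outputs stochastic
model.train()
out_a = model(batch)
out_b = model(batch)
print(torch.allclose(out_a, out_b))  # very likely False
model.eval()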
# run inference (predict)
output = model(batch)
# The "output" is a (3,1000) tensor. 1000 outputs for 3 examples
# The 1000 outputs belong to the 1000 labels of the ImageNet dataset
print(output.shape, output.min(), output.max())
torch.Size([3, 1000]) tensor(-9.2660, grad_fn=<MinBackward1>) tensor(13.1458, grad_fn=<MaxBackward1>)
# apply the softmax function to the raw scores
# to turn them into a probability for each label
pct = torch.nn.functional.softmax(output, dim=1)
pct.shape
torch.Size([3, 1000])
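Note the grad_fn entries in the output above: the forward pass recorded gradients we never need. Wrapping inference in torch.no_grad() avoids that bookkeeping (an equivalent, slightly leaner variant):

# same prediction without autograd bookkeeping
with torch.no_grad():
    pct = torch.nn.functional.softmax(model(batch), dim=1)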

Results

import pandas as pd
df = pd.DataFrame(index=labels, 
                  columns=['img1', 'img2', 'img3'], 
                  data=pct.detach().numpy().T*100)
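Instead of going through pandas, the top-5 classes can also be read off directly with torch.topk (an equivalent readout):

# pure-torch alternative to the pandas tables below
top = torch.topk(pct, k=5, dim=1)
for i, (vals, idxs) in enumerate(zip(top.values, top.indices)):
    top5 = [(labels[j], f"{v.item() * 100:.2f}%") for j, v in zip(idxs.tolist(), vals)]
    print(f"img{i + 1}:", top5)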

The results for the full image are poor (see the previous article).

df.sort_values(by='img1', ascending=False).head(5)
(table: top-5 class probabilities for img1)

The results for the left chair are not convincing either: AlexNet mistakes the chair for a keyboard.

df.sort_values(by='img2', ascending=False).head(5)
(table: top-5 class probabilities for img2)

The right chair is identified as furniture; I would count that as a partial success.

df.sort_values(by='img3', ascending=False).head(5)
(table: top-5 class probabilities for img3)

Links