Do a tutorial - OpenAI's gym

Ulf Hamster 3 min.
python gym reinforcement learning

Load Packages

#%%capture
#!pip install git+https://github.com/openai/gym
#!pip install gym[atari]
import gym
import numpy as np
import random

np.set_printoptions(precision=4, suppress=True)

Simulate the environment

env = gym.make("CartPole-v0")
#env = gym.make("MountainCar-v0")
#env = gym.make("MsPacman-v0")
print(env)

# reproducibility (these seeds alone do NOT make the rollout reproducible; see the sketch after the output below)
#env.seed(23)
#random.seed(42)
#np.random.seed(99)

# Reset environment
obs = env.reset()
print(obs)  # The 1st observation

# Simulate the environment
for t in range(100):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    print(action, obs, reward, done)
    if done:
        print(f"This episode finished after {t+1} timesteps")
        break
env.close()
<TimeLimit<CartPoleEnv<CartPole-v0>>>
[-0.0326  0.0122 -0.0492 -0.0427]
0 [-0.0324 -0.1822 -0.0501  0.234 ] 1.0 False
1 [-0.036   0.0136 -0.0454 -0.074 ] 1.0 False
1 [-0.0357  0.2094 -0.0469 -0.3807] 1.0 False
1 [-0.0315  0.4051 -0.0545 -0.6877] 1.0 False
1 [-0.0234  0.6009 -0.0682 -0.9971] 1.0 False
0 [-0.0114  0.4068 -0.0882 -0.7266] 1.0 False
0 [-0.0033  0.213  -0.1027 -0.4629] 1.0 False
0 [ 0.001   0.0195 -0.112  -0.2042] 1.0 False
0 [ 0.0014 -0.1739 -0.116   0.0511] 1.0 False
1 [-0.0021  0.0227 -0.115  -0.2758] 1.0 False
1 [-0.0017  0.2192 -0.1205 -0.6024] 1.0 False
0 [ 0.0027  0.026  -0.1326 -0.35  ] 1.0 False
0 [ 0.0032 -0.167  -0.1396 -0.1019] 1.0 False
0 [-0.0001 -0.3599 -0.1416  0.1437] 1.0 False
1 [-0.0073 -0.1631 -0.1387 -0.1901] 1.0 False
0 [-0.0106 -0.356  -0.1425  0.0558] 1.0 False
0 [-0.0177 -0.5488 -0.1414  0.3003] 1.0 False
0 [-0.0287 -0.7416 -0.1354  0.5453] 1.0 False
0 [-0.0435 -0.9346 -0.1245  0.7924] 1.0 False
0 [-0.0622 -1.1278 -0.1087  1.0435] 1.0 False
1 [-0.0847 -0.9314 -0.0878  0.7188] 1.0 False
1 [-0.1034 -0.7352 -0.0734  0.3998] 1.0 False
0 [-0.1181 -0.9292 -0.0654  0.6685] 1.0 False
1 [-0.1366 -0.7333 -0.0521  0.3559] 1.0 False
0 [-0.1513 -0.9276 -0.0449  0.6317] 1.0 False
1 [-0.1699 -0.7319 -0.0323  0.3252] 1.0 False
0 [-0.1845 -0.9265 -0.0258  0.6076] 1.0 False
1 [-0.203  -0.7311 -0.0136  0.3069] 1.0 False
0 [-0.2177 -0.926  -0.0075  0.5952] 1.0 False
0 [-0.2362 -1.121   0.0044  0.8855] 1.0 False
0 [-0.2586 -1.3162  0.0221  1.1796] 1.0 False
0 [-0.2849 -1.5116  0.0457  1.4791] 1.0 False
1 [-0.3152 -1.317   0.0753  1.2011] 1.0 False
0 [-0.3415 -1.5131  0.0993  1.5163] 1.0 False
0 [-0.3718 -1.7092  0.1296  1.8383] 1.0 False
1 [-0.4059 -1.5158  0.1664  1.5885] 1.0 False
1 [-0.4363 -1.323   0.1982  1.352 ] 1.0 False
0 [-0.4627 -1.5199  0.2252  1.6996] 1.0 True
This episode finished after 38 timesteps
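
The commented-out seeding above does not make the rollout reproducible on its own: at least in older gym versions, env.action_space.sample() draws from the action space's own random number generator, which env.seed() does not touch. Below is a minimal sketch, assuming the same 4-tuple step API as above and a gym version that supports env.action_space.seed():

env = gym.make("CartPole-v0")
env.seed(23)               # seeds the environment dynamics (reset/step)
env.action_space.seed(23)  # seeds action_space.sample() separately

for episode in range(3):
    obs = env.reset()
    total_reward = 0.0
    for t in range(200):
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    print(f"episode {episode}: return {total_reward} after {t+1} timesteps")
env.close()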

Action Space and Observation Space

Which actions can the agent take? What do the observations that the agent perceives look like?

In CartPole-v0 an agent decides between 2 actions (0 or 1), and each observation is a vector of 4 numbers.

env = gym.make("CartPole-v0")
print(env.action_space)
print(env.observation_space)
Discrete(2)
Box(4,)
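
A Box space also carries its numeric bounds. The following sketch assumes the standard gym.spaces.Box attributes low, high and shape (the exact bound values depend on the gym version):

env = gym.make("CartPole-v0")
print(env.observation_space.shape)  # (4,)
print(env.observation_space.low)    # lower bound per observation dimension
print(env.observation_space.high)   # upper bound per observation dimension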

In MountainCar-v0 an agent decides between 3 actions, and each observation is a vector of 2 numbers.

env = gym.make("MountainCar-v0")
print(env.action_space)
print(env.observation_space)
Discrete(3)
Box(2,)

In MsPacman-v0 an agent decides between 9 actions, and each observation is a 210x160 image with 3 color channels.

env = gym.make("MsPacman-v0")
print(env.action_space)
print(env.observation_space)
Discrete(9)
Box(210, 160, 3)
# check that MsPacman-v0 indeed has 9 discrete actions
assert env.action_space.n == 9
# a sampled action is always contained in the action space
x = env.action_space.sample()
assert env.action_space.contains(x)
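
For Atari environments the discrete action indices can be mapped to human-readable names. This sketch assumes the get_action_meanings() helper exposed by gym's Atari environments (reached via env.unwrapped, since the wrapper layers differ between gym versions):

env = gym.make("MsPacman-v0")
print(env.unwrapped.get_action_meanings())  # e.g. ['NOOP', 'UP', 'RIGHT', ...]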

All available environments

from gym import envs
print(list(envs.registry.all())[:10])
[EnvSpec(Copy-v0), EnvSpec(RepeatCopy-v0), EnvSpec(ReversedAddition-v0), EnvSpec(ReversedAddition3-v0), EnvSpec(DuplicatedInput-v0), EnvSpec(Reverse-v0), EnvSpec(CartPole-v0), EnvSpec(CartPole-v1), EnvSpec(MountainCar-v0), EnvSpec(MountainCarContinuous-v0)]
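
The registry can also be filtered by name. A minimal sketch, assuming each EnvSpec exposes its name via the id attribute:

from gym import envs

cartpole_ids = [spec.id for spec in envs.registry.all() if "CartPole" in spec.id]
print(cartpole_ids)  # e.g. ['CartPole-v0', 'CartPole-v1']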

Links