**21st IEEE Latin American Robotics Symposium**

**Tutorial** - Practical Introduction to Reinforcement Learning with Gym in Python

[Dr. Miguel A. Solis](https://www.miguelsolis.info)

November 12, 2024

In [None]:
!pip install gymnasium

import random
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

from gym.wrappers.monitoring import video_recorder
from IPython.display import HTML
from IPython import display
import glob
import io
import base64

In [None]:
def show_video(env_name):
    """Embed the recorded ``<env_name>.mp4`` in the notebook output.

    The file is read from the current working directory, base64-encoded and
    rendered inline as an autoplaying, looping HTML ``<video>`` element.

    Args:
        env_name: Base name of the video file, without the ``.mp4`` extension.

    Returns:
        None. Prints "video could not be found" (instead of raising) when the
        requested file does not exist.
    """
    mp4 = '{}.mp4'.format(env_name)
    try:
        # Read-only binary mode is sufficient; the context manager guarantees
        # the handle is closed even if reading fails.
        with io.open(mp4, 'rb') as video_file:
            video = video_file.read()
    except FileNotFoundError:
        # Test for the requested file itself: the presence of other *.mp4
        # files in the directory says nothing about this one.
        print("video could not be found")
        return

    encoded = base64.b64encode(video)
    display.display(HTML(data='''<video alt="test" autoplay loop controls style="height: 400px;"><source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

In [None]:
# Smoke test: record one short episode driven by uniformly random actions.

env_name = "Taxi-v3"
env = gym.make(env_name, render_mode="rgb_array")
vid = video_recorder.VideoRecorder(env, path="{}.mp4".format(env_name))

# reset() returns an (observation, info) pair; unpack it so `state` holds the
# observation rather than the whole tuple.
state, info = env.reset()
done = False
try:
    for t in range(100):
        vid.capture_frame()
        action = env.action_space.sample()
        # step() returns (observation, reward, terminated, truncated, info);
        # the episode is over when either flag is set.
        next_state, reward, terminated, truncated, _ = env.step(action)
        state = next_state
        done = terminated or truncated
        if done:
            break
finally:
    # Always finalize the video file and release the environment, even if a
    # step raised part-way through the rollout.
    vid.close()
    env.close()

show_video(env_name)

In [None]:
# Environment initialization
env = gym.make("Taxi-v3", render_mode="rgb_array")
env.reset()

# The environment was created with render_mode="rgb_array", so render() hands
# back a frame rather than opening a window. Draw that frame instead of
# leaving the raw array repr as the cell output.
plt.imshow(env.render())
plt.axis("off")
plt.show()

In [None]:
# Inspect the action and observation (state) spaces exposed by the environment
space_report = [
    ("Actions space: {}", env.action_space),
    ("States space: {}", env.observation_space),
]
for template, space in space_report:
    print(template.format(space))

In [None]:
# Hyperparameters

# Training schedule
training_episodes = 1000  # number of episodes run by the training loop
max_steps = 100           # upper bound on steps taken within one episode

# Q-learning update coefficients
alpha = 0.8  # step size applied to the temporal-difference error
gamma = 0.9  # discount applied to the best next-state value

# Epsilon-greedy exploration: epsilon starts at 0.99 and is recomputed after
# every episode from max_epsilon, min_epsilon and decay.
max_epsilon = 1
min_epsilon = 0.01
decay = 0.01
epsilon = 0.99

# Q initialization: one row per discrete state, one column per action, all zeros
n_states, n_actions = env.observation_space.n, env.action_space.n
Q = np.zeros((n_states, n_actions))

In [None]:
# Q-learning training
#
# Runs `training_episodes` episodes of at most `max_steps` steps each, picking
# actions epsilon-greedily from Q and applying the one-step Q-learning update
# after every transition. Records the per-episode return and epsilon value.

training_rewards = []
epsilons = []

for episode in range(training_episodes):
    print("====== EPISODE {} ======".format(episode))
    # The observation returned by reset() must become `state`; otherwise the
    # loop below would index Q with a stale value left over from earlier code
    # instead of the state the environment is actually in.
    state, info = env.reset()
    total_training_rewards = 0

    for step in range(max_steps):
        # No env.render() here: its return value was discarded, so rendering a
        # frame on every training step was pure overhead.

        # Epsilon-greedy: explore with probability epsilon, otherwise exploit.
        p = random.uniform(0, 1)
        if p < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])

        new_state, reward, end, truncated, info = env.step(action)

        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[new_state, :]) - Q[state, action])
        total_training_rewards += reward
        state = new_state

        # Stop on termination or on truncation by the environment.
        if end or truncated:
            print ("Episode {} total reward: {}".format(episode, total_training_rewards))
            break

    # Exponentially decay exploration from max_epsilon towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay*episode)

    training_rewards.append(total_training_rewards)
    epsilons.append(epsilon)

# Mean return per episode over the whole training run.
print ("Training rewards through time: " + str(sum(training_rewards)/training_episodes))

In [None]:
# Per-episode total reward collected during training
x = range(training_episodes)

fig, ax = plt.subplots()
ax.plot(x, training_rewards)
ax.set_xlabel('Episode')
ax.set_ylabel('Reward')
ax.set_title('Total training reward per episode')
ax.grid(True)
plt.show()



In [None]:
# Epsilon evolution for the epsilon-greedy policy: one recorded value per
# training episode, showing the shift from exploration to exploitation.

fig, ax = plt.subplots()
ax.plot(epsilons)
ax.set_xlabel('Episode')
ax.set_ylabel('Epsilon')
# Title spelling corrected ("explotation" -> "exploitation").
ax.set_title("Epsilon evolution (exploration/exploitation)")
ax.grid(True)
plt.show()

In [None]:
# Applying the trained agent
#
# Runs one greedy episode (always the argmax action of the learned Q-table),
# records it to "Taxi-v3.mp4" and then embeds the video in the notebook.

vid = video_recorder.VideoRecorder(env, path="{}.mp4".format("Taxi-v3"))
state, info = env.reset()

total_rewards = 0

try:
    for step in range(max_steps):
        # The bare env.render() call was dropped: its result was unused.
        vid.capture_frame()
        action = np.argmax(Q[state, :])
        new_state, reward, end, truncated, info = env.step(action)
        total_rewards += reward

        # Test the flags returned by step(). The previous check used a
        # separate variable that was never updated, so the loop kept stepping
        # after the episode had already finished.
        if end or truncated:
            print('Obtained return after {} steps: {}'.format(step + 1, total_rewards))
            break
        state = new_state
finally:
    # Always finalize the video file and release the environment.
    vid.close()
    env.close()
print("Total reward for episode:", total_rewards)

show_video('Taxi-v3')