Introduce curriculum learning and add an action mask to agilerl_train
yasuohayashibara committed Nov 22, 2023
1 parent b248a9c commit d8b05a2
Showing 7 changed files with 1,673 additions and 120 deletions.
213 changes: 112 additions & 101 deletions controllers/agilerl_train/agilerl_train.py
@@ -3,13 +3,15 @@
import os
import datetime
import csv
import yaml

import numpy as np
import torch
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.utils.utils import initialPopulation
#from agilerl.utils.utils import initialPopulation
from soccer.utils import initialPopulation
from tqdm import trange

import soccer_v0
@@ -62,14 +64,12 @@
}

# Define training loop parameters
max_episodes = 10000 # Total episodes (default: 6000)
max_steps = 1000 # Maximum steps to take in each episode
epsilon = 1.0 # Starting epsilon value
eps_end = 0.1 # Final epsilon value
eps_decay = 0.995 # Epsilon decay
eps_decay = 0.9997 # Epsilon decay
evo_epochs = 20 # Evolution frequency
evo_loop = 1 # Number of evaluation episodes
checkpoint=max_episodes/10
reward_freq = 1
path = "./models/MATD3/" + datetime.datetime.now().strftime('%y%m%d')
os.makedirs(path, exist_ok=True)
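
The slower epsilon decay matters because the episode budget below grows to 10,000: with one decay step per episode, 0.995 hits the eps_end floor of 0.1 after roughly 460 episodes, while 0.9997 keeps exploring for most of the run. A rough standalone check (not part of the committed code; it only assumes epsilon is decayed once per episode, as in the loop further down):

import math

# Episodes needed for epsilon to decay from 1.0 down to eps_end = 0.1
for decay in (0.995, 0.9997):
    episodes = math.ceil(math.log(0.1) / math.log(decay))
    print(f"decay={decay}: ~{episodes} episodes")
# decay=0.995  -> ~460 episodes
# decay=0.9997 -> ~7675 episodes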
@@ -163,112 +163,123 @@
filename = "reward.csv"
file_path = os.path.join(path, filename)
agent_name = [n for n in env.agents]
title_names = ["Episode","Step","Value"]
title_names = ["Episode","Step","Epsilon","Value"]
title_names.extend(agent_name)
with open(file_path, "w") as f:
writer = csv.DictWriter(f, fieldnames=title_names)
writer.writeheader()


# Training loop
for idx_epi in trange(max_episodes):
for agent in pop: # Loop through population
state, _ = env.reset() # Reset environment at start of episode
agent_reward = {agent_id: 0 for agent_id in env.agents}
if INIT_HP["CHANNELS_LAST"]:
state = {
agent_id: np.moveaxis(np.expand_dims(s, 0), [3], [1])
for agent_id, s in state.items()
}
step_value = 0

#for _ in range(max_steps):
while True:
action = agent.getAction(state, epsilon) # Get next action from agent
next_state, reward, termination, truncation, _ = env.step(
action
) # Act in environment

# Image processing if necessary for the environment
for lesson_number in range(1, 4):
with open(f"./soccer/config/lesson{lesson_number}.yaml") as file:
LESSON=yaml.safe_load(file)

env.reset(options=LESSON)

# Define training loop parameters
max_episodes = 10000 # Total episodes (default: 6000)
checkpoint=max_episodes/10

# Training loop
for idx_epi in trange(max_episodes):
for agent in pop: # Loop through population
state, _ = env.reset() # Reset environment at start of episode
agent_reward = {agent_id: 0 for agent_id in env.agents}
if INIT_HP["CHANNELS_LAST"]:
state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
next_state = {
agent_id: np.moveaxis(ns, [2], [0])
for agent_id, ns in next_state.items()
state = {
agent_id: np.moveaxis(np.expand_dims(s, 0), [3], [1])
for agent_id, s in state.items()
}

# Save experiences to replay buffer
memory.save2memory(state, action, reward, next_state, termination)

# Collect the reward
for agent_id, r in reward.items():
agent_reward[agent_id] += r

# Learn according to learning frequency
if (memory.counter % agent.learn_step == 0) and (
len(memory) >= agent.batch_size
):
experiences = memory.sample(
agent.batch_size
) # Sample replay buffer
agent.learn(experiences) # Learn according to agent's RL algorithm

step_value += 1

# Stop episode if any agents have terminated
if any(truncation.values()) or any(termination.values()):
break

# Update the state
if INIT_HP["CHANNELS_LAST"]:
next_state = {
agent_id: np.expand_dims(ns, 0)
for agent_id, ns in next_state.items()
}
state = next_state

# Save the total episode reward
score = sum(agent_reward.values())
agent.scores.append(score)

# Update epsilon for exploration
epsilon = max(eps_end, epsilon * eps_decay)

# Now evolve population if necessary
if (idx_epi + 1) % evo_epochs == 0:
# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=max_steps,
loop=evo_loop,
step_value = 0
info = None

#for _ in range(max_steps):
while True:
action = agent.getAction(state, epsilon, action_mask=info) # Get next action from agent
next_state, reward, termination, truncation, info = env.step(
action
) # Act in environment

# Image processing if necessary for the environment
if INIT_HP["CHANNELS_LAST"]:
state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
next_state = {
agent_id: np.moveaxis(ns, [2], [0])
for agent_id, ns in next_state.items()
}

# Save experiences to replay buffer
memory.save2memory(state, action, reward, next_state, termination)

# Collect the reward
for agent_id, r in reward.items():
agent_reward[agent_id] += r

# Learn according to learning frequency
if (memory.counter % agent.learn_step == 0) and (
len(memory) >= agent.batch_size
):
experiences = memory.sample(
agent.batch_size
) # Sample replay buffer
agent.learn(experiences) # Learn according to agent's RL algorithm

step_value += 1

# Stop episode if any agents have terminated
if any(truncation.values()) or any(termination.values()):
break

# Update the state
if INIT_HP["CHANNELS_LAST"]:
next_state = {
agent_id: np.expand_dims(ns, 0)
for agent_id, ns in next_state.items()
}
state = next_state

# Save the total episode reward
score = sum(agent_reward.values())
agent.scores.append(score)

# Now evolve population if necessary
if (idx_epi + 1) % evo_epochs == 0:
# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=max_steps,
loop=evo_loop,
)
for agent in pop
]

print(f"Episode {idx_epi + 1}/{max_episodes}")
print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
print(
f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
)
for agent in pop
]

print(f"Episode {idx_epi + 1}/{max_episodes}")
print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
print(
f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
)

# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)

if (idx_epi + 1) % reward_freq == 0:
filename = "reward.csv"
file_path = os.path.join(path, filename)
with open(file_path, "a") as f:
writer = csv.DictWriter(f, fieldnames=title_names)
writer.writerow(dict(**{"Episode": idx_epi+1, "Step": step_value, "Value": score}, **agent_reward))

# Save the trained algorithm for each checkpoint
if (idx_epi + 1) % checkpoint == 0 and (idx_epi + 1) > evo_epochs:
filename = "MATD3_trained_agent_" + str(idx_epi + 1) + ".pt"
save_path = os.path.join(path, filename)
elite.saveCheckpoint(save_path)

# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)

if (idx_epi + 1) % reward_freq == 0:
filename = "reward.csv"
file_path = os.path.join(path, filename)
with open(file_path, "a") as f:
writer = csv.DictWriter(f, fieldnames=title_names)
writer.writerow(dict(**{"Episode": idx_epi+1, "Step": step_value, "Epsilon": epsilon, "Value": score}, **agent_reward))

# Update epsilon for exploration
epsilon = max(eps_end, epsilon * eps_decay)

# Save the trained algorithm for each checkpoint
if (idx_epi + 1) % checkpoint == 0 and (idx_epi + 1) > evo_epochs:
filename = "MATD3_trained_agent_" + str(idx_epi + 1) + ".pt"
save_path = os.path.join(path, filename)
elite.saveCheckpoint(save_path)

# Save the trained algorithm
filename = "MATD3_trained_agent.pt"
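
For reference, the core of the action-mask change above, stripped of the diff context. This is a minimal sketch that assumes the customised agents from soccer.utils accept an action_mask keyword and that env.step() returns the mask through its per-agent info value, as the diff shows:

info = None  # no mask is available before the first env.step()
while True:
    # info from the previous step carries each agent's action mask
    action = agent.getAction(state, epsilon, action_mask=info)
    next_state, reward, termination, truncation, info = env.step(action)
    if any(truncation.values()) or any(termination.values()):
        break
    state = next_state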
23 changes: 23 additions & 0 deletions controllers/agilerl_train/soccer/config/lesson1.yaml
@@ -0,0 +1,23 @@
---
# AgileRL Soccer Lesson 1

opponent: self
opponent_pool_size: 1 # Size of opponent pool for self-play
opponent_upgrade: 6000 # Epoch frequency to update opponent pool
#eval_opponent: strong # 'random', 'weak' or 'strong'
pretrained_path: models/DQN/lesson3_trained_agent.pt # Path to pretrained model weights
save_path: models/DQN/lesson4_trained_agent.pt # Path to save trained model
max_train_episodes: 1000 # Maximum number of training episodes in environment

## Game specific:
buffer_warm_up: false # Fill replay buffer with random experiences
warm_up_opponent: # Difficulty level of warm up experiences
agent_warm_up: 0 # Number of epochs to warm up agent by training on random experiences
rewards: # Rewards for different outcomes
score_goal: 1000
approach_ball: 0.2
dribble_or_kick: 10
off_field: -1.0
fall_down: -10
lose_point: -1000
play_continues: 0
23 changes: 23 additions & 0 deletions controllers/agilerl_train/soccer/config/lesson2.yaml
@@ -0,0 +1,23 @@
---
# AgileRL Soccer Lesson 2

opponent: self
opponent_pool_size: 1 # Size of opponent pool for self-play
opponent_upgrade: 6000 # Epoch frequency to update opponent pool
#eval_opponent: strong # 'random', 'weak' or 'strong'
pretrained_path: models/DQN/lesson3_trained_agent.pt # Path to pretrained model weights
save_path: models/DQN/lesson4_trained_agent.pt # Path to save trained model
max_train_episodes: 2000 # Maximum number of training episodes in environment

## Game specific:
buffer_warm_up: false # Fill replay buffer with random experiences
warm_up_opponent: # Difficulty level of warm up experiences
agent_warm_up: 0 # Number of epochs to warm up agent by training on random experiences
rewards: # Rewards for different outcomes
score_goal: 1000
approach_ball: 0.1
dribble_or_kick: 10
off_field: -1.0
fall_down: -10
lose_point: -1000
play_continues: 0
23 changes: 23 additions & 0 deletions controllers/agilerl_train/soccer/config/lesson3.yaml
@@ -0,0 +1,23 @@
---
# AgileRL Soccer Lesson 3

opponent: self
opponent_pool_size: 1 # Size of opponent pool for self-play
opponent_upgrade: 6000 # Epoch frequency to update opponent pool
#eval_opponent: strong # 'random', 'weak' or 'strong'
pretrained_path: models/DQN/lesson3_trained_agent.pt # Path to pretrained model weights
save_path: models/DQN/lesson4_trained_agent.pt # Path to save trained model
max_train_episodes: 7000 # Maximum number of training episodes in environment

## Game specific:
buffer_warm_up: false # Fill replay buffer with random experiences
warm_up_opponent: # Difficulty level of warm up experiences
agent_warm_up: 0 # Number of epochs to warm up agent by training on random experiences
rewards: # Rewards for different outcomes
score_goal: 1000
approach_ball: 0
dribble_or_kick: 10
off_field: -1.0
fall_down: -10
lose_point: -1000
play_continues: 0
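
The three lesson files above drive the curriculum loop added to agilerl_train.py: each stage reloads its config, hands it to the environment, and tightens the reward shaping (approach_ball falls from 0.2 to 0.1 to 0) while raising max_train_episodes. A minimal sketch of that pattern, assuming env.reset() forwards the lesson dict through its options argument as in the diff:

import yaml

for lesson_number in range(1, 4):
    with open(f"./soccer/config/lesson{lesson_number}.yaml") as file:
        LESSON = yaml.safe_load(file)
    # The environment is assumed to pick up the rewards and episode budget
    # for this stage from the lesson dict passed via options.
    env.reset(options=LESSON)
    # ... run the training loop for this lesson ...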
