diff --git a/controllers/agilerl_train/agilerl_train.py b/controllers/agilerl_train/agilerl_train.py index 8e1ee8b..d74654d 100755 --- a/controllers/agilerl_train/agilerl_train.py +++ b/controllers/agilerl_train/agilerl_train.py @@ -3,13 +3,15 @@ import os import datetime import csv +import yaml import numpy as np import torch from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +#from agilerl.utils.utils import initialPopulation +from soccer.utils import initialPopulation from tqdm import trange import soccer_v0 @@ -62,14 +64,12 @@ } # Define training loop parameters - max_episodes = 10000 # Total episodes (default: 6000) max_steps = 1000 # Maximum steps to take in each episode epsilon = 1.0 # Starting epsilon value eps_end = 0.1 # Final epsilon value - eps_decay = 0.995 # Epsilon decay + eps_decay = 0.9997 # Epsilon decay evo_epochs = 20 # Evolution frequency evo_loop = 1 # Number of evaluation episodes - checkpoint=max_episodes/10 reward_freq = 1 path = "./models/MATD3/" + datetime.datetime.now().strftime('%y%m%d') os.makedirs(path, exist_ok=True) @@ -163,112 +163,123 @@ filename = "reward.csv" file_path = os.path.join(path, filename) agent_name = [n for n in env.agents] - title_names = ["Episode","Step","Value"] + title_names = ["Episode","Step","Epsilon","Value"] title_names.extend(agent_name) with open(file_path, "w") as f: writer = csv.DictWriter(f, fieldnames=title_names) writer.writeheader() - # Training loop - for idx_epi in trange(max_episodes): - for agent in pop: # Loop through population - state, _ = env.reset() # Reset environment at start of episode - agent_reward = {agent_id: 0 for agent_id in env.agents} - if INIT_HP["CHANNELS_LAST"]: - state = { - agent_id: np.moveaxis(np.expand_dims(s, 0), [3], [1]) - for agent_id, s in state.items() - } - step_value = 0 - - #for _ in range(max_steps): - while True: - action = agent.getAction(state, epsilon) # Get next action from agent - next_state, reward, termination, truncation, _ = env.step( - action - ) # Act in environment - - # Image processing if necessary for the environment + for lesson_number in range(1, 4): + with open(f"./soccer/config/lesson{lesson_number}.yaml") as file: + LESSON=yaml.safe_load(file) + + env.reset(options=LESSON) + + # Define training loop parameters + max_episodes = 10000 # Total episodes (default: 6000) + checkpoint=max_episodes/10 + + # Training loop + for idx_epi in trange(max_episodes): + for agent in pop: # Loop through population + state, _ = env.reset() # Reset environment at start of episode + agent_reward = {agent_id: 0 for agent_id in env.agents} if INIT_HP["CHANNELS_LAST"]: - state = {agent_id: np.squeeze(s) for agent_id, s in state.items()} - next_state = { - agent_id: np.moveaxis(ns, [2], [0]) - for agent_id, ns in next_state.items() + state = { + agent_id: np.moveaxis(np.expand_dims(s, 0), [3], [1]) + for agent_id, s in state.items() } - - # Save experiences to replay buffer - memory.save2memory(state, action, reward, next_state, termination) - - # Collect the reward - for agent_id, r in reward.items(): - agent_reward[agent_id] += r - - # Learn according to learning frequency - if (memory.counter % agent.learn_step == 0) and ( - len(memory) >= agent.batch_size - ): - experiences = memory.sample( - agent.batch_size - ) # Sample replay buffer - agent.learn(experiences) # Learn according to agent's RL algorithm - - 
step_value += 1 - - # Stop episode if any agents have terminated - if any(truncation.values()) or any(termination.values()): - break - - # Update the state - if INIT_HP["CHANNELS_LAST"]: - next_state = { - agent_id: np.expand_dims(ns, 0) - for agent_id, ns in next_state.items() - } - state = next_state - - # Save the total episode reward - score = sum(agent_reward.values()) - agent.scores.append(score) - - # Update epsilon for exploration - epsilon = max(eps_end, epsilon * eps_decay) - - # Now evolve population if necessary - if (idx_epi + 1) % evo_epochs == 0: - # Evaluate population - fitnesses = [ - agent.test( - env, - swap_channels=INIT_HP["CHANNELS_LAST"], - max_steps=max_steps, - loop=evo_loop, + step_value = 0 + info = None + + #for _ in range(max_steps): + while True: + action = agent.getAction(state, epsilon, action_mask=info) # Get next action from agent + next_state, reward, termination, truncation, info = env.step( + action + ) # Act in environment + + # Image processing if necessary for the environment + if INIT_HP["CHANNELS_LAST"]: + state = {agent_id: np.squeeze(s) for agent_id, s in state.items()} + next_state = { + agent_id: np.moveaxis(ns, [2], [0]) + for agent_id, ns in next_state.items() + } + + # Save experiences to replay buffer + memory.save2memory(state, action, reward, next_state, termination) + + # Collect the reward + for agent_id, r in reward.items(): + agent_reward[agent_id] += r + + # Learn according to learning frequency + if (memory.counter % agent.learn_step == 0) and ( + len(memory) >= agent.batch_size + ): + experiences = memory.sample( + agent.batch_size + ) # Sample replay buffer + agent.learn(experiences) # Learn according to agent's RL algorithm + + step_value += 1 + + # Stop episode if any agents have terminated + if any(truncation.values()) or any(termination.values()): + break + + # Update the state + if INIT_HP["CHANNELS_LAST"]: + next_state = { + agent_id: np.expand_dims(ns, 0) + for agent_id, ns in next_state.items() + } + state = next_state + + # Save the total episode reward + score = sum(agent_reward.values()) + agent.scores.append(score) + + # Now evolve population if necessary + if (idx_epi + 1) % evo_epochs == 0: + # Evaluate population + fitnesses = [ + agent.test( + env, + swap_channels=INIT_HP["CHANNELS_LAST"], + max_steps=max_steps, + loop=evo_loop, + ) + for agent in pop + ] + + print(f"Episode {idx_epi + 1}/{max_episodes}") + print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}') + print( + f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}' ) - for agent in pop - ] - - print(f"Episode {idx_epi + 1}/{max_episodes}") - print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}') - print( - f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}' - ) - - # Tournament selection and population mutation - elite, pop = tournament.select(pop) - pop = mutations.mutation(pop) - - if (idx_epi + 1) % reward_freq == 0: - filename = "reward.csv" - file_path = os.path.join(path, filename) - with open(file_path, "a") as f: - writer = csv.DictWriter(f, fieldnames=title_names) - writer.writerow(dict(**{"Episode": idx_epi+1, "Step": step_value, "Value": score}, **agent_reward)) - - # Save the trained algorithm for each checkpoint - if (idx_epi + 1) % checkpoint == 0 and (idx_epi + 1) > evo_epochs: - filename = "MATD3_trained_agent_" + str(idx_epi + 1) + ".pt" - save_path = os.path.join(path, filename) - elite.saveCheckpoint(save_path) + + # Tournament selection and population 
mutation + elite, pop = tournament.select(pop) + pop = mutations.mutation(pop) + + if (idx_epi + 1) % reward_freq == 0: + filename = "reward.csv" + file_path = os.path.join(path, filename) + with open(file_path, "a") as f: + writer = csv.DictWriter(f, fieldnames=title_names) + writer.writerow(dict(**{"Episode": idx_epi+1, "Step": step_value, "Epsilon": epsilon, "Value": score}, **agent_reward)) + + # Update epsilon for exploration + epsilon = max(eps_end, epsilon * eps_decay) + + # Save the trained algorithm for each checkpoint + if (idx_epi + 1) % checkpoint == 0 and (idx_epi + 1) > evo_epochs: + filename = "MATD3_trained_agent_" + str(idx_epi + 1) + ".pt" + save_path = os.path.join(path, filename) + elite.saveCheckpoint(save_path) # Save the trained algorithm filename = "MATD3_trained_agent.pt" diff --git a/controllers/agilerl_train/soccer/config/lesson1.yaml b/controllers/agilerl_train/soccer/config/lesson1.yaml new file mode 100644 index 0000000..0624ae8 --- /dev/null +++ b/controllers/agilerl_train/soccer/config/lesson1.yaml @@ -0,0 +1,23 @@ +--- +# AgileRL Soccer Lesson 1 + +opponent: self +opponent_pool_size: 1 # Size of opponent pool for self-play +opponent_upgrade: 6000 # Epoch frequency to update opponent pool +#eval_opponent: strong # 'random', 'weak' or 'strong' +pretrained_path: models/DQN/lesson3_trained_agent.pt # Path to pretrained model weights +save_path: models/DQN/lesson4_trained_agent.pt # Path to save trained model +max_train_episodes: 1000 # Maximum number of training episodes in environment + +## Game specific: +buffer_warm_up: false # Fill replay buffer with random experiences +warm_up_opponent: # Difficulty level of warm up experiences +agent_warm_up: 0 # Number of epochs to warm up agent by training on random experiences +rewards: # Rewards for different outcomes + score_goal: 1000 + approach_ball: 0.2 + dribble_or_kick: 10 + off_field: -1.0 + fall_down: -10 + lose_point: -1000 + play_continues: 0 diff --git a/controllers/agilerl_train/soccer/config/lesson2.yaml b/controllers/agilerl_train/soccer/config/lesson2.yaml new file mode 100644 index 0000000..e5c48df --- /dev/null +++ b/controllers/agilerl_train/soccer/config/lesson2.yaml @@ -0,0 +1,23 @@ +--- +# AgileRL Soccer Lesson 1 + +opponent: self +opponent_pool_size: 1 # Size of opponent pool for self-play +opponent_upgrade: 6000 # Epoch frequency to update opponent pool +#eval_opponent: strong # 'random', 'weak' or 'strong' +pretrained_path: models/DQN/lesson3_trained_agent.pt # Path to pretrained model weights +save_path: models/DQN/lesson4_trained_agent.pt # Path to save trained model +max_train_episodes: 2000 # Maximum number of training episodes in environment + +## Game specific: +buffer_warm_up: false # Fill replay buffer with random experiences +warm_up_opponent: # Difficulty level of warm up experiences +agent_warm_up: 0 # Number of epochs to warm up agent by training on random experiences +rewards: # Rewards for different outcomes + score_goal: 1000 + approach_ball: 0.1 + dribble_or_kick: 10 + off_field: -1.0 + fall_down: -10 + lose_point: -1000 + play_continues: 0 diff --git a/controllers/agilerl_train/soccer/config/lesson3.yaml b/controllers/agilerl_train/soccer/config/lesson3.yaml new file mode 100644 index 0000000..af4b672 --- /dev/null +++ b/controllers/agilerl_train/soccer/config/lesson3.yaml @@ -0,0 +1,23 @@ +--- +# AgileRL Soccer Lesson 1 + +opponent: self +opponent_pool_size: 1 # Size of opponent pool for self-play +opponent_upgrade: 6000 # Epoch frequency to update opponent pool 
+#eval_opponent: strong # 'random', 'weak' or 'strong' +pretrained_path: models/DQN/lesson3_trained_agent.pt # Path to pretrained model weights +save_path: models/DQN/lesson4_trained_agent.pt # Path to save trained model +max_train_episodes: 7000 # Maximum number of training episodes in environment + +## Game specific: +buffer_warm_up: false # Fill replay buffer with random experiences +warm_up_opponent: # Difficulty level of warm up experiences +agent_warm_up: 0 # Number of epochs to warm up agent by training on random experiences +rewards: # Rewards for different outcomes + score_goal: 1000 + approach_ball: 0 + dribble_or_kick: 10 + off_field: -1.0 + fall_down: -10 + lose_point: -1000 + play_continues: 0 diff --git a/controllers/agilerl_train/soccer/matd3.py b/controllers/agilerl_train/soccer/matd3.py new file mode 100644 index 0000000..c2967d6 --- /dev/null +++ b/controllers/agilerl_train/soccer/matd3.py @@ -0,0 +1,1137 @@ +import copy +import random + +import dill +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim + +from agilerl.networks.evolvable_cnn import EvolvableCNN +from agilerl.networks.evolvable_mlp import EvolvableMLP +from agilerl.wrappers.make_evolvable import MakeEvolvable + + +class MATD3: + """The MATD3 algorithm class. MATD3 paper: https://arxiv.org/abs/1910.01465 + + :param state_dims: State observation dimensions for each agent + :type state_dims: list[tuple] + :param action_dims: Action dimensions for each agent + :type action_dims: list[int] + :param one_hot: One-hot encoding, used with discrete observation spaces + :type one_hot: bool + :param n_agents: Number of agents + :type n_agents: int + :param agent_ids: Agent ID for each agent + :type agent_ids: list[str] + :param max_action: Upper bound of the action space + :type max_action: float + :param min_action: Lower bound of the action space + :type min_action: float + :param discrete_actions: Boolean flag to indicate a discrete action space + :type discrete_actions: bool, optional + :param expl_noise: Standard deviation for Gaussian exploration noise, defaults to 0.1 + :type expl_noise: float, optional + :param policy_freq: Policy update frequency, defaults to 2 + :type policy_freq: int, optional + :param index: Index to keep track of object instance during tournament selection and mutation, defaults to 0 + :type index: int, optional + :param net_config: Network configuration, defaults to mlp with hidden size [64,64] + :type net_config: dict, optional + :param batch_size: Size of batched sample from replay buffer for learning, defaults to 64 + :type batch_size: int, optional + :param lr: Learning rate for optimizer, defaults to 0.01 + :type lr: float, optional + :param learn_step: Learning frequency, defaults to 5 + :type learn_step: int, optional + :param gamma: Discount factor, defaults to 0.95 + :type gamma: float, optional + :param tau: For soft update of target network parameters, defaults to 0.01 + :type tau: float, optional + :param mutation: Most recent mutation to agent, defaults to None + :type mutation: str, optional + :param actor_networks: List of custom actor networks, defaults to None + :type actor_networks: list[nn.Module], optional + :param critic_networks: List containing two lists of custom critic networks, defaults to None + :type critic_networks: list[list[nn.Module]], optional + :param device: Device for accelerated computing, 'cpu' or 'cuda', defaults to 'cpu' + :type device: str, optional + :param accelerator: Accelerator for distributed computing, 
defaults to None + :type accelerator: accelerate.Accelerator(), optional + :param wrap: Wrap models for distributed training upon creation, defaults to True + :type wrap: bool, optional + """ + + def __init__( + self, + state_dims, + action_dims, + one_hot, + n_agents, + agent_ids, + max_action, + min_action, + discrete_actions, + expl_noise=0.1, + index=0, + policy_freq=2, + net_config={"arch": "mlp", "h_size": [64, 64]}, + batch_size=64, + lr=0.01, + learn_step=5, + gamma=0.95, + tau=0.01, + mutation=None, + actor_networks=None, + critic_networks=None, + device="cpu", + accelerator=None, + wrap=True, + ): + self.algo = "MATD3" + self.state_dims = state_dims + self.total_state_dims = sum(state_dim[0] for state_dim in self.state_dims) + self.action_dims = action_dims + self.one_hot = one_hot + self.n_agents = n_agents + self.multi = True if n_agents > 1 else False + self.agent_ids = agent_ids + self.net_config = net_config + self.batch_size = batch_size + self.lr = lr + self.learn_step = learn_step + self.gamma = gamma + self.tau = tau + self.mut = mutation + self.device = device + self.accelerator = accelerator + self.index = index + self.policy_freq = policy_freq + self.scores = [] + self.fitness = [] + self.steps = [0] + self.max_action = max_action + self.expl_noise = expl_noise + self.min_action = min_action + self.discrete_actions = discrete_actions + self.total_actions = ( + sum(self.action_dims) + if not self.discrete_actions + else len(self.action_dims) + ) + self.actor_networks = actor_networks + self.critic_networks = critic_networks + + if self.actor_networks is not None and self.critic_networks is not None: + self.actors = actor_networks + self.critics_1 = critic_networks[0] + self.critics_2 = critic_networks[-1] + self.net_config = None + else: + if "output_activation" in self.net_config.keys(): + pass + else: + if self.discrete_actions: + self.net_config["output_activation"] = "GumbelSoftmax" + else: + self.net_config["output_activation"] = "Softmax" + + # model + if self.net_config["arch"] == "mlp": # Multi-layer Perceptron + self.actors = [ + EvolvableMLP( + num_inputs=state_dim[0], + num_outputs=action_dim, + hidden_size=self.net_config["h_size"], + mlp_output_activation=self.net_config["output_activation"], + device=self.device, + accelerator=self.accelerator, + ) + for (action_dim, state_dim) in zip( + self.action_dims, self.state_dims + ) + ] + self.critics_1 = [ + EvolvableMLP( + num_inputs=self.total_state_dims + self.total_actions, + num_outputs=1, + hidden_size=self.net_config["h_size"], + device=self.device, + accelerator=self.accelerator, + ) + for _ in range(self.n_agents) + ] + self.critics_2 = [ + EvolvableMLP( + num_inputs=self.total_state_dims + self.total_actions, + num_outputs=1, + hidden_size=self.net_config["h_size"], + device=self.device, + accelerator=self.accelerator, + ) + for _ in range(self.n_agents) + ] + elif self.net_config["arch"] == "cnn": # Convolutional Neural Network + self.actors = [ + EvolvableCNN( + input_shape=state_dim, + num_actions=action_dim, + channel_size=self.net_config["c_size"], + kernel_size=self.net_config["k_size"], + stride_size=self.net_config["s_size"], + hidden_size=self.net_config["h_size"], + normalize=self.net_config["normalize"], + mlp_output_activation=self.net_config["output_activation"], + multi=self.multi, + n_agents=self.n_agents, + device=self.device, + accelerator=self.accelerator, + ) + for (action_dim, state_dim) in zip( + self.action_dims, self.state_dims + ) + ] + self.critics_1 = [ + EvolvableCNN( + 
input_shape=state_dim, + num_actions=self.total_actions, + channel_size=self.net_config["c_size"], + kernel_size=self.net_config["k_size"], + stride_size=self.net_config["s_size"], + hidden_size=self.net_config["h_size"], + normalize=self.net_config["normalize"], + mlp_activation="Tanh", + mlp_output_activation="Softmax", + critic=True, + n_agents=self.n_agents, + multi=self.multi, + device=self.device, + accelerator=self.accelerator, + ) + for state_dim in self.state_dims + ] + self.critics_2 = [ + EvolvableCNN( + input_shape=state_dim, + num_actions=self.total_actions, + channel_size=self.net_config["c_size"], + kernel_size=self.net_config["k_size"], + stride_size=self.net_config["s_size"], + hidden_size=self.net_config["h_size"], + normalize=self.net_config["normalize"], + mlp_activation="Tanh", + critic=True, + n_agents=self.n_agents, + multi=self.multi, + device=self.device, + accelerator=self.accelerator, + ) + for state_dim in self.state_dims + ] + + # Assign architecture + self.arch = ( + self.net_config["arch"] + if self.net_config is not None + else self.actors[0].arch + ) + + # Create target networks + self.actor_targets = copy.deepcopy(self.actors) + self.critic_targets_1 = copy.deepcopy(self.critics_1) + self.critic_targets_2 = copy.deepcopy(self.critics_2) + + # Initialise target network parameters + for actor, actor_target in zip(self.actors, self.actor_targets): + actor_target.load_state_dict(actor.state_dict()) + for critic_1, critic_2, critic_target_1, critic_target_2 in zip( + self.critics_1, self.critics_2, self.critic_targets_1, self.critic_targets_2 + ): + critic_target_1.load_state_dict(critic_1.state_dict()) + critic_target_2.load_state_dict(critic_2.state_dict()) + + self.actor_optimizers_type = [ + optim.Adam(actor.parameters(), lr=self.lr) for actor in self.actors + ] + self.critic_1_optimizers_type = [ + optim.Adam(critic.parameters(), lr=self.lr) for critic in self.critics_1 + ] + self.critic_2_optimizers_type = [ + optim.Adam(critic.parameters(), lr=self.lr) for critic in self.critics_2 + ] + + if self.accelerator is not None: + self.actor_optimizers = self.actor_optimizers_type + self.critic_1_optimizers = self.critic_1_optimizers_type + self.critic_2_optimizers = self.critic_2_optimizers_type + if wrap: + self.wrap_models() + else: + self.actors = [actor.to(self.device) for actor in self.actors] + self.actor_targets = [ + actor_target.to(self.device) for actor_target in self.actor_targets + ] + self.critics_1 = [critic.to(self.device) for critic in self.critics_1] + self.critic_targets_1 = [ + critic_target.to(self.device) for critic_target in self.critic_targets_1 + ] + self.critics_2 = [critic.to(self.device) for critic in self.critics_2] + self.critic_targets_2 = [ + critic_target.to(self.device) for critic_target in self.critic_targets_2 + ] + self.actor_optimizers = self.actor_optimizers_type + self.critic_1_optimizers = self.critic_1_optimizers_type + self.critic_2_optimizers = self.critic_2_optimizers_type + + self.criterion = nn.MSELoss() + + def getAction(self, states, epsilon=0, action_mask=None, agent_mask=None, env_defined_actions=None): + """Returns the next action to take in the environment. + Epsilon is the probability of taking a random action, used for exploration. + For epsilon-greedy behaviour, set epsilon to 0. 
+ + :param state: Environment observations: {'agent_0': state_dim_0, ..., 'agent_n': state_dim_n} + :type state: Dict[str, numpy.Array] + :param epsilon: Probablilty of taking a random action for exploration, defaults to 0 + :type epsilon: float, optional + :param agent_mask: Mask of agents to return actions for: {'agent_0': True, ..., 'agent_n': False} + :type agent_mask: Dict[str, bool] + :param env_defined_actions: Mask of agents to return actions for: {'agent_0': True, ..., 'agent_n': False} + :type env_defined_actions: Dict[str, bool] + """ + # Get agents, states and actions we want to take actions for at this timestep according to agent_mask + if agent_mask is None: + agent_ids = self.agent_ids + actors = self.actors + else: + agent_ids = [agent for agent in agent_mask.keys() if agent_mask[agent]] + states = { + agent: states[agent] for agent in agent_mask.keys() if agent_mask[agent] + } + actors = [ + actor + for agent, actor in zip(agent_mask.keys(), self.actors) + if agent_mask[agent] + ] + + # Convert states to a list of torch tensors + states = [torch.from_numpy(state).float() for state in states.values()] + + # Configure accelerator + if self.accelerator is None: + states = [state.to(self.device) for state in states] + + if self.one_hot: + states = [ + nn.functional.one_hot(state.long(), num_classes=state_dim[0]) + .float() + .squeeze() + for state, state_dim in zip(states, self.state_dims) + ] + + if self.arch == "mlp": + states = [ + state.unsqueeze(0) if len(state.size()) < 2 else state + for state in states + ] + elif self.arch == "cnn": + states = [state.unsqueeze(2) for state in states] + + actions = {} + for idx, (agent_id, state, actor) in enumerate(zip(agent_ids, states, actors)): + if random.random() < epsilon: + if self.discrete_actions: + if action_mask is None: + action = np.random.randint(0, self.action_dims[idx]) + # add -----> + else: + a_mask = action_mask[agent_id]['action_mask'] + masked_action_values = np.ma.array(np.random.rand(self.action_dims[idx]), mask=a_mask) + action = np.argmax(masked_action_values, axis=-1) + # <----- + + else: + action = ( + np.random.rand(state.size()[0], self.action_dims[idx]) + .astype("float32") + .squeeze() + ) + else: + actor.eval() + if self.accelerator is not None: + with actor.no_sync(): + action_values = actor(state) + else: + with torch.no_grad(): + action_values = actor(state) + actor.train() + if self.discrete_actions: + if action_mask is None: + action = action_values.squeeze(0).argmax().item() + # add -----> + else: + #action = action_values.squeeze(0).argmax().item() + a_mask = action_mask[agent_id]['action_mask'] + masked_action_values = np.ma.array(action_values.cpu().data.numpy(), mask=a_mask) + action = np.argmax(masked_action_values, axis=-1)[0] + # <----- + else: + action = ( + action_values.cpu().data.numpy().squeeze() + + np.random.normal( + 0, + self.max_action[idx][0] * self.expl_noise, + size=self.action_dims[idx], + ).astype(np.float32) + ) + action = np.clip( + action, self.min_action[idx][0], self.max_action[idx][0] + ) + actions[agent_id] = action + + if env_defined_actions is not None: + for agent in env_defined_actions.keys(): + if not agent_mask[agent]: + actions.update({agent: env_defined_actions[agent]}) + + return actions + + def _squeeze_exp(self, experiences): + """Remove first dim created by dataloader. + + :param experiences: List of batched states, actions, rewards, next_states, dones in that order. 
+ :type state: list[torch.Tensor[float]] + """ + st, ac, re, ne, do = experiences + return st.squeeze(0), ac.squeeze(0), re.squeeze(0), ne.squeeze(0), do.squeeze(0) + + def learn(self, experiences): + """Updates agent network parameters to learn from experiences. + + :param experience: Tuple of dictionaries containing batched states, actions, rewards, next_states, + dones in that order for each individual agent. + :type experience: Tuple[Dict[str, torch.Tensor]] + """ + + for ( + agent_id, + actor, + actor_target, + critic_1, + critic_target_1, + critic_2, + critic_target_2, + actor_optimizer, + critic_1_optimizer, + critic_2_optimizer, + ) in zip( + self.agent_ids, + self.actors, + self.actor_targets, + self.critics_1, + self.critic_targets_1, + self.critics_2, + self.critic_targets_2, + self.actor_optimizers, + self.critic_1_optimizers, + self.critic_2_optimizers, + ): + states, actions, rewards, next_states, dones = experiences + + if self.one_hot: + states = { + agent_id: nn.functional.one_hot( + state.long(), num_classes=state_dim[0] + ) + .float() + .squeeze() + for agent_id, state, state_dim in zip( + states.items(), self.state_dims + ) + } + + if self.arch == "mlp": + if self.discrete_actions: + action_values = [a.unsqueeze(1) for a in actions.values()] + else: + action_values = list(actions.values()) + input_combined = torch.cat(list(states.values()) + action_values, 1) + if self.accelerator is not None: + with critic_1.no_sync(): + q_value_1 = critic_1(input_combined) + with critic_2.no_sync(): + q_value_2 = critic_2(input_combined) + else: + q_value_1 = critic_1(input_combined) + q_value_2 = critic_2(input_combined) + next_actions = [ + self.actor_targets[idx](next_states[agent_id]).detach_() + for idx, agent_id in enumerate(self.agent_ids) + ] + + elif self.arch == "cnn": + stacked_states = torch.stack(list(states.values()), dim=2) + stacked_actions = torch.stack(list(actions.values()), dim=1) + if self.accelerator is not None: + with critic_1.no_sync(): + q_value_1 = critic_1(stacked_states, stacked_actions) + with critic_2.no_sync(): + q_value_2 = critic_2(stacked_states, stacked_actions) + else: + q_value_1 = critic_1(stacked_states, stacked_actions) + q_value_2 = critic_2(stacked_states, stacked_actions) + next_actions = [ + self.actor_targets[idx]( + next_states[agent_id].unsqueeze(2) + ).detach_() + for idx, agent_id in enumerate(self.agent_ids) + ] + + if self.discrete_actions: + next_actions = [ + torch.argmax(agent_actions, dim=1).unsqueeze(1) + if self.arch == "mlp" + else torch.argmax(agent_actions, dim=1) + for agent_actions in next_actions + ] + + if self.arch == "mlp": + next_input_combined = torch.cat( + list(next_states.values()) + next_actions, 1 + ) + if self.accelerator is not None: + with critic_target_1.no_sync(): + q_value_next_state_1 = critic_target_1(next_input_combined) + with critic_target_2.no_sync(): + q_value_next_state_2 = critic_target_2(next_input_combined) + else: + q_value_next_state_1 = critic_target_1(next_input_combined) + q_value_next_state_2 = critic_target_2(next_input_combined) + elif self.arch == "cnn": + stacked_next_states = torch.stack(list(next_states.values()), dim=2) + stacked_next_actions = torch.stack(next_actions, dim=1) + if self.accelerator is not None: + with critic_target_1.no_sync(): + q_value_next_state_1 = critic_target_1( + stacked_next_states, stacked_next_actions + ) + with critic_target_2.no_sync(): + q_value_next_state_2 = critic_target_2( + stacked_next_states, stacked_next_actions + ) + else: + 
q_value_next_state_1 = critic_target_1( + stacked_next_states, stacked_next_actions + ) + q_value_next_state_2 = critic_target_2( + stacked_next_states, stacked_next_actions + ) + q_value_next_state = torch.min(q_value_next_state_1, q_value_next_state_2) + + y_j = ( + rewards[agent_id] + + (1 - dones[agent_id]) * self.gamma * q_value_next_state + ) + + critic_loss = self.criterion(q_value_1, y_j.detach_()) + self.criterion( + q_value_2, y_j.detach_() + ) + + # critic loss backprop + critic_1_optimizer.zero_grad() + critic_2_optimizer.zero_grad() + if self.accelerator is not None: + self.accelerator.backward(critic_loss) + else: + critic_loss.backward() + critic_1_optimizer.step() + critic_2_optimizer.step() + + # update actor and targets every policy_freq episodes + if len(self.scores) % self.policy_freq == 0: + if self.arch == "mlp": + if self.accelerator is not None: + with actor.no_sync(): + action = actor(states[agent_id]) + else: + action = actor(states[agent_id]) + detached_actions = copy.deepcopy(actions) + if self.discrete_actions: + action = action.argmax(1).unsqueeze(1) + detached_actions = { + agent_id: d.unsqueeze(1) + for agent_id, d in detached_actions.items() + } + detached_actions[agent_id] = action + input_combined = torch.cat( + list(states.values()) + list(detached_actions.values()), 1 + ) + if self.accelerator is not None: + with critic_1.no_sync(): + actor_loss = -critic_1(input_combined).mean() + else: + actor_loss = -critic_1(input_combined).mean() + + elif self.arch == "cnn": + if self.accelerator is not None: + with actor.no_sync(): + action = actor(states[agent_id].unsqueeze(2)) + else: + action = actor(states[agent_id].unsqueeze(2)) + if self.discrete_actions: + action = action.argmax(1) + detached_actions = copy.deepcopy(actions) + detached_actions[agent_id] = action + stacked_detached_actions = torch.stack( + list(detached_actions.values()), dim=1 + ) + if self.accelerator is not None: + with critic_1.no_sync(): + actor_loss = -critic_1( + stacked_states, stacked_detached_actions + ).mean() + else: + actor_loss = -critic_1( + stacked_states, stacked_detached_actions + ).mean() + + # actor loss backprop + actor_optimizer.zero_grad() + if self.accelerator is not None: + self.accelerator.backward(actor_loss) + else: + actor_loss.backward() + actor_optimizer.step() + + if len(self.scores) % self.policy_freq: + for ( + actor, + actor_target, + critic_1, + critic_target_1, + critic_2, + critic_target_2, + ) in zip( + self.actors, + self.actor_targets, + self.critics_1, + self.critic_targets_1, + self.critics_2, + self.critic_targets_2, + ): + self.softUpdate(actor, actor_target) + self.softUpdate(critic_1, critic_target_1) + self.softUpdate(critic_2, critic_target_2) + + def softUpdate(self, net, target): + """Soft updates target network.""" + for eval_param, target_param in zip(net.parameters(), target.parameters()): + target_param.data.copy_( + self.tau * eval_param.data + (1.0 - self.tau) * target_param.data + ) + + def test(self, env, swap_channels=False, max_steps=500, loop=3): + """Returns mean test score of agent in environment with epsilon-greedy policy. + + :param env: The environment to be tested in + :type env: Gym-style environment + :param swap_channels: Swap image channels dimension from last to first [H, W, C] -> [C, H, W], defaults to False + :type swap_channels: bool, optional + :param max_steps: Maximum number of testing steps, defaults to 500 + :type max_steps: int, optional + :param loop: Number of testing loops/episodes to complete. 
The returned score is the mean. Defaults to 3 + :type loop: int, optional + """ + with torch.no_grad(): + rewards = [] + for i in range(loop): + state, info = env.reset() + agent_reward = {agent_id: 0 for agent_id in self.agent_ids} + score = 0 + for _ in range(max_steps): + if swap_channels: + state = { + agent_id: np.moveaxis(np.expand_dims(s, 0), [3], [1]) + for agent_id, s in state.items() + } + agent_mask = ( + info["agent_mask"] if "agent_mask" in info.keys() else None + ) + env_defined_actions = ( + info["env_defined_actions"] + if "env_defined_actions" in info.keys() + else None + ) + action = self.getAction( + state, + epsilon=0, + agent_mask=agent_mask, + env_defined_actions=env_defined_actions, + ) + state, reward, done, trunc, info = env.step(action) + for agent_id, r in reward.items(): + agent_reward[agent_id] += r + score = sum(agent_reward.values()) + rewards.append(score) + mean_fit = np.mean(rewards) + self.fitness.append(mean_fit) + return mean_fit + + def clone(self, index=None, wrap=True): + """Returns cloned agent identical to self. + + :param index: Index to keep track of agent for tournament selection and mutation, defaults to None + :type index: int, optional + """ + if index is None: + index = self.index + + clone = type(self)( + state_dims=self.state_dims, + action_dims=self.action_dims, + one_hot=self.one_hot, + n_agents=self.n_agents, + agent_ids=self.agent_ids, + max_action=self.max_action, + min_action=self.min_action, + expl_noise=self.expl_noise, + discrete_actions=self.discrete_actions, + index=index, + net_config=self.net_config, + batch_size=self.batch_size, + policy_freq=self.policy_freq, + lr=self.lr, + learn_step=self.learn_step, + gamma=self.gamma, + tau=self.tau, + mutation=self.mut, + actor_networks=self.actor_networks, + critic_networks=self.critic_networks, + device=self.device, + accelerator=self.accelerator, + wrap=wrap, + ) + + if self.accelerator is not None: + self.unwrap_models() + actors = [actor.clone() for actor in self.actors] + actor_targets = [actor_target.clone() for actor_target in self.actor_targets] + critics_1 = [critic.clone() for critic in self.critics_1] + critic_targets_1 = [ + critic_target.clone() for critic_target in self.critic_targets_1 + ] + critics_2 = [critic.clone() for critic in self.critics_2] + critic_targets_2 = [ + critic_target.clone() for critic_target in self.critic_targets_2 + ] + actor_optimizers = [ + optim.Adam(actor.parameters(), lr=clone.lr) for actor in actors + ] + critic_1_optimizers = [ + optim.Adam(critic.parameters(), lr=clone.lr) for critic in critics_1 + ] + critic_2_optimizers = [ + optim.Adam(critic.parameters(), lr=clone.lr) for critic in critics_2 + ] + clone.actor_optimizers_type = actor_optimizers + clone.critic_1_optimizers_type = critic_1_optimizers + clone.critic_2_optimizers_type = critic_2_optimizers + + if self.accelerator is not None: + if wrap: + clone.actors = [self.accelerator.prepare(actor) for actor in actors] + clone.actor_targets = [ + self.accelerator.prepare(actor_target) + for actor_target in actor_targets + ] + clone.critics_1 = [ + self.accelerator.prepare(critic) for critic in critics_1 + ] + clone.critic_targets_1 = [ + self.accelerator.prepare(critic_target) + for critic_target in critic_targets_1 + ] + clone.critics_2 = [ + self.accelerator.prepare(critic) for critic in critics_2 + ] + clone.critic_targets_2 = [ + self.accelerator.prepare(critic_target) + for critic_target in critic_targets_2 + ] + clone.actor_optimizers = [ + self.accelerator.prepare(actor_optimizer) 
+ for actor_optimizer in actor_optimizers + ] + clone.critic_1_optimizers = [ + self.accelerator.prepare(critic_optimizer) + for critic_optimizer in critic_1_optimizers + ] + clone.critic_2_optimizers = [ + self.accelerator.prepare(critic_optimizer) + for critic_optimizer in critic_2_optimizers + ] + else: + ( + clone.actors, + clone.actor_targets, + clone.critics_1, + clone.critic_targets_1, + clone.critics_2, + clone.critic_targets_2, + clone.actor_optimizers, + clone.critic_1_optimizers, + clone.critic_2_optimizers, + ) = ( + actors, + actor_targets, + critics_1, + critic_targets_1, + critics_2, + critic_targets_2, + actor_optimizers, + critic_1_optimizers, + critic_2_optimizers, + ) + else: + clone.actors = [actor.to(self.device) for actor in actors] + clone.actor_targets = [ + actor_target.to(self.device) for actor_target in actor_targets + ] + clone.critics_1 = [critic.to(self.device) for critic in critics_1] + clone.critic_targets_1 = [ + critic_target.to(self.device) for critic_target in critic_targets_1 + ] + clone.critics_2 = [critic.to(self.device) for critic in critics_2] + clone.critic_targets_2 = [ + critic_target.to(self.device) for critic_target in critic_targets_2 + ] + clone.actor_optimizers = actor_optimizers + clone.critic_1_optimizers = critic_1_optimizers + clone.critic_2_optimizers = critic_2_optimizers + + clone.fitness = copy.deepcopy(self.fitness) + clone.steps = copy.deepcopy(self.steps) + clone.scores = copy.deepcopy(self.scores) + + return clone + + def wrap_models(self): + if self.accelerator is not None: + self.actors = [self.accelerator.prepare(actor) for actor in self.actors] + self.actor_targets = [ + self.accelerator.prepare(actor_target) + for actor_target in self.actor_targets + ] + self.critics_1 = [ + self.accelerator.prepare(critic) for critic in self.critics_1 + ] + self.critic_targets_1 = [ + self.accelerator.prepare(critic_target) + for critic_target in self.critic_targets_1 + ] + self.critics_2 = [ + self.accelerator.prepare(critic) for critic in self.critics_2 + ] + self.critic_targets_2 = [ + self.accelerator.prepare(critic_target) + for critic_target in self.critic_targets_2 + ] + self.actor_optimizers = [ + self.accelerator.prepare(actor_optimizer) + for actor_optimizer in self.actor_optimizers_type + ] + self.critic_1_optimizers = [ + self.accelerator.prepare(critic_optimizer) + for critic_optimizer in self.critic_1_optimizers_type + ] + self.critic_2_optimizers = [ + self.accelerator.prepare(critic_optimizer) + for critic_optimizer in self.critic_2_optimizers_type + ] + + def unwrap_models(self): + if self.accelerator is not None: + self.actors = [ + self.accelerator.unwrap_model(actor) for actor in self.actors + ] + self.actor_targets = [ + self.accelerator.unwrap_model(actor_target) + for actor_target in self.actor_targets + ] + self.critics_1 = [ + self.accelerator.unwrap_model(critic) for critic in self.critics_1 + ] + self.critic_targets_1 = [ + self.accelerator.unwrap_model(critic_target) + for critic_target in self.critic_targets_1 + ] + self.critics_2 = [ + self.accelerator.unwrap_model(critic) for critic in self.critics_2 + ] + self.critic_targets_2 = [ + self.accelerator.unwrap_model(critic_target) + for critic_target in self.critic_targets_2 + ] + self.actor_optimizers = [ + self.accelerator.unwrap_model(actor_optimizer) + for actor_optimizer in self.actor_optimizers + ] + self.critic_1_optimizers = [ + self.accelerator.unwrap_model(critic_optimizer) + for critic_optimizer in self.critic_1_optimizers + ] + 
self.critic_2_optimizers = [ + self.accelerator.unwrap_model(critic_optimizer) + for critic_optimizer in self.critic_2_optimizers + ] + + def saveCheckpoint(self, path): + """Saves a checkpoint of agent properties and network weights to path. + + :param path: Location to save checkpoint at + :type path: string + """ + + torch.save( + { + "actors_init_dict": [actor.init_dict for actor in self.actors], + "actors_state_dict": [actor.state_dict() for actor in self.actors], + "actor_targets_init_dict": [ + actor_target.init_dict for actor_target in self.actor_targets + ], + "actor_targets_state_dict": [ + actor_target.state_dict() for actor_target in self.actor_targets + ], + "critics_1_init_dict": [ + critic_1.init_dict for critic_1 in self.critics_1 + ], + "critics_1_state_dict": [ + critic_1.state_dict() for critic_1 in self.critics_1 + ], + "critic_targets_1_init_dict": [ + critic_target_1.init_dict + for critic_target_1 in self.critic_targets_1 + ], + "critic_targets_1_state_dict": [ + critic_target_1.state_dict() + for critic_target_1 in self.critic_targets_1 + ], + "critics_2_init_dict": [ + critic_2.init_dict for critic_2 in self.critics_2 + ], + "critics_2_state_dict": [ + critic_2.state_dict() for critic_2 in self.critics_2 + ], + "critic_targets_2_init_dict": [ + critic_target_2.init_dict + for critic_target_2 in self.critic_targets_2 + ], + "critic_targets_2_state_dict": [ + critic_target_2.state_dict() + for critic_target_2 in self.critic_targets_2 + ], + "actor_optimizers_state_dict": [ + actor_optimizer.state_dict() + for actor_optimizer in self.actor_optimizers + ], + "critic_1_optimizers_state_dict": [ + critic_1_optimizer.state_dict() + for critic_1_optimizer in self.critic_1_optimizers + ], + "critic_2_optimizers_state_dict": [ + critic_2_optimizer.state_dict() + for critic_2_optimizer in self.critic_2_optimizers + ], + "net_config": self.net_config, + "batch_size": self.batch_size, + "lr": self.lr, + "learn_step": self.learn_step, + "policy_freq": self.policy_freq, + "gamma": self.gamma, + "tau": self.tau, + "mutation": self.mut, + "index": self.index, + "scores": self.scores, + "fitness": self.fitness, + "steps": self.steps, + }, + path, + pickle_module=dill, + ) + + def loadCheckpoint(self, path): + """Loads saved agent properties and network weights from checkpoint. 
+ + :param path: Location to load checkpoint from + :type path: string + """ + checkpoint = torch.load(path, pickle_module=dill) + self.net_config = checkpoint["net_config"] + if self.net_config is not None: + if self.arch == "mlp": + self.actors = [ + EvolvableMLP(**checkpoint["actors_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.actor_targets = [ + EvolvableMLP(**checkpoint["actor_targets_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critics_1 = [ + EvolvableMLP(**checkpoint["critics_1_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critic_targets_1 = [ + EvolvableMLP(**checkpoint["critic_targets_1_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critics_2 = [ + EvolvableMLP(**checkpoint["critics_2_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critic_targets_2 = [ + EvolvableMLP(**checkpoint["critic_targets_2_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + elif self.arch == "cnn": + self.actors = [ + EvolvableCNN(**checkpoint["actors_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.actor_targets = [ + EvolvableCNN(**checkpoint["actor_targets_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critics_1 = [ + EvolvableCNN(**checkpoint["critics_1_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critic_targets_1 = [ + EvolvableCNN(**checkpoint["critic_targets_1_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critics_2 = [ + EvolvableCNN(**checkpoint["critics_2_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critic_targets_2 = [ + EvolvableCNN(**checkpoint["critic_targets_2_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + else: + self.actors = [ + MakeEvolvable(**checkpoint["actors_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.actor_targets = [ + MakeEvolvable(**checkpoint["actor_targets_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critics_1 = [ + MakeEvolvable(**checkpoint["critics_1_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critic_targets_1 = [ + MakeEvolvable(**checkpoint["critic_targets_1_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critics_2 = [ + MakeEvolvable(**checkpoint["critics_2_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.critic_targets_2 = [ + MakeEvolvable(**checkpoint["critic_targets_2_init_dict"][idx]) + for idx, _ in enumerate(self.agent_ids) + ] + self.lr = checkpoint["lr"] + self.actor_optimizers = [ + optim.Adam(actor.parameters(), lr=self.lr) for actor in self.actors + ] + self.critic_1_optimizers = [ + optim.Adam(critic_1.parameters(), lr=self.lr) for critic_1 in self.critics_1 + ] + self.critic_2_optimizers = [ + optim.Adam(critic_2.parameters(), lr=self.lr) for critic_2 in self.critics_2 + ] + actor_list = [] + critic_1_list = [] + critic_2_list = [] + actor_target_list = [] + critic_target_1_list = [] + critic_target_2_list = [] + actor_optimizer_list = [] + critic_1_optimizer_list = [] + critic_2_optimizer_list = [] + + for idx, ( + actor, + actor_target, + critic_1, + critic_target_1, + critic_2, + critic_target_2, + actor_optimizer, + critic_1_optimizer, + critic_2_optimizer, + ) in enumerate( + zip( + self.actors, + self.actor_targets, + self.critics_1, + self.critic_targets_1, + self.critics_2, + self.critic_targets_2, + self.actor_optimizers, + self.critic_1_optimizers, + 
self.critic_2_optimizers, + ) + ): + actor.load_state_dict(checkpoint["actors_state_dict"][idx]) + actor_list.append(actor) + actor_target.load_state_dict(checkpoint["actor_targets_state_dict"][idx]) + actor_target_list.append(actor_target) + critic_1.load_state_dict(checkpoint["critics_1_state_dict"][idx]) + critic_1_list.append(critic_1) + critic_2.load_state_dict(checkpoint["critics_2_state_dict"][idx]) + critic_2_list.append(critic_2) + critic_target_1.load_state_dict( + checkpoint["critic_targets_1_state_dict"][idx] + ) + critic_target_1_list.append(critic_target_1) + critic_target_2.load_state_dict( + checkpoint["critic_targets_2_state_dict"][idx] + ) + critic_target_2_list.append(critic_target_2) + actor_optimizer.load_state_dict( + checkpoint["actor_optimizers_state_dict"][idx] + ) + actor_optimizer_list.append(actor_optimizer) + critic_1_optimizer.load_state_dict( + checkpoint["critic_1_optimizers_state_dict"][idx] + ) + critic_1_optimizer_list.append(critic_1_optimizer) + critic_2_optimizer.load_state_dict( + checkpoint["critic_2_optimizers_state_dict"][idx] + ) + critic_2_optimizer_list.append(critic_2_optimizer) + + self.actors = actor_list + self.actor_targets = actor_target_list + self.critics_1 = critic_1_list + self.critic_targets_1 = critic_target_1_list + self.critics_2 = critic_2_list + self.critic_targets_2 = critic_target_2_list + self.actor_optimizers = actor_optimizer_list + self.critic_1_optimizers = critic_1_optimizer_list + self.critic_2_optimizers = critic_2_optimizer_list + self.batch_size = checkpoint["batch_size"] + self.learn_step = checkpoint["learn_step"] + self.policy_freq = checkpoint["policy_freq"] + self.gamma = checkpoint["gamma"] + self.tau = checkpoint["tau"] + self.mut = checkpoint["mutation"] + self.index = checkpoint["index"] + self.scores = checkpoint["scores"] + self.fitness = checkpoint["fitness"] + self.steps = checkpoint["steps"] diff --git a/controllers/agilerl_train/soccer/soccer.py b/controllers/agilerl_train/soccer/soccer.py index 1e4fcc0..a57fac3 100755 --- a/controllers/agilerl_train/soccer/soccer.py +++ b/controllers/agilerl_train/soccer/soccer.py @@ -69,8 +69,9 @@ def __init__(self, max_cycles=300, render_mode=None): self.observation_spaces = dict(zip(self.agents, [obs_space for _ in enumerate(self.agents)])) self.action_spaces = dict(zip(self.agents, [Discrete(8) for _ in enumerate(self.agents)])) self.actions = ["walk,1,0,0", "walk,-1,0,0", "walk,0,1,0", "walk,0,-1,0", "walk,0,0,1", "walk,0,0,-1", "motion,left_kick", "motion,right_kick"] - #self.action_mask = Box(low=0, high=1, shape = ([8,]), dtype=np.int8) + self.action_mask = [0, 0, 0, 0, 0, 0, 1, 1] self.state_space = Box(low=-5, high=5, shape = ([21]), dtype=np.float16) + self.lesson = {} self.possible_agents = copy.deepcopy(self.agents) self._agent_selector = agent_selector(self.agents) @@ -134,6 +135,7 @@ def step(self, action): self._was_dead_step(action) return self._cumulative_rewards[self.agent_selection] = 0 + self._clear_rewards() agent = self.agent_list[self.agent_name_mapping[self.agent_selection]] agent.score = 0 @@ -168,7 +170,6 @@ def step(self, action): if self._agent_selector.is_last(): self.frames += 1 - self._clear_rewards() for i in range(40): self.supervisor.step(self.time_step) ball_x, ball_y, _ = self.ball_pos.getSFVec3f() @@ -176,29 +177,29 @@ def step(self, action): for agent in self.agents: x, y, the = self.agent_list[self.agent_name_mapping[agent]].pos length = math.sqrt((x-ball_x)**2+(y-ball_y)**2) - #self.rewards[agent] += 0.2/length/40 + 
self.rewards[agent] += self.lesson["rewards"]["approach_ball"]/length/40 if length < 0.3: if agent.startswith("blue"): ball_dx, ball_dy = 4.5 - ball_x, 0 - ball_y ball_len = math.sqrt(ball_dx**2+ball_dy**2) ball_dx, ball_dy = ball_dx / ball_len, ball_dy / ball_len reward = round(ball_vel_x, 1) * ball_dx + round(ball_vel_y, 1) * ball_dy - self.rewards[agent] += max(reward, 0) * 10 + self.rewards[agent] += max(reward, 0) * self.lesson["rewards"]["dribble_or_kick"] elif agent.startswith("red"): ball_dx, ball_dy = 4.5 - ( -ball_x), 0 - (-ball_y) ball_len = math.sqrt(ball_dx**2+ball_dy**2) ball_dx, ball_dy = ball_dx / ball_len, ball_dy / ball_len reward = round(-ball_vel_x, 1) * ball_dx + round(-ball_vel_y, 1) * ball_dy - self.rewards[agent] += max(reward, 0) * 10 + self.rewards[agent] += max(reward, 0) * self.lesson["rewards"]["dribble_or_kick"] for agent in self.agents: # local rewards x, y, the = self.agent_list[self.agent_name_mapping[agent]].pos - if abs(x) > 5.0 or abs(y) > 3.5: - self.rewards[agent] += -1 + if abs(x) > 4.7 or abs(y) > 3.2: + self.rewards[agent] += self.lesson["rewards"]["off_field"] #if self.rewards[agent] > 0.1: # print("reward: "+str(agent)+" "+str(self.rewards[agent])) if self.agent_list[self.agent_name_mapping[agent]].is_replace: - self.rewards[agent] += -10 + self.rewards[agent] += self.lesson["rewards"]["fall_down"] self.agent_list[self.agent_name_mapping[agent]].is_replace = False print("reward(fall), reward: "+str(agent)+" "+str(self.rewards[agent])) @@ -207,17 +208,17 @@ def step(self, action): goal = True truncate = True if agent.startswith("blue"): - self.rewards[agent] += 1000 + self.rewards[agent] += self.lesson["rewards"]["score_goal"] elif agent.startswith("red"): - self.rewards[agent] += -1000 + self.rewards[agent] += self.lesson["rewards"]["lose_point"] print("Team blue Goal, reward: "+str(agent)+" "+str(self.rewards[agent])) elif ball_x < -4.5 and abs(ball_y) < 1.3: goal = True truncate = True if agent.startswith("blue"): - self.rewards[agent] += -1000 + self.rewards[agent] += self.lesson["rewards"]["lose_point"] elif agent.startswith("red"): - self.rewards[agent] += 1000 + self.rewards[agent] += self.lesson["rewards"]["score_goal"] print("Team red Goal, reward: "+str(agent)+" "+str(self.rewards[agent])) for agent in self.agents: @@ -231,13 +232,15 @@ def step(self, action): self.ball_pos.setSFVec3f([0, y, 0]) - # Actions Masking - #for agent in self.agents: - # ball_x, ball_y, _ = self.ball_pos.getSFVec3f() - # x, y, the = self.agent_list[self.agent_name_mapping[agent]].pos - # length = math.sqrt((x-ball_x)**2+(y-ball_y)**2) - # if length > 0.5: - # self.infos["env_defined_actions"][agent] = [] + # Actions Masking + for agent in self.agents: + ball_x, ball_y, _ = self.ball_pos.getSFVec3f() + x, y, the = self.agent_list[self.agent_name_mapping[agent]].pos + length = math.sqrt((x-ball_x)**2+(y-ball_y)**2) + if length > 0.5: + self.infos[agent]["action_mask"] = self.action_mask + else: + self.infos[agent]["action_mask"] = None if self.frames >= self.max_cycles: truncate = True @@ -303,6 +306,8 @@ def reinit(self): def reset(self, seed = None, options = None): if seed is not None: self._seed(seed=seed) + if options is not None: + self.lesson = options self.agents = copy.deepcopy(self.possible_agents) self._agent_selector.reinit(self.agents) self.agent_selection = self._agent_selector.next() diff --git a/controllers/agilerl_train/soccer/utils.py b/controllers/agilerl_train/soccer/utils.py new file mode 100644 index 0000000..5579fc3 --- /dev/null +++ 
b/controllers/agilerl_train/soccer/utils.py @@ -0,0 +1,331 @@ +import gymnasium as gym +import matplotlib.pyplot as plt +import numpy as np + +from agilerl.algorithms.cqn import CQN +from agilerl.algorithms.ddpg import DDPG +from agilerl.algorithms.dqn import DQN +from agilerl.algorithms.dqn_rainbow import RainbowDQN +from agilerl.algorithms.maddpg import MADDPG +#from agilerl.algorithms.matd3 import MATD3 +from soccer.matd3 import MATD3 +from agilerl.algorithms.ppo import PPO +from agilerl.algorithms.td3 import TD3 + + +def makeVectEnvs(env_name, num_envs=1): + """Returns async-vectorized gym environments. + + :param env_name: Gym environment name + :type env_name: str + :param num_envs: Number of vectorized environments, defaults to 1 + :type num_envs: int, optional + """ + return gym.vector.AsyncVectorEnv( + [lambda: gym.make(env_name) for i in range(num_envs)] + ) + + +def initialPopulation( + algo, + state_dim, + action_dim, + one_hot, + net_config, + INIT_HP, + actor_network=None, + critic_network=None, + population_size=1, + device="cpu", + accelerator=None, +): + """Returns population of identical agents. + + :param algo: RL algorithm + :type algo: str + :param state_dim: State observation dimension + :type state_dim: int + :param action_dim: Action dimension + :type action_dim: int + :param one_hot: One-hot encoding + :type one_hot: bool + :param INIT_HP: Initial hyperparameters + :type INIT_HP: dict + :param actor_network: Custom actor network, defaults to None + :type actor_network: nn.Module, optional + :param critic_network: Custom critic network, defaults to None + :type critic_network: nn.Module, optional + :param population_size: Number of agents in population, defaults to 1 + :type population_size: int, optional + :param device: Device for accelerated computing, 'cpu' or 'cuda', defaults to 'cpu' + :type device: str, optional + :param accelerator: Accelerator for distributed computing, defaults to None + :type accelerator: accelerate.Accelerator(), optional + """ + population = [] + + if algo == "DQN": + for idx in range(population_size): + agent = DQN( + state_dim=state_dim, + action_dim=action_dim, + one_hot=one_hot, + index=idx, + net_config=net_config, + batch_size=INIT_HP["BATCH_SIZE"], + lr=INIT_HP["LR"], + learn_step=INIT_HP["LEARN_STEP"], + gamma=INIT_HP["GAMMA"], + tau=INIT_HP["TAU"], + double=INIT_HP["DOUBLE"], + actor_network=actor_network, + device=device, + accelerator=accelerator, + ) + population.append(agent) + + elif algo == "Rainbow DQN": + for idx in range(population_size): + agent = RainbowDQN( + state_dim=state_dim, + action_dim=action_dim, + one_hot=one_hot, + index=idx, + net_config=net_config, + batch_size=INIT_HP["BATCH_SIZE"], + lr=INIT_HP["LR"], + learn_step=INIT_HP["LEARN_STEP"], + gamma=INIT_HP["GAMMA"], + tau=INIT_HP["TAU"], + beta=INIT_HP["BETA"], + prior_eps=INIT_HP["PRIOR_EPS"], + num_atoms=INIT_HP["NUM_ATOMS"], + v_min=INIT_HP["V_MIN"], + v_max=INIT_HP["V_MAX"], + n_step=INIT_HP["N_STEP"], + device=device, + accelerator=accelerator, + ) + population.append(agent) + + elif algo == "DDPG": + for idx in range(population_size): + agent = DDPG( + state_dim=state_dim, + action_dim=action_dim, + one_hot=one_hot, + index=idx, + net_config=net_config, + batch_size=INIT_HP["BATCH_SIZE"], + lr=INIT_HP["LR"], + learn_step=INIT_HP["LEARN_STEP"], + gamma=INIT_HP["GAMMA"], + tau=INIT_HP["TAU"], + policy_freq=INIT_HP["POLICY_FREQ"], + actor_network=actor_network, + critic_network=critic_network, + device=device, + accelerator=accelerator, + ) + 
population.append(agent) + + elif algo == "PPO": + for idx in range(population_size): + agent = PPO( + state_dim=state_dim, + action_dim=action_dim, + one_hot=one_hot, + discrete_actions=INIT_HP["DISCRETE_ACTIONS"], + index=idx, + net_config=net_config, + batch_size=INIT_HP["BATCH_SIZE"], + lr=INIT_HP["LR"], + gamma=INIT_HP["GAMMA"], + gae_lambda=INIT_HP["GAE_LAMBDA"], + action_std_init=INIT_HP["ACTION_STD_INIT"], + clip_coef=INIT_HP["CLIP_COEF"], + ent_coef=INIT_HP["ENT_COEF"], + vf_coef=INIT_HP["VF_COEF"], + max_grad_norm=INIT_HP["MAX_GRAD_NORM"], + target_kl=INIT_HP["TARGET_KL"], + update_epochs=INIT_HP["UPDATE_EPOCHS"], + actor_network=actor_network, + critic_network=critic_network, + device=device, + accelerator=accelerator, + ) + population.append(agent) + + elif algo == "CQN": + for idx in range(population_size): + agent = CQN( + state_dim=state_dim, + action_dim=action_dim, + one_hot=one_hot, + index=idx, + net_config=net_config, + batch_size=INIT_HP["BATCH_SIZE"], + lr=INIT_HP["LR"], + learn_step=INIT_HP["LEARN_STEP"], + gamma=INIT_HP["GAMMA"], + tau=INIT_HP["TAU"], + double=INIT_HP["DOUBLE"], + actor_network=actor_network, + device=device, + accelerator=accelerator, + ) + population.append(agent) + + elif algo == "TD3": + for idx in range(population_size): + agent = TD3( + state_dim=state_dim, + action_dim=action_dim, + one_hot=one_hot, + max_action=INIT_HP["MAX_ACTION"], + index=idx, + net_config=net_config, + batch_size=INIT_HP["BATCH_SIZE"], + lr=INIT_HP["LR"], + learn_step=INIT_HP["LEARN_STEP"], + gamma=INIT_HP["GAMMA"], + tau=INIT_HP["TAU"], + policy_freq=INIT_HP["POLICY_FREQ"], + actor_network=actor_network, + critic_networks=critic_network, + device=device, + accelerator=accelerator, + ) + population.append(agent) + + elif algo == "MADDPG": + for idx in range(population_size): + agent = MADDPG( + state_dims=state_dim, + action_dims=action_dim, + one_hot=one_hot, + n_agents=INIT_HP["N_AGENTS"], + agent_ids=INIT_HP["AGENT_IDS"], + index=idx, + max_action=INIT_HP["MAX_ACTION"], + min_action=INIT_HP["MIN_ACTION"], + net_config=net_config, + batch_size=INIT_HP["BATCH_SIZE"], + lr=INIT_HP["LR"], + learn_step=INIT_HP["LEARN_STEP"], + gamma=INIT_HP["GAMMA"], + tau=INIT_HP["TAU"], + discrete_actions=INIT_HP["DISCRETE_ACTIONS"], + actor_networks=actor_network, + critic_networks=critic_network, + device=device, + accelerator=accelerator, + ) + population.append(agent) + + elif algo == "MATD3": + for idx in range(population_size): + agent = MATD3( + state_dims=state_dim, + action_dims=action_dim, + one_hot=one_hot, + n_agents=INIT_HP["N_AGENTS"], + agent_ids=INIT_HP["AGENT_IDS"], + index=idx, + max_action=INIT_HP["MAX_ACTION"], + min_action=INIT_HP["MIN_ACTION"], + net_config=net_config, + batch_size=INIT_HP["BATCH_SIZE"], + lr=INIT_HP["LR"], + policy_freq=INIT_HP["POLICY_FREQ"], + learn_step=INIT_HP["LEARN_STEP"], + gamma=INIT_HP["GAMMA"], + tau=INIT_HP["TAU"], + discrete_actions=INIT_HP["DISCRETE_ACTIONS"], + actor_networks=actor_network, + critic_networks=critic_network, + device=device, + accelerator=accelerator, + ) + population.append(agent) + + return population + + +def calculate_vectorized_scores( + rewards, terminations, include_unterminated=False, only_first_episode=True +): + episode_rewards = [] + num_envs, _ = rewards.shape + + for env_index in range(num_envs): + # Find the indices where episodes terminate for the current environment + termination_indices = np.where(terminations[env_index] == 1)[0] + + # If no terminations, sum the entire reward array for this 
environment + if len(termination_indices) == 0: + episode_reward = np.sum(rewards[env_index]) + episode_rewards.append(episode_reward) + continue # Skip to the next environment + + # Initialize the starting index for segmenting + start_index = 0 + + for termination_index in termination_indices: + # Sum the rewards for the current episode + episode_reward = np.sum( + rewards[env_index, start_index : termination_index + 1] + ) + + # Store the episode reward + episode_rewards.append(episode_reward) + + # If only the first episode is required, break after processing it + if only_first_episode: + break + + # Update the starting index for segmenting + start_index = termination_index + 1 + + # If include_unterminated is True, sum the rewards from the last termination index to the end + if ( + not only_first_episode + and include_unterminated + and start_index < len(rewards[env_index]) + ): + episode_reward = np.sum(rewards[env_index, start_index:]) + episode_rewards.append(episode_reward) + + return episode_rewards + + +def printHyperparams(pop): + """Prints current hyperparameters of agents in a population and their fitnesses. + + :param pop: Population of agents + :type pop: list[object] + """ + + for agent in pop: + print( + "Agent ID: {} Mean 100 fitness: {:.2f} lr: {} Batch Size: {}".format( + agent.index, np.mean(agent.fitness[-100:]), agent.lr, agent.batch_size + ) + ) + + +def plotPopulationScore(pop): + """Plots the fitness scores of agents in a population. + + :param pop: Population of agents + :type pop: list[object] + """ + plt.figure() + for agent in pop: + scores = agent.fitness + steps = agent.steps[:-1] + plt.plot(steps, scores) + plt.title("Score History - Mutations") + plt.xlabel("Steps") + plt.ylim(bottom=-400) + plt.show()
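
Note on the curriculum wiring added above: it can be exercised on its own. The sketch below is a minimal illustration, assuming only that the three lesson files added under soccer/config/ are present and that the environment's reset() accepts the lesson dict via options, as soccer.py now does; run_one_lesson is a hypothetical stand-in for the per-lesson training loop in agilerl_train.py, not part of the diff.

import yaml

LESSON_DIR = "./soccer/config"

def run_curriculum(env, run_one_lesson, n_lessons=3):
    """Load each lesson config in turn and hand it to the environment.

    soccer.py stores the dict passed as `options` in self.lesson and reads
    lesson["rewards"]["approach_ball"], ["dribble_or_kick"], etc. when
    shaping rewards inside step().
    """
    for lesson_number in range(1, n_lessons + 1):
        with open(f"{LESSON_DIR}/lesson{lesson_number}.yaml") as f:
            lesson = yaml.safe_load(f)
        env.reset(options=lesson)   # env keeps the lesson for reward shaping
        run_one_lesson(env, lesson)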
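
The action-masking path added to MATD3.getAction boils down to a masked argmax: a mask value of 1 marks an action as unavailable (numpy.ma hides entries whose mask is truthy), which is how the env's [0, 0, 0, 0, 0, 0, 1, 1] mask disables the two kick motions whenever a robot is more than 0.5 m from the ball. A self-contained sketch of that logic follows; the function name is illustrative, not part of the diff.

import numpy as np

def masked_epsilon_greedy(action_values, action_mask, epsilon, rng=None):
    """Pick a discrete action index while ignoring masked-out actions."""
    rng = np.random.default_rng() if rng is None else rng
    if rng.random() < epsilon:
        scores = rng.random(len(action_values))          # random exploration
    else:
        scores = np.asarray(action_values, dtype=float)  # greedy on Q-values
    # numpy.ma treats truthy mask entries as hidden, so argmax never
    # returns a disabled action.
    masked = np.ma.array(scores, mask=action_mask)
    return int(np.argmax(masked, axis=-1))

q_values = [0.1, 0.3, 0.2, 0.0, 0.05, 0.15, 0.9, 0.8]  # kicks score highest...
mask     = [0,   0,   0,   0,   0,    0,    1,   1  ]  # ...but are masked off
print(masked_epsilon_greedy(q_values, mask, epsilon=0.0))  # -> 1 (best walk)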