From 436e824359e99f7a04e915641cbc59cfcdda8459 Mon Sep 17 00:00:00 2001
From: omid
Date: Mon, 29 Apr 2024 17:12:24 +0200
Subject: [PATCH 1/3] Fix EUPG to use the discount factor

---
 examples/eupg_fishwood.py                | 15 +++++++++++----
 morl_baselines/single_policy/esr/eupg.py | 21 +++++++++++++++------
 tests/test_algos.py                      | 13 ++++++++++---
 3 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/examples/eupg_fishwood.py b/examples/eupg_fishwood.py
index ae27bc4e..5c87da85 100644
--- a/examples/eupg_fishwood.py
+++ b/examples/eupg_fishwood.py
@@ -4,16 +4,23 @@
 
 from morl_baselines.common.evaluation import eval_mo_reward_conditioned
 from morl_baselines.single_policy.esr.eupg import EUPG
-
+import torch as th
 
 if __name__ == "__main__":
     env = MORecordEpisodeStatistics(mo_gym.make("fishwood-v0"), gamma=0.99)
     eval_env = mo_gym.make("fishwood-v0")
 
-    def scalarization(reward: np.ndarray, w):
-        return min(reward[0], reward[1] // 2)
+    def scalarization(reward: np.ndarray, w=None):
+        reward = th.tensor(reward) if not isinstance(reward, th.Tensor) else reward
+        # Handle the case when reward is a single tensor of shape (2, )
+        if reward.dim() == 1 and reward.size(0) == 2:
+            return min(reward[0], reward[1] // 2).item()
+
+        # Handle the case when reward is a tensor of shape (200, 2)
+        elif reward.dim() == 2 and reward.size(1) == 2:
+            return th.min(reward[:, 0], reward[:, 1] // 2)
 
-    agent = EUPG(env, scalarization=scalarization, weights=np.ones(2), gamma=0.99, log=True, learning_rate=0.001)
+    agent = EUPG(env, scalarization=scalarization, weights=np.ones(2), gamma=0.99, log=False, learning_rate=0.001)
     agent.train(total_timesteps=int(4e6), eval_env=eval_env, eval_freq=1000)
 
     print(eval_mo_reward_conditioned(agent, env=eval_env, scalarization=scalarization))
diff --git a/morl_baselines/single_policy/esr/eupg.py b/morl_baselines/single_policy/esr/eupg.py
index 3799e66f..480d14c4 100644
--- a/morl_baselines/single_policy/esr/eupg.py
+++ b/morl_baselines/single_policy/esr/eupg.py
@@ -214,9 +214,7 @@ def eval(self, obs: np.ndarray, accrued_reward: Optional[np.ndarray]) -> Union[i
         else:
             obs = th.as_tensor(obs).to(self.device)
         accrued_reward = th.as_tensor(accrued_reward).float().to(self.device)
-        probas = self.net(obs, accrued_reward)
-        greedy_act = th.argmax(probas)
-        return greedy_act.detach().item()
+        return self.__choose_action(obs, accrued_reward)
 
     @th.no_grad()
     def __choose_action(self, obs: th.Tensor, accrued_reward: th.Tensor) -> int:
@@ -234,16 +232,18 @@ def update(self):
             next_obs,
             terminateds,
         ) = self.buffer.get_all_data(to_tensor=True, device=self.device)
-        # Scalarized episodic reward, our target :-)
+
         episodic_return = th.sum(rewards, dim=0)
         scalarized_return = self.scalarization(episodic_return.cpu().numpy(), self.weights)
         scalarized_return = th.scalar_tensor(scalarized_return).to(self.device)
+        discounted_forward_rewards = self._forward_cumulative_rewards(rewards)
+        scalarized_values = self.scalarization(discounted_forward_rewards)
 
         # For each sample in the batch, get the distribution over actions
         current_distribution = self.net.distribution(obs, accrued_rewards)
         # Policy gradient
-        log_probs = current_distribution.log_prob(actions)
-        loss = -th.mean(log_probs * scalarized_return)
+        log_probs = current_distribution.log_prob(actions.squeeze())
+        loss = -th.mean(log_probs * scalarized_values)
 
         self.optimizer.zero_grad()
         loss.backward()
@@ -259,6 +259,15 @@
                 },
             )
 
+    def _forward_cumulative_rewards(self, rewards):
+        flip_rewards = rewards.flip(dims=[0])
+        cumulative_rewards = th.zeros(self.reward_dim).to(self.device)
+        for i in range(len(rewards)):
+            cumulative_rewards = self.gamma * cumulative_rewards + flip_rewards[i]
+            flip_rewards[i] = cumulative_rewards
+        forward_rewards = flip_rewards.flip(dims=[0])
+        return forward_rewards
+
     def train(self, total_timesteps: int, eval_env: Optional[gym.Env] = None, eval_freq: int = 1000, start_time=None):
         """Train the agent.
 
diff --git a/tests/test_algos.py b/tests/test_algos.py
index 77337754..b1da88d2 100644
--- a/tests/test_algos.py
+++ b/tests/test_algos.py
@@ -1,6 +1,6 @@
 """Mostly tests to make sure the algorithms are able to run."""
 import time
-
+import torch as th
 import mo_gymnasium as mo_gym
 import numpy as np
 from mo_gymnasium.envs.deep_sea_treasure.deep_sea_treasure import CONCAVE_MAP
@@ -54,8 +54,15 @@ def test_eupg():
     env = mo_gym.make("fishwood-v0")
     eval_env = mo_gym.make("fishwood-v0")
 
-    def scalarization(reward: np.ndarray, w):
-        return min(reward[0], (reward[1] // 2) + 1)
+    def scalarization(reward: np.ndarray, w=None):
+        reward = th.tensor(reward) if not isinstance(reward, th.Tensor) else reward
+        # Handle the case when reward is a single tensor of shape (2, )
+        if reward.dim() == 1 and reward.size(0) == 2:
+            return min(reward[0], reward[1] // 2).item()
+
+        # Handle the case when reward is a tensor of shape (200, 2)
+        elif reward.dim() == 2 and reward.size(1) == 2:
+            return th.min(reward[:, 0], reward[:, 1] // 2)
 
     agent = EUPG(env, scalarization=scalarization, gamma=0.99, log=False)
     agent.train(total_timesteps=10000, eval_env=eval_env, eval_freq=100)

From 9267836369664763a0dac870fb0132b35d2a432f Mon Sep 17 00:00:00 2001
From: omid
Date: Mon, 29 Apr 2024 17:22:25 +0200
Subject: [PATCH 2/3] Set log to True

---
 examples/eupg_fishwood.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/eupg_fishwood.py b/examples/eupg_fishwood.py
index 5c87da85..2ed62f22 100644
--- a/examples/eupg_fishwood.py
+++ b/examples/eupg_fishwood.py
@@ -20,7 +20,7 @@ def scalarization(reward: np.ndarray, w=None):
         elif reward.dim() == 2 and reward.size(1) == 2:
             return th.min(reward[:, 0], reward[:, 1] // 2)
 
-    agent = EUPG(env, scalarization=scalarization, weights=np.ones(2), gamma=0.99, log=False, learning_rate=0.001)
+    agent = EUPG(env, scalarization=scalarization, weights=np.ones(2), gamma=0.99, log=True, learning_rate=0.001)
     agent.train(total_timesteps=int(4e6), eval_env=eval_env, eval_freq=1000)
 
     print(eval_mo_reward_conditioned(agent, env=eval_env, scalarization=scalarization))

From 2868f29032dbd3c4071d56ecb6db2f25786a43ae Mon Sep 17 00:00:00 2001
From: omid
Date: Fri, 3 May 2024 10:07:27 +0200
Subject: [PATCH 3/3] Change the order of the imports

---
 examples/eupg_fishwood.py | 3 ++-
 tests/test_algos.py       | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/eupg_fishwood.py b/examples/eupg_fishwood.py
index 2ed62f22..7b253522 100644
--- a/examples/eupg_fishwood.py
+++ b/examples/eupg_fishwood.py
@@ -1,10 +1,11 @@
 import mo_gymnasium as mo_gym
 import numpy as np
+import torch as th
 from mo_gymnasium.utils import MORecordEpisodeStatistics
 
 from morl_baselines.common.evaluation import eval_mo_reward_conditioned
 from morl_baselines.single_policy.esr.eupg import EUPG
-import torch as th
+
 
 if __name__ == "__main__":
     env = MORecordEpisodeStatistics(mo_gym.make("fishwood-v0"), gamma=0.99)
     eval_env = mo_gym.make("fishwood-v0")
diff --git a/tests/test_algos.py b/tests/test_algos.py
index b1da88d2..d64174e7 100644
--- a/tests/test_algos.py
+++ b/tests/test_algos.py
@@ -1,8 +1,9 @@
 """Mostly tests to make sure the algorithms are able to run."""
 import time
-import torch as th
+
 import mo_gymnasium as mo_gym
 import numpy as np
+import torch as th
 from mo_gymnasium.envs.deep_sea_treasure.deep_sea_treasure import CONCAVE_MAP
 
 from morl_baselines.common.evaluation import eval_mo, eval_mo_reward_conditioned
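
Reviewer note (not part of the patches): below is a small standalone sketch of the discounted reward-to-go computation that the new `_forward_cumulative_rewards` method introduces, applied to a toy fishwood-style episode. The helper name, the 3-step episode, and `gamma=0.99` are illustrative assumptions, not code from the diff.

import torch as th


def forward_cumulative_rewards(rewards: th.Tensor, gamma: float) -> th.Tensor:
    """Discounted reward-to-go per step: out[t] = rewards[t] + gamma * out[t + 1]."""
    flipped = rewards.flip(dims=[0]).clone()  # walk the episode backwards
    running = th.zeros(rewards.shape[1])
    for i in range(len(flipped)):
        running = gamma * running + flipped[i]
        flipped[i] = running
    return flipped.flip(dims=[0])  # back to chronological order


if __name__ == "__main__":
    # Toy 3-step episode with 2 objectives (fish, wood).
    rewards = th.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
    rtg = forward_cumulative_rewards(rewards, gamma=0.99)
    # rtg[0] == rewards[0] + 0.99 * rewards[1] + 0.99**2 * rewards[2]
    print(rtg)
    # Per-step scalarization, mirroring the batched branch of the new scalarization:
    print(th.min(rtg[:, 0], rtg[:, 1] // 2))

The loop mirrors the patched method: it accumulates gamma * running + r_t from the last transition backwards, then flips back so index t holds the discounted vector return from step t onwards, which the patched update() scalarizes per step instead of using a single episodic return.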