From ecf5112eee62905dd9c3afc379c47fef7e5bebee Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 17 Jan 2022 06:13:49 +0900 Subject: [PATCH 01/13] experiment: generalized policy gradient --- handyrl/agent.py | 32 ++++++++++++++------------------ handyrl/envs/tictactoe.py | 19 +++++++++++++++++-- handyrl/generation.py | 17 ++++++----------- handyrl/model.py | 16 +++++++++++++--- handyrl/train.py | 18 ++++++++++-------- 5 files changed, 60 insertions(+), 42 deletions(-) diff --git a/handyrl/agent.py b/handyrl/agent.py index 72f1778b..6165892b 100755 --- a/handyrl/agent.py +++ b/handyrl/agent.py @@ -30,16 +30,16 @@ def action(self, env, player, show=False): return random.choice(env.legal_actions(player)) -def print_outputs(env, prob, v): +def print_outputs(env, action, prob, v): if hasattr(env, 'print_outputs'): - env.print_outputs(prob, v) + env.print_outputs(action, prob, v) else: print('v = %f' % v) - print('p = %s' % (prob * 1000).astype(int)) + print('a = %d prob = %f' % (action, prob)) class Agent: - def __init__(self, model, observation=False, temperature=0.0): + def __init__(self, model, observation=False, temperature=1e-6): # model might be a neural net, or some planning algorithm such as game tree search self.model = model self.hidden = None @@ -49,28 +49,24 @@ def __init__(self, model, observation=False, temperature=0.0): def reset(self, env, show=False): self.hidden = self.model.init_hidden() - def plan(self, obs): - outputs = self.model.inference(obs, self.hidden) + def plan(self, obs, legal_actions): + outputs = self.model.inference(obs, self.hidden, legal_actions=legal_actions, temperature=self.temperature) self.hidden = outputs.pop('hidden', None) return outputs def action(self, env, player, show=False): - outputs = self.plan(env.observation(player)) - actions = env.legal_actions(player) - p = outputs['policy'] + obs = env.observation(player) + legal_actions = env.legal_actions(player) + outputs = self.plan(obs, legal_actions) + + action = outputs['action'] + prob = outputs['selected_prob'] v = outputs.get('value', None) - mask = np.ones_like(p) - mask[actions] = 0 - p = p - mask * 1e32 if show: - print_outputs(env, softmax(p), v) + print_outputs(env, action, selected_prob, v) - if self.temperature == 0: - ap_list = sorted([(a, p[a]) for a in actions], key=lambda x: -x[1]) - return ap_list[0][0] - else: - return random.choices(np.arange(len(p)), weights=softmax(p / self.temperature))[0] + return action def observe(self, env, player, show=False): v = None diff --git a/handyrl/envs/tictactoe.py b/handyrl/envs/tictactoe.py index 2c27809c..f0db354c 100755 --- a/handyrl/envs/tictactoe.py +++ b/handyrl/envs/tictactoe.py @@ -10,6 +10,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +import torch.distributions as dist from ..environment import BaseEnvironment @@ -59,14 +60,28 @@ def __init__(self): self.head_p = Head((filters, 3, 3), 2, 9) self.head_v = Head((filters, 3, 3), 1, 1) - def forward(self, x, hidden=None): + def forward(self, x, hidden=None, action=None, action_mask=None, legal_actions=None, temperature=1.0): h = F.relu(self.conv(x)) for block in self.blocks: h = F.relu(block(h)) h_p = self.head_p(h) h_v = self.head_v(h) - return {'policy': h_p, 'value': torch.tanh(h_v)} + if action_mask is None: + assert legal_actions is not None + action_mask = torch.ones_like(h_p) * 1e32 + action_mask[:,legal_actions] = 0 + p = (h_p - action_mask) / temperature + + log_prob = F.log_softmax(p, -1) + prob = torch.exp(log_prob) + entropy = 
dist.Categorical(logits=log_prob).entropy().unsqueeze(-1) + + if action is None: + action = prob.multinomial(num_samples=1, replacement=True) + selected_prob = prob.gather(-1, action) + + return {'action': action, 'selected_prob': selected_prob, 'value': torch.tanh(h_v), 'entropy': entropy, 'action_mask': action_mask} class Environment(BaseEnvironment): diff --git a/handyrl/generation.py b/handyrl/generation.py index f426e5d2..e2b69f37 100755 --- a/handyrl/generation.py +++ b/handyrl/generation.py @@ -37,7 +37,9 @@ def generate(self, models, args): if player in turn_players or self.args['observation']: obs = self.env.observation(player) model = models[player] - outputs = model.inference(obs, hidden[player]) + + legal_actions = self.env.legal_actions(player) + outputs = model.inference(obs, hidden[player], legal_actions=legal_actions) hidden[player] = outputs.get('hidden', None) v = outputs.get('value', None) @@ -45,16 +47,9 @@ def generate(self, models, args): moment['value'][player] = v if player in turn_players: - p_ = outputs['policy'] - legal_actions = self.env.legal_actions(player) - action_mask = np.ones_like(p_) * 1e32 - action_mask[legal_actions] = 0 - p = softmax(p_ - action_mask) - action = random.choices(legal_actions, weights=p[legal_actions])[0] - - moment['selected_prob'][player] = p[action] - moment['action_mask'][player] = action_mask - moment['action'][player] = action + moment['action'][player] = outputs['action'][0] + moment['selected_prob'][player] = outputs['selected_prob'][0] + moment['action_mask'][player] = outputs['action_mask'] err = self.env.step(moment['action']) if err: diff --git a/handyrl/model.py b/handyrl/model.py index 621d703f..0a06ae97 100755 --- a/handyrl/model.py +++ b/handyrl/model.py @@ -6,6 +6,9 @@ import os os.environ['OMP_NUM_THREADS'] = '1' +import copy +import random + import numpy as np import torch torch.set_num_threads(1) @@ -63,8 +66,15 @@ def __init__(self, model, x): super().__init__() wrapped_model = ModelWrapper(model) hidden = wrapped_model.init_hidden() - outputs = wrapped_model.inference(x, hidden) + outputs = wrapped_model.inference(x, hidden, legal_actions=[]) self.output_dict = {key: np.zeros_like(value) for key, value in outputs.items()} - def inference(self, *args): - return self.output_dict + def inference(self, *args, **kwargs): + outputs = copy.deepcopy(self.output_dict) + outputs['action'].fill(random.choice(kwargs['legal_actions'])) + prob = 1.0 / len(kwargs['legal_actions']) + outputs['selected_prob'].fill(prob) + outputs['entropy'].fill(-np.log(prob)) + outputs['action_mask'].fill(1e32) + outputs['action_mask'][kwargs['legal_actions']] = 0 + return outputs diff --git a/handyrl/train.py b/handyrl/train.py index 1e76ac1f..52ff20f8 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -64,12 +64,12 @@ def replace_none(a, b): if args['turn_based_training'] and not args['observation']: obs = [[m['observation'][m['turn'][0]]] for m in moments] prob = np.array([[[m['selected_prob'][m['turn'][0]]]] for m in moments]) - act = np.array([[m['action'][m['turn'][0]]] for m in moments], dtype=np.int64)[..., np.newaxis] + act = np.array([[[m['action'][m['turn'][0]]]] for m in moments], dtype=np.int64) amask = np.array([[m['action_mask'][m['turn'][0]]] for m in moments]) else: obs = [[replace_none(m['observation'][player], obs_zeros) for player in players] for m in moments] prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players] for m in moments]) - act = np.array([[replace_none(m['action'][player], 0) 
for player in players] for m in moments], dtype=np.int64)[..., np.newaxis] + act = np.array([[[replace_none(m['action'][player], 0)] for player in players] for m in moments], dtype=np.int64) amask = np.array([[replace_none(m['action_mask'][player], amask_zeros + 1e32) for player in players] for m in moments]) # reshape observation @@ -138,7 +138,9 @@ def forward_prediction(model, hidden, batch, args): if hidden is None: # feed-forward neural network obs = map_r(observations, lambda o: o.view(-1, *o.size()[3:])) - outputs = model(obs, None) + action = action=batch['action'].view(-1, *batch['action'].size()[3:]) + action_mask = batch['action_mask'].view(-1, *batch['action_mask'].size()[3:]) + outputs = model(obs, None, action=action, action_mask=action_mask) else: # sequential computation with RNN outputs = {} @@ -163,9 +165,9 @@ def forward_prediction(model, hidden, batch, args): for k, o in outputs.items(): o = o.view(*batch['turn_mask'].size()[:2], -1, o.size(-1)) - if k == 'policy': + if k == 'selected_prob': # gather turn player's policies - outputs[k] = o.mul(batch['turn_mask']).sum(2, keepdim=True) - batch['action_mask'] + outputs[k] = o.mul(batch['turn_mask']).sum(2, keepdim=True) else: # mask valid target values and cumulative rewards outputs[k] = o.mul(batch['observation_mask']) @@ -193,11 +195,11 @@ def compose_losses(outputs, log_selected_policies, total_advantages, targets, ba if 'return' in outputs: losses['r'] = F.smooth_l1_loss(outputs['return'], targets['return'], reduction='none').mul(omasks).sum() - entropy = dist.Categorical(logits=outputs['policy']).entropy().mul(tmasks.sum(-1)) + entropy = outputs['entropy'].mul(tmasks) losses['ent'] = entropy.sum() base_loss = losses['p'] + losses.get('v', 0) + losses.get('r', 0) - entropy_loss = entropy.mul(1 - batch['progress'] * (1 - args['entropy_regularization_decay'])).sum() * -args['entropy_regularization'] + entropy_loss = entropy.mul(1 - batch['progress'].unsqueeze(-2) * (1 - args['entropy_regularization_decay'])).sum() * -args['entropy_regularization'] losses['total'] = base_loss + entropy_loss return losses, dcnt @@ -210,7 +212,7 @@ def compute_loss(batch, model, hidden, args): clip_rho_threshold, clip_c_threshold = 1.0, 1.0 log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks - log_selected_t_policies = F.log_softmax(outputs['policy'], dim=-1).gather(-1, actions) * emasks + log_selected_t_policies = torch.log(torch.clamp(outputs['selected_prob'], 1e-16, 1)) * emasks # thresholds of importance sampling log_rhos = log_selected_t_policies.detach() - log_selected_b_policies From f33745ab5d397b6652e50f7f90aa2b90c1509507 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 17 Jan 2022 08:10:17 +0900 Subject: [PATCH 02/13] feature: remove action mask --- handyrl/agent.py | 7 +++---- handyrl/envs/tictactoe.py | 16 +++++++--------- handyrl/generation.py | 6 ++---- handyrl/model.py | 21 --------------------- handyrl/train.py | 12 +++--------- handyrl/worker.py | 7 +------ 6 files changed, 16 insertions(+), 53 deletions(-) diff --git a/handyrl/agent.py b/handyrl/agent.py index 6165892b..4100d5ec 100755 --- a/handyrl/agent.py +++ b/handyrl/agent.py @@ -49,15 +49,14 @@ def __init__(self, model, observation=False, temperature=1e-6): def reset(self, env, show=False): self.hidden = self.model.init_hidden() - def plan(self, obs, legal_actions): - outputs = self.model.inference(obs, self.hidden, legal_actions=legal_actions, temperature=self.temperature) + def plan(self, obs): + outputs = 
self.model.inference(obs, self.hidden, temperature=self.temperature) self.hidden = outputs.pop('hidden', None) return outputs def action(self, env, player, show=False): obs = env.observation(player) - legal_actions = env.legal_actions(player) - outputs = self.plan(obs, legal_actions) + outputs = self.plan(obs) action = outputs['action'] prob = outputs['selected_prob'] diff --git a/handyrl/envs/tictactoe.py b/handyrl/envs/tictactoe.py index f0db354c..a6125c5f 100755 --- a/handyrl/envs/tictactoe.py +++ b/handyrl/envs/tictactoe.py @@ -60,19 +60,14 @@ def __init__(self): self.head_p = Head((filters, 3, 3), 2, 9) self.head_v = Head((filters, 3, 3), 1, 1) - def forward(self, x, hidden=None, action=None, action_mask=None, legal_actions=None, temperature=1.0): + def forward(self, x, hidden=None, action=None, temperature=1.0): h = F.relu(self.conv(x)) for block in self.blocks: h = F.relu(block(h)) h_p = self.head_p(h) h_v = self.head_v(h) - if action_mask is None: - assert legal_actions is not None - action_mask = torch.ones_like(h_p) * 1e32 - action_mask[:,legal_actions] = 0 - p = (h_p - action_mask) / temperature - + p = h_p / temperature log_prob = F.log_softmax(p, -1) prob = torch.exp(log_prob) entropy = dist.Categorical(logits=log_prob).entropy().unsqueeze(-1) @@ -81,7 +76,7 @@ def forward(self, x, hidden=None, action=None, action_mask=None, legal_actions=N action = prob.multinomial(num_samples=1, replacement=True) selected_prob = prob.gather(-1, action) - return {'action': action, 'selected_prob': selected_prob, 'value': torch.tanh(h_v), 'entropy': entropy, 'action_mask': action_mask} + return {'action': action, 'selected_prob': selected_prob, 'value': torch.tanh(h_v), 'entropy': entropy} class Environment(BaseEnvironment): @@ -119,7 +114,10 @@ def play(self, action, _=None): # state transition function # action is integer (0 ~ 8) x, y = action // 3, action % 3 - self.board[x, y] = self.color + if self.board[x, y] != 0: # illegal action + self.win_color = -self.color + else: + self.board[x, y] = self.color # check winning condition win = self.board[x, :].sum() == 3 * self.color \ diff --git a/handyrl/generation.py b/handyrl/generation.py index e2b69f37..d8974479 100755 --- a/handyrl/generation.py +++ b/handyrl/generation.py @@ -29,7 +29,7 @@ def generate(self, models, args): return None while not self.env.terminal(): - moment_keys = ['observation', 'selected_prob', 'action_mask', 'action', 'value', 'reward', 'return'] + moment_keys = ['observation', 'selected_prob', 'action', 'value', 'reward', 'return'] moment = {key: {p: None for p in self.env.players()} for key in moment_keys} turn_players = self.env.turns() @@ -38,8 +38,7 @@ def generate(self, models, args): obs = self.env.observation(player) model = models[player] - legal_actions = self.env.legal_actions(player) - outputs = model.inference(obs, hidden[player], legal_actions=legal_actions) + outputs = model.inference(obs, hidden[player]) hidden[player] = outputs.get('hidden', None) v = outputs.get('value', None) @@ -49,7 +48,6 @@ def generate(self, models, args): if player in turn_players: moment['action'][player] = outputs['action'][0] moment['selected_prob'][player] = outputs['selected_prob'][0] - moment['action_mask'][player] = outputs['action_mask'] err = self.env.step(moment['action']) if err: diff --git a/handyrl/model.py b/handyrl/model.py index 0a06ae97..deb1ee42 100755 --- a/handyrl/model.py +++ b/handyrl/model.py @@ -57,24 +57,3 @@ def inference(self, x, hidden, **kwargs): ht = map_r(hidden, lambda h: 
torch.from_numpy(np.array(h)).contiguous().unsqueeze(0) if h is not None else None) outputs = self.forward(xt, ht, **kwargs) return map_r(outputs, lambda o: o.detach().numpy().squeeze(0) if o is not None else None) - - -# simple model - -class RandomModel(nn.Module): - def __init__(self, model, x): - super().__init__() - wrapped_model = ModelWrapper(model) - hidden = wrapped_model.init_hidden() - outputs = wrapped_model.inference(x, hidden, legal_actions=[]) - self.output_dict = {key: np.zeros_like(value) for key, value in outputs.items()} - - def inference(self, *args, **kwargs): - outputs = copy.deepcopy(self.output_dict) - outputs['action'].fill(random.choice(kwargs['legal_actions'])) - prob = 1.0 / len(kwargs['legal_actions']) - outputs['selected_prob'].fill(prob) - outputs['entropy'].fill(-np.log(prob)) - outputs['action_mask'].fill(1e32) - outputs['action_mask'][kwargs['legal_actions']] = 0 - return outputs diff --git a/handyrl/train.py b/handyrl/train.py index 52ff20f8..f737b7b8 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -58,19 +58,16 @@ def replace_none(a, b): players = [random.choice(players)] obs_zeros = map_r(moments[0]['observation'][moments[0]['turn'][0]], lambda o: np.zeros_like(o)) # template for padding - amask_zeros = np.zeros_like(moments[0]['action_mask'][moments[0]['turn'][0]]) # template for padding # data that is chainge by training configuration if args['turn_based_training'] and not args['observation']: obs = [[m['observation'][m['turn'][0]]] for m in moments] prob = np.array([[[m['selected_prob'][m['turn'][0]]]] for m in moments]) act = np.array([[[m['action'][m['turn'][0]]]] for m in moments], dtype=np.int64) - amask = np.array([[m['action_mask'][m['turn'][0]]] for m in moments]) else: obs = [[replace_none(m['observation'][player], obs_zeros) for player in players] for m in moments] prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players] for m in moments]) act = np.array([[[replace_none(m['action'][player], 0)] for player in players] for m in moments], dtype=np.int64) - amask = np.array([[replace_none(m['action_mask'][player], amask_zeros + 1e32) for player in players] for m in moments]) # reshape observation obs = rotate(rotate(obs)) # (T, P, ..., ...) -> (P, ..., T, ...) -> (..., T, P, ...) 
@@ -100,14 +97,13 @@ def replace_none(a, b): emask = np.pad(emask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) tmask = np.pad(tmask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) omask = np.pad(omask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) - amask = np.pad(amask, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=1e32) progress = np.pad(progress, [(0, pad_len), (0, 0)], 'constant', constant_values=1) obss.append(obs) - datum.append((prob, v, act, oc, rew, ret, emask, tmask, omask, amask, progress)) + datum.append((prob, v, act, oc, rew, ret, emask, tmask, omask, progress)) obs = to_torch(bimap_r(obs_zeros, rotate(obss), lambda _, o: np.array(o))) - prob, v, act, oc, rew, ret, emask, tmask, omask, amask, progress = [to_torch(np.array(val)) for val in zip(*datum)] + prob, v, act, oc, rew, ret, emask, tmask, omask, progress = [to_torch(np.array(val)) for val in zip(*datum)] return { 'observation': obs, @@ -116,7 +112,6 @@ def replace_none(a, b): 'reward': rew, 'return': ret, 'episode_mask': emask, 'turn_mask': tmask, 'observation_mask': omask, - 'action_mask': amask, 'progress': progress, } @@ -139,8 +134,7 @@ def forward_prediction(model, hidden, batch, args): # feed-forward neural network obs = map_r(observations, lambda o: o.view(-1, *o.size()[3:])) action = action=batch['action'].view(-1, *batch['action'].size()[3:]) - action_mask = batch['action_mask'].view(-1, *batch['action_mask'].size()[3:]) - outputs = model(obs, None, action=action, action_mask=action_mask) + outputs = model(obs, None, action=action) else: # sequential computation with RNN outputs = {} diff --git a/handyrl/worker.py b/handyrl/worker.py index 58cd12f7..faec64bc 100755 --- a/handyrl/worker.py +++ b/handyrl/worker.py @@ -19,7 +19,7 @@ from .connection import connect_socket_connection, accept_socket_connections from .evaluation import Evaluator from .generation import Generator -from .model import ModelWrapper, RandomModel +from .model import ModelWrapper class Worker: @@ -51,11 +51,6 @@ def _gather_models(self, model_ids): else: # get model from server model = pickle.loads(send_recv(self.conn, ('model', model_id))) - if model_id == 0: - # use random model - self.env.reset() - obs = self.env.observation(self.env.players()[0]) - model = RandomModel(model, obs) model_pool[model_id] = ModelWrapper(model) # update latest model if model_id > self.latest_model[0]: From c239a431195185ebfe743d5fb0aade322b1cfb65 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 17 Jan 2022 09:18:54 +0900 Subject: [PATCH 03/13] feature: generalized policy for RNN --- handyrl/envs/geister.py | 19 +++++++++++++++++-- handyrl/train.py | 5 +++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py index 747beec2..136a24ee 100755 --- a/handyrl/envs/geister.py +++ b/handyrl/envs/geister.py @@ -10,6 +10,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +import torch.distributions as dist from ..environment import BaseEnvironment @@ -148,7 +149,7 @@ def __init__(self): def init_hidden(self, batch_size=None): return self.body.init_hidden(self.input_size[1:], batch_size) - def forward(self, x, hidden): + def forward(self, x, hidden, action=None, action_mask=None, legal_actions=None, temperature=1.0): b, s = x['board'], x['scalar'] h_s = s.view(*s.size(), 1, 1).repeat(1, 1, 6, 6) h = torch.cat([h_s, b], -3) @@ -164,7 +165,21 @@ def forward(self, x, hidden): h_v = self.head_v(h) h_r = self.head_r(h) - return 
{'policy': h_p, 'value': torch.tanh(h_v), 'return': h_r, 'hidden': hidden} + if action_mask is None: + assert legal_actions is not None + action_mask = torch.ones_like(h_p) * 1e32 + action_mask[:,legal_actions] = 0 + p = (h_p - action_mask) / temperature + + log_prob = F.log_softmax(p, -1) + prob = torch.exp(log_prob) + entropy = dist.Categorical(logits=log_prob).entropy().unsqueeze(-1) + + if action is None: + action = prob.multinomial(num_samples=1, replacement=True) + selected_prob = prob.gather(-1, action) + + return {'action': action, 'selected_prob': selected_prob, 'value': torch.tanh(h_v), 'return': h_r, 'hidden': hidden, 'entropy': entropy, 'action_mask': action_mask} class Environment(BaseEnvironment): diff --git a/handyrl/train.py b/handyrl/train.py index 52ff20f8..d898855b 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -17,7 +17,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -import torch.distributions as dist import torch.optim as optim import psutil @@ -146,6 +145,8 @@ def forward_prediction(model, hidden, batch, args): outputs = {} for t in range(batch['turn_mask'].size(1)): obs = map_r(observations, lambda o: o[:, t].reshape(-1, *o.size()[3:])) # (..., B * P, ...) + action = action=batch['action'][:, t].view(-1, *batch['action'].size()[3:]) + action_mask = batch['action_mask'][:, t].view(-1, *batch['action_mask'].size()[3:]) omask_ = batch['observation_mask'][:, t] omask = map_r(hidden, lambda h: omask_.view(*h.size()[:2], *([1] * (len(h.size()) - 2)))) hidden_ = bimap_r(hidden, omask, lambda h, m: h * m) # (..., B, P, ...) @@ -153,7 +154,7 @@ def forward_prediction(model, hidden, batch, args): hidden_ = map_r(hidden_, lambda h: h.sum(1)) # (..., B * 1, ...) else: hidden_ = map_r(hidden_, lambda h: h.view(-1, *h.size()[2:])) # (..., B * P, ...) 
- outputs_ = model(obs, hidden_) + outputs_ = model(obs, hidden_, action=action, action_mask=action_mask) for k, o in outputs_.items(): if k == 'hidden': next_hidden = outputs_['hidden'] From 30960aad8d94b37ff1cfe2c727ec8c627d1d4331 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 17 Jan 2022 09:19:07 +0900 Subject: [PATCH 04/13] fix: generalized action dimension --- handyrl/agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/handyrl/agent.py b/handyrl/agent.py index 6165892b..003616e2 100755 --- a/handyrl/agent.py +++ b/handyrl/agent.py @@ -59,8 +59,8 @@ def action(self, env, player, show=False): legal_actions = env.legal_actions(player) outputs = self.plan(obs, legal_actions) - action = outputs['action'] - prob = outputs['selected_prob'] + action = outputs['action'][0] + prob = outputs['selected_prob'][0] v = outputs.get('value', None) if show: From cce36664a4b5ceab59db7c8bacccb949fe5b4e5a Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 17 Jan 2022 09:34:34 +0900 Subject: [PATCH 05/13] feature: remove action mask for RNN --- handyrl/envs/geister.py | 17 +++++++++-------- handyrl/train.py | 3 +-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py index 136a24ee..3e3015bd 100755 --- a/handyrl/envs/geister.py +++ b/handyrl/envs/geister.py @@ -149,7 +149,7 @@ def __init__(self): def init_hidden(self, batch_size=None): return self.body.init_hidden(self.input_size[1:], batch_size) - def forward(self, x, hidden, action=None, action_mask=None, legal_actions=None, temperature=1.0): + def forward(self, x, hidden, action=None, temperature=1.0): b, s = x['board'], x['scalar'] h_s = s.view(*s.size(), 1, 1).repeat(1, 1, 6, 6) h = torch.cat([h_s, b], -3) @@ -165,12 +165,7 @@ def forward(self, x, hidden, action=None, action_mask=None, legal_actions=None, h_v = self.head_v(h) h_r = self.head_r(h) - if action_mask is None: - assert legal_actions is not None - action_mask = torch.ones_like(h_p) * 1e32 - action_mask[:,legal_actions] = 0 - p = (h_p - action_mask) / temperature - + p = h_p / temperature log_prob = F.log_softmax(p, -1) prob = torch.exp(log_prob) entropy = dist.Categorical(logits=log_prob).entropy().unsqueeze(-1) @@ -179,7 +174,7 @@ def forward(self, x, hidden, action=None, action_mask=None, legal_actions=None, action = prob.multinomial(num_samples=1, replacement=True) selected_prob = prob.gather(-1, action) - return {'action': action, 'selected_prob': selected_prob, 'value': torch.tanh(h_v), 'return': h_r, 'hidden': hidden, 'entropy': entropy, 'action_mask': action_mask} + return {'action': action, 'selected_prob': selected_prob, 'value': torch.tanh(h_v), 'return': h_r, 'hidden': hidden, 'entropy': entropy} class Environment(BaseEnvironment): @@ -373,6 +368,10 @@ def _set(self, layout): def play(self, action, _=None): # state transition + if not self.legal(action): + self.win_color = self.opponent(self.color) + return + if self.turn_count < 0: layout = action - 4 * 6 * 6 return self._set(layout) @@ -463,6 +462,8 @@ def legal(self, action): if self.turn_count < 0: layout = action - 4 * 6 * 6 return 0 <= layout < 70 + elif not 0 <= action < 4 * 6 * 6: + return False pos_from = self.action2from(action, self.color) pos_to = self.action2to(action, self.color) diff --git a/handyrl/train.py b/handyrl/train.py index 74c2563f..5a28ca9c 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -140,7 +140,6 @@ def forward_prediction(model, hidden, batch, args): for t in range(batch['turn_mask'].size(1)): obs = 
map_r(observations, lambda o: o[:, t].reshape(-1, *o.size()[3:])) # (..., B * P, ...) action = action=batch['action'][:, t].view(-1, *batch['action'].size()[3:]) - action_mask = batch['action_mask'][:, t].view(-1, *batch['action_mask'].size()[3:]) omask_ = batch['observation_mask'][:, t] omask = map_r(hidden, lambda h: omask_.view(*h.size()[:2], *([1] * (len(h.size()) - 2)))) hidden_ = bimap_r(hidden, omask, lambda h, m: h * m) # (..., B, P, ...) @@ -148,7 +147,7 @@ def forward_prediction(model, hidden, batch, args): hidden_ = map_r(hidden_, lambda h: h.sum(1)) # (..., B * 1, ...) else: hidden_ = map_r(hidden_, lambda h: h.view(-1, *h.size()[2:])) # (..., B * P, ...) - outputs_ = model(obs, hidden_, action=action, action_mask=action_mask) + outputs_ = model(obs, hidden_, action=action) for k, o in outputs_.items(): if k == 'hidden': next_hidden = outputs_['hidden'] From a53da197a6fda3210b890f0837461699de0be3af Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 18 Jan 2022 01:17:35 +0900 Subject: [PATCH 06/13] feature: log_selected_prob based training --- handyrl/agent.py | 2 +- handyrl/envs/geister.py | 8 ++++---- handyrl/envs/tictactoe.py | 9 ++++----- handyrl/generation.py | 4 ++-- handyrl/train.py | 20 ++++++++++---------- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/handyrl/agent.py b/handyrl/agent.py index cc759cb0..93374693 100755 --- a/handyrl/agent.py +++ b/handyrl/agent.py @@ -59,7 +59,7 @@ def action(self, env, player, show=False): outputs = self.plan(obs) action = outputs['action'][0] - prob = outputs['selected_prob'][0] + prob = np.exp(outputs['log_selected_prob'][0]) v = outputs.get('value', None) if show: diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py index 3e3015bd..ba7edf69 100755 --- a/handyrl/envs/geister.py +++ b/handyrl/envs/geister.py @@ -165,16 +165,16 @@ def forward(self, x, hidden, action=None, temperature=1.0): h_v = self.head_v(h) h_r = self.head_r(h) - p = h_p / temperature - log_prob = F.log_softmax(p, -1) + log_prob = F.log_softmax(h_p / temperature, -1) prob = torch.exp(log_prob) entropy = dist.Categorical(logits=log_prob).entropy().unsqueeze(-1) if action is None: + prob = torch.exp(log_prob) action = prob.multinomial(num_samples=1, replacement=True) - selected_prob = prob.gather(-1, action) + log_selected_prob = log_prob.gather(-1, action) - return {'action': action, 'selected_prob': selected_prob, 'value': torch.tanh(h_v), 'return': h_r, 'hidden': hidden, 'entropy': entropy} + return {'action': action, 'log_selected_prob': log_selected_prob, 'value': torch.tanh(h_v), 'return': h_r, 'hidden': hidden, 'entropy': entropy} class Environment(BaseEnvironment): diff --git a/handyrl/envs/tictactoe.py b/handyrl/envs/tictactoe.py index a6125c5f..ceffc155 100755 --- a/handyrl/envs/tictactoe.py +++ b/handyrl/envs/tictactoe.py @@ -67,16 +67,15 @@ def forward(self, x, hidden=None, action=None, temperature=1.0): h_p = self.head_p(h) h_v = self.head_v(h) - p = h_p / temperature - log_prob = F.log_softmax(p, -1) - prob = torch.exp(log_prob) + log_prob = F.log_softmax(h_p / temperature, -1) entropy = dist.Categorical(logits=log_prob).entropy().unsqueeze(-1) if action is None: + prob = torch.exp(log_prob) action = prob.multinomial(num_samples=1, replacement=True) - selected_prob = prob.gather(-1, action) + log_selected_prob = log_prob.gather(-1, action) - return {'action': action, 'selected_prob': selected_prob, 'value': torch.tanh(h_v), 'entropy': entropy} + return {'action': action, 'log_selected_prob': log_selected_prob, 'value': 
torch.tanh(h_v), 'entropy': entropy} class Environment(BaseEnvironment): diff --git a/handyrl/generation.py b/handyrl/generation.py index d8974479..9144bc1f 100755 --- a/handyrl/generation.py +++ b/handyrl/generation.py @@ -29,7 +29,7 @@ def generate(self, models, args): return None while not self.env.terminal(): - moment_keys = ['observation', 'selected_prob', 'action', 'value', 'reward', 'return'] + moment_keys = ['observation', 'log_selected_prob', 'action', 'value', 'reward', 'return'] moment = {key: {p: None for p in self.env.players()} for key in moment_keys} turn_players = self.env.turns() @@ -47,7 +47,7 @@ def generate(self, models, args): if player in turn_players: moment['action'][player] = outputs['action'][0] - moment['selected_prob'][player] = outputs['selected_prob'][0] + moment['log_selected_prob'][player] = outputs['log_selected_prob'][0] err = self.env.step(moment['action']) if err: diff --git a/handyrl/train.py b/handyrl/train.py index 5a28ca9c..cbb19eaf 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -61,11 +61,11 @@ def replace_none(a, b): # data that is chainge by training configuration if args['turn_based_training'] and not args['observation']: obs = [[m['observation'][m['turn'][0]]] for m in moments] - prob = np.array([[[m['selected_prob'][m['turn'][0]]]] for m in moments]) + log_prob = np.array([[[m['log_selected_prob'][m['turn'][0]]]] for m in moments]) act = np.array([[[m['action'][m['turn'][0]]]] for m in moments], dtype=np.int64) else: obs = [[replace_none(m['observation'][player], obs_zeros) for player in players] for m in moments] - prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players] for m in moments]) + log_prob = np.array([[[replace_none(m['log_selected_prob'][player], 1.0)] for player in players] for m in moments]) act = np.array([[[replace_none(m['action'][player], 0)] for player in players] for m in moments], dtype=np.int64) # reshape observation @@ -79,7 +79,7 @@ def replace_none(a, b): oc = np.array([ep['outcome'][player] for player in players], dtype=np.float32).reshape(1, len(players), -1) emask = np.ones((len(moments), 1, 1), dtype=np.float32) # episode mask - tmask = np.array([[[m['selected_prob'][player] is not None] for player in players] for m in moments], dtype=np.float32) + tmask = np.array([[[m['log_selected_prob'][player] is not None] for player in players] for m in moments], dtype=np.float32) omask = np.array([[[m['value'][player] is not None] for player in players] for m in moments], dtype=np.float32) progress = np.arange(ep['start'], ep['end'], dtype=np.float32)[..., np.newaxis] / ep['total'] @@ -88,7 +88,7 @@ def replace_none(a, b): if len(tmask) < args['forward_steps']: pad_len = args['forward_steps'] - len(tmask) obs = map_r(obs, lambda o: np.pad(o, [(0, pad_len)] + [(0, 0)] * (len(o.shape) - 1), 'constant', constant_values=0)) - prob = np.pad(prob, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=1) + log_prob = np.pad(log_prob, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=1) v = np.concatenate([v, np.tile(oc, [pad_len, 1, 1])]) act = np.pad(act, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) rew = np.pad(rew, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) @@ -99,14 +99,14 @@ def replace_none(a, b): progress = np.pad(progress, [(0, pad_len), (0, 0)], 'constant', constant_values=1) obss.append(obs) - datum.append((prob, v, act, oc, rew, ret, emask, tmask, omask, progress)) + datum.append((log_prob, v, act, oc, rew, ret, emask, tmask, 
omask, progress)) obs = to_torch(bimap_r(obs_zeros, rotate(obss), lambda _, o: np.array(o))) - prob, v, act, oc, rew, ret, emask, tmask, omask, progress = [to_torch(np.array(val)) for val in zip(*datum)] + log_prob, v, act, oc, rew, ret, emask, tmask, omask, progress = [to_torch(np.array(val)) for val in zip(*datum)] return { 'observation': obs, - 'selected_prob': prob, 'value': v, + 'log_selected_prob': log_prob, 'value': v, 'action': act, 'outcome': oc, 'reward': rew, 'return': ret, 'episode_mask': emask, @@ -159,7 +159,7 @@ def forward_prediction(model, hidden, batch, args): for k, o in outputs.items(): o = o.view(*batch['turn_mask'].size()[:2], -1, o.size(-1)) - if k == 'selected_prob': + if k == 'log_selected_prob': # gather turn player's policies outputs[k] = o.mul(batch['turn_mask']).sum(2, keepdim=True) else: @@ -205,8 +205,8 @@ def compute_loss(batch, model, hidden, args): emasks = batch['episode_mask'] clip_rho_threshold, clip_c_threshold = 1.0, 1.0 - log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks - log_selected_t_policies = torch.log(torch.clamp(outputs['selected_prob'], 1e-16, 1)) * emasks + log_selected_b_policies = batch['log_selected_prob'] * emasks + log_selected_t_policies = outputs['log_selected_prob'] * emasks # thresholds of importance sampling log_rhos = log_selected_t_policies.detach() - log_selected_b_policies From f25b8ed83d85589a392425e8ad848504a304503c Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 18 Jan 2022 02:27:39 +0900 Subject: [PATCH 07/13] feature: action dimension --- handyrl/agent.py | 4 ++-- handyrl/generation.py | 4 ++-- handyrl/train.py | 10 ++++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/handyrl/agent.py b/handyrl/agent.py index cc759cb0..4100d5ec 100755 --- a/handyrl/agent.py +++ b/handyrl/agent.py @@ -58,8 +58,8 @@ def action(self, env, player, show=False): obs = env.observation(player) outputs = self.plan(obs) - action = outputs['action'][0] - prob = outputs['selected_prob'][0] + action = outputs['action'] + prob = outputs['selected_prob'] v = outputs.get('value', None) if show: diff --git a/handyrl/generation.py b/handyrl/generation.py index d8974479..48398ff6 100755 --- a/handyrl/generation.py +++ b/handyrl/generation.py @@ -46,8 +46,8 @@ def generate(self, models, args): moment['value'][player] = v if player in turn_players: - moment['action'][player] = outputs['action'][0] - moment['selected_prob'][player] = outputs['selected_prob'][0] + moment['action'][player] = outputs['action'] + moment['selected_prob'][player] = outputs['selected_prob'] err = self.env.step(moment['action']) if err: diff --git a/handyrl/train.py b/handyrl/train.py index 5a28ca9c..82290c80 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -57,16 +57,18 @@ def replace_none(a, b): players = [random.choice(players)] obs_zeros = map_r(moments[0]['observation'][moments[0]['turn'][0]], lambda o: np.zeros_like(o)) # template for padding + prob_ones = np.ones_like(moments[0]['selected_prob'][moments[0]['turn'][0]]) + act_zeros = np.zeros_like(moments[0]['action'][moments[0]['turn'][0]]) # data that is chainge by training configuration if args['turn_based_training'] and not args['observation']: obs = [[m['observation'][m['turn'][0]]] for m in moments] - prob = np.array([[[m['selected_prob'][m['turn'][0]]]] for m in moments]) - act = np.array([[[m['action'][m['turn'][0]]]] for m in moments], dtype=np.int64) + prob = np.array([[m['selected_prob'][m['turn'][0]]] for m in moments]) + act = 
np.array([[m['action'][m['turn'][0]]] for m in moments]) else: obs = [[replace_none(m['observation'][player], obs_zeros) for player in players] for m in moments] - prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players] for m in moments]) - act = np.array([[[replace_none(m['action'][player], 0)] for player in players] for m in moments], dtype=np.int64) + prob = np.array([[replace_none(m['selected_prob'][player], prob_ones) for player in players] for m in moments]) + act = np.array([[replace_none(m['action'][player], act_zeros) for player in players] for m in moments]) # reshape observation obs = rotate(rotate(obs)) # (T, P, ..., ...) -> (P, ..., T, ...) -> (..., T, P, ...) From ea55fe997f90b9e7bbecbdf3426c9fee743709d9 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 18 Jan 2022 03:11:36 +0900 Subject: [PATCH 08/13] fix: padded log_selected_prob --- handyrl/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index 158a44fe..d91a8f39 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -90,7 +90,7 @@ def replace_none(a, b): if len(tmask) < args['forward_steps']: pad_len = args['forward_steps'] - len(tmask) obs = map_r(obs, lambda o: np.pad(o, [(0, pad_len)] + [(0, 0)] * (len(o.shape) - 1), 'constant', constant_values=0)) - log_prob = np.pad(log_prob, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=1) + log_prob = np.pad(log_prob, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) v = np.concatenate([v, np.tile(oc, [pad_len, 1, 1])]) act = np.pad(act, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) rew = np.pad(rew, [(0, pad_len), (0, 0), (0, 0)], 'constant', constant_values=0) From f05d45f721eebdf9ffcd300bc8884fc2d712c2f8 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 18 Jan 2022 05:39:05 +0900 Subject: [PATCH 09/13] fix: prob in agent.py --- handyrl/agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/agent.py b/handyrl/agent.py index 4100d5ec..bd7d3d31 100755 --- a/handyrl/agent.py +++ b/handyrl/agent.py @@ -63,7 +63,7 @@ def action(self, env, player, show=False): v = outputs.get('value', None) if show: - print_outputs(env, action, selected_prob, v) + print_outputs(env, action, prob, v) return action From d448cb5c9c4c30f57d9916c93aa257f76ef0dae5 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Sat, 22 Jan 2022 08:30:54 +0900 Subject: [PATCH 10/13] fix: duplicate substitution --- handyrl/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index d898855b..dacb03df 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -145,7 +145,7 @@ def forward_prediction(model, hidden, batch, args): outputs = {} for t in range(batch['turn_mask'].size(1)): obs = map_r(observations, lambda o: o[:, t].reshape(-1, *o.size()[3:])) # (..., B * P, ...) 
- action = action=batch['action'][:, t].view(-1, *batch['action'].size()[3:]) + action = batch['action'][:, t].view(-1, *batch['action'].size()[3:]) action_mask = batch['action_mask'][:, t].view(-1, *batch['action_mask'].size()[3:]) omask_ = batch['observation_mask'][:, t] omask = map_r(hidden, lambda h: omask_.view(*h.size()[:2], *([1] * (len(h.size()) - 2)))) From 4d5b38ecc50d797949fa29f7b3145a935398061c Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 25 Jan 2022 21:48:07 +0900 Subject: [PATCH 11/13] fix: small codefix --- handyrl/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handyrl/train.py b/handyrl/train.py index 3758e981..d96107f9 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -136,7 +136,7 @@ def forward_prediction(model, hidden, batch, args): if hidden is None: # feed-forward neural network obs = map_r(observations, lambda o: o.view(-1, *o.size()[3:])) - action = action=batch['action'].view(-1, *batch['action'].size()[3:]) + action = batch['action'].view(-1, *batch['action'].size()[3:]) action_mask = batch['action_mask'].view(-1, *batch['action_mask'].size()[3:]) outputs = model(obs, None, action=action, action_mask=action_mask) else: From e92e4d54321b29a2970fc0311f9f2eade101bf05 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Mon, 31 Jan 2022 20:49:32 +0900 Subject: [PATCH 12/13] fix: there is no action mask --- handyrl/train.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/handyrl/train.py b/handyrl/train.py index 3d843eb3..faabe424 100755 --- a/handyrl/train.py +++ b/handyrl/train.py @@ -137,7 +137,7 @@ def forward_prediction(model, hidden, batch, args): # feed-forward neural network obs = map_r(observations, lambda o: o.flatten(0, 2)) # (..., B * T * P or 1, ...) action = batch['action'].flatten(0, 2) - outputs = model(obs, None, action=action, action_mask=action_mask) + outputs = model(obs, None, action=action) outputs = map_r(outputs, lambda o: o.unflatten(0, batch_shape)) # (..., B, T, P or 1, ...) else: # sequential computation with RNN @@ -174,8 +174,6 @@ def forward_prediction(model, hidden, batch, args): o = o.mul(batch['turn_mask']) if o.size(2) > 1 and batch_shape[2] == 1: # turn-alternating batch o = o.sum(2, keepdim=True) # gather turn player's policies - if k == 'selected_prob': - o = o - batch['action_mask'] outputs[k] = o else: # mask valid target values and cumulative rewards From 3bc5f866996c8661cc9ddff8cba90bb6e4cc4268 Mon Sep 17 00:00:00 2001 From: YuriCat Date: Tue, 8 Feb 2022 07:22:39 +0900 Subject: [PATCH 13/13] chore: remove unused imports from model.py --- handyrl/model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/handyrl/model.py b/handyrl/model.py index 002c1194..44ab9c60 100755 --- a/handyrl/model.py +++ b/handyrl/model.py @@ -6,9 +6,6 @@ import os os.environ['OMP_NUM_THREADS'] = '1' -import copy -import random - import numpy as np import torch torch.set_num_threads(1)
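
The sketches below restate the core mechanics of this series outside the diffs. They are minimal approximations, not the exact HandyRL code, and any name that does not appear in the patches above (sample_masked_policy, truncated_importance_weights, entropy_bonus) is illustrative only.

PATCH 01 moves action selection into the model's forward pass: the policy logits are masked down to the legal actions, scaled by a temperature, sampled from, and the model returns the chosen action together with its selection probability and the policy entropy. A condensed sketch of that step:

import torch
import torch.nn.functional as F
import torch.distributions as dist

def sample_masked_policy(logits, legal_actions, action=None, temperature=1.0):
    # logits: (batch, num_actions) raw policy head output (h_p in the patches)
    # legal_actions: indices of the currently legal actions
    # A large finite penalty (the patches use 1e32) pushes illegal actions to
    # effectively zero probability while keeping every intermediate value finite.
    mask = torch.full_like(logits, 1e32)
    mask[:, legal_actions] = 0
    log_prob = F.log_softmax((logits - mask) / temperature, dim=-1)
    prob = torch.exp(log_prob)
    entropy = dist.Categorical(logits=log_prob).entropy().unsqueeze(-1)

    if action is None:
        # generation: sample a fresh action from the masked distribution
        action = prob.multinomial(num_samples=1, replacement=True)
    # training: re-evaluate the stored action under the current policy
    selected_prob = prob.gather(-1, action)

    return {'action': action, 'selected_prob': selected_prob, 'entropy': entropy}

During generation, action is left as None and the sampled action with its probability is written into the episode; during training, the stored action is passed back in, so selected_prob becomes the current policy's probability of the behavior action. PATCH 02 and PATCH 05 then drop the mask entirely and let the environments punish illegal moves, and PATCH 06 stores log_selected_prob instead of the raw probability.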
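On the agent side, greedy evaluation is no longer a separate argmax branch: the same sampling path is reused with a tiny positive temperature (the Agent default changes from 0.0 to 1e-6, since the logits are now divided by the temperature and exactly zero would not be usable). A small check of that behaviour, with made-up logits in which the third action is masked out:

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, -1e32]])  # third action masked out
for t in [1.0, 0.1, 1e-6]:
    print(t, F.softmax(logits / t, dim=-1))
# at temperature 1e-6 virtually all probability mass sits on the best legal
# action, so sampling reproduces the old greedy argmax behaviour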
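With the selection probability of the behavior policy stored in each episode and the same quantity re-computed by the current model, the off-policy correction in compute_loss reduces to a difference of log-probabilities (PATCH 06 stores the log directly, and PATCH 08 pads it with 0 rather than 1, i.e. probability 1 for padded steps). Roughly, ignoring the episode and turn masks:

import torch

def truncated_importance_weights(log_pi_target, log_pi_behavior,
                                 clip_rho_threshold=1.0, clip_c_threshold=1.0):
    # log-probability of the selected action under the current (target) policy
    # and under the policy that generated the episode (behavior policy)
    log_rhos = log_pi_target.detach() - log_pi_behavior
    rhos = torch.exp(log_rhos)
    clipped_rhos = torch.clamp(rhos, 0, clip_rho_threshold)  # weights the value/return targets
    cs = torch.clamp(rhos, 0, clip_c_threshold)              # trace-cutting coefficients
    return clipped_rhos, cs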
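Finally, the entropy regularizer in compose_losses now consumes the entropy returned by the model instead of rebuilding a Categorical from raw policy logits, and its weight is annealed within each episode: with progress in [0, 1], the multiplier 1 - progress * (1 - entropy_regularization_decay) falls from 1 at the first step to the decay value at the last one. In isolation (the real code additionally applies the turn masks and aligns the tensor shapes):

def entropy_bonus(entropy, progress, coef, decay):
    # entropy: per-step policy entropy reported by the model (a tensor)
    # progress: position of each step within its episode, scaled to [0, 1]
    weight = 1 - progress * (1 - decay)      # 1.0 at the episode start, `decay` at the end
    return -coef * (entropy * weight).sum()  # negative term: higher entropy lowers the loss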