Merge branch 'develop' into feature/remove_entry_server_but_leave_entry
YuriCat committed Nov 21, 2023
2 parents 95d3551 + 4565bec commit 5240658
Showing 10 changed files with 87 additions and 67 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/action.yaml
@@ -12,7 +12,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: ['3.8', '3.9', '3.10']
    steps:
    - name: Checkout
      uses: actions/checkout@v2
@@ -24,7 +24,6 @@ jobs:
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
-        pip install -r handyrl/envs/kaggle/requirements.txt
    - name: pytest
      run: |
        python -m pytest tests
2 changes: 1 addition & 1 deletion handyrl/agent.py
@@ -24,7 +24,7 @@ def observe(self, env, player, show=False):

class RuleBasedAgent(RandomAgent):
    def __init__(self, key=None):
-        self.key = None
+        self.key = key

    def action(self, env, player, show=False):
        if hasattr(env, 'rule_based_action'):
3 changes: 0 additions & 3 deletions handyrl/connection.py
@@ -29,9 +29,6 @@ def close(self):
        self.conn.close()
        self.conn = None

-    def fileno(self):
-        return self.conn.fileno()
-
    def _recv(self, size):
        buf = io.BytesIO()
        while size > 0:
22 changes: 12 additions & 10 deletions handyrl/envs/geister.py
@@ -131,18 +131,19 @@ class GeisterNet(nn.Module):
    def __init__(self):
        super().__init__()

-        layers, filters, p_filters = 3, 32, 8
+        layers, filters = 3, 32
+        p_filters, v_filters = 8, 2
        input_channels = 7 + 18 # board channels + scalar inputs
        self.input_size = (input_channels, 6, 6)

        self.conv1 = nn.Conv2d(input_channels, filters, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(filters)
        self.body = DRC(layers, filters, filters)

-        self.head_p_move = Conv2dHead((filters * 2, 6, 6), p_filters, 4)
+        self.head_p_move = Conv2dHead((filters, 6, 6), p_filters, 4)
        self.head_p_set = nn.Linear(1, 70, bias=True)
-        self.head_v = ScalarHead((filters * 2, 6, 6), 1, 1)
-        self.head_r = ScalarHead((filters * 2, 6, 6), 1, 1)
+        self.head_v = ScalarHead((filters, 6, 6), v_filters, 1)
+        self.head_r = ScalarHead((filters, 6, 6), v_filters, 1)

    def init_hidden(self, batch_size=[]):
        return self.body.init_hidden(self.input_size[1:], batch_size)
@@ -154,7 +155,6 @@ def forward(self, x, hidden):

        h_e = F.relu(self.bn1(self.conv1(h)))
        h, hidden = self.body(h_e, hidden, num_repeats=3)
-        h = torch.cat([h_e, h], -3)

        h_p_move = self.head_p_move(h)
        turn_color = s[:, :1]
@@ -189,10 +189,11 @@ class Environment(BaseEnvironment):

    def __init__(self, args=None):
        super().__init__()
+        self.args = args if args is not None else {}
        self.reset()

-    def reset(self, args={}):
-        self.args = args
+    def reset(self, args=None):
+        self.game_args = args if args is not None else {}
        self.board = -np.ones((6, 6), dtype=np.int32) # (x, y) -1 is empty
        self.color = self.BLACK
        self.turn_count = -2 # before setting original positions
@@ -343,8 +344,9 @@ def _piece(p):
        s = ' ' + ' '.join(self.Y) + '\n'
        for i in range(6):
            s += self.X[i] + ' ' + ' '.join([self.P[_piece(self.board[i, j])] for j in range(6)]) + '\n'
-        s += 'color = ' + self.C[self.color] + '\n'
-        s += 'record = ' + self.record_string()
+        s += 'remained = B:%d R:%d b:%d r:%d' % tuple(self.piece_cnt) + '\n'
+        s += 'turn = ' + str(self.turn_count).ljust(3) + ' color = ' + self.C[self.color]
+        # s += 'record = ' + self.record_string()
        return s

    def _set(self, layout):
@@ -409,7 +411,7 @@ def diff_info(self, player):

    def update(self, info, reset):
        if reset:
-            self.args = {**self.args, **info}
+            self.game_args = {**self.game_args, **info}
            self.reset(info)
        elif 'set' in info:
            self._set(info['set'])
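Note on the GeisterNet change above: with the `torch.cat([h_e, h], -3)` skip connection removed, the heads now see `filters` (32) channels instead of `filters * 2` (64), which is why their input sizes shrink in this diff. A minimal standalone shape sketch (the `Sequential` heads below are illustrative stand-ins, not the repository's `Conv2dHead`/`ScalarHead` classes):

```python
import torch
import torch.nn as nn

filters, p_filters, v_filters = 32, 8, 2

# Stand-in policy head: per-square logits for 4 move directions.
head_p_move = nn.Sequential(
    nn.Conv2d(filters, p_filters, kernel_size=1),
    nn.ReLU(),
    nn.Conv2d(p_filters, 4, kernel_size=1),
)

# Stand-in value head: a couple of 1x1 filters, then a scalar.
head_v = nn.Sequential(
    nn.Conv2d(filters, v_filters, kernel_size=1),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(v_filters * 6 * 6, 1),
)

h = torch.randn(1, filters, 6, 6)  # body output; no concatenation with h_e anymore
print(head_p_move(h).shape)        # torch.Size([1, 4, 6, 6])
print(head_v(h).shape)             # torch.Size([1, 1])
```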
39 changes: 23 additions & 16 deletions handyrl/evaluation.py
@@ -58,7 +58,8 @@ def run(self):
                reset = args[1]
                if reset:
                    self.agent.reset(self.env, show=True)
-                view_transition(self.env)
+                else:
+                    view_transition(self.env)
            self.conn.send(ret)


@@ -105,7 +106,7 @@ def exec_match(env, agents, critic=None, show=False, game_args={}):
    outcome = env.outcome()
    if show:
        print('final outcome = %s' % outcome)
-    return outcome
+    return {'result': outcome}


def exec_network_match(env, network_agents, critic=None, show=False, game_args={}):
@@ -137,7 +138,7 @@ def exec_network_match(env, network_agents, critic=None, show=False, game_args={
    outcome = env.outcome()
    for p, agent in network_agents.items():
        agent.outcome(outcome[p])
-    return outcome
+    return {'result': outcome}


def build_agent(raw, env=None):
@@ -169,11 +170,11 @@ def execute(self, models, args):
            else:
                agents[p] = Agent(model)

-        outcome = exec_match(self.env, agents)
-        if outcome is None:
+        results = exec_match(self.env, agents)
+        if results is None:
            print('None episode in evaluation!')
            return None
-        return {'args': args, 'result': outcome, 'opponent': opponent}
+        return {'args': args, 'opponent': opponent, **results}


def wp_func(results):
@@ -195,10 +196,10 @@ def eval_process_mp_child(agents, critic, env_args, index, in_queue, out_queue,
        print('*** Game %d ***' % g)
        agent_map = {env.players()[p]: agents[ai] for p, ai in enumerate(agent_ids)}
        if isinstance(list(agent_map.values())[0], NetworkAgent):
-            outcome = exec_network_match(env, agent_map, critic, show=show, game_args=game_args)
+            results = exec_network_match(env, agent_map, critic, show=show, game_args=game_args)
        else:
-            outcome = exec_match(env, agent_map, critic, show=show, game_args=game_args)
-        out_queue.put((pat_idx, agent_ids, outcome))
+            results = exec_match(env, agent_map, critic, show=show, game_args=game_args)
+        out_queue.put((pat_idx, agent_ids, results))
    out_queue.put(None)


@@ -245,7 +246,8 @@ def evaluate_mp(env, agents, critic, env_args, args_patterns, num_process, num_g
        if ret is None:
            finished_cnt += 1
            continue
-        pat_idx, agent_ids, outcome = ret
+        pat_idx, agent_ids, results = ret
+        outcome = results.get('result')
        if outcome is not None:
            for idx, p in enumerate(env.players()):
                agent_id = agent_ids[idx]
@@ -378,22 +380,27 @@ def eval_main(args, argv):
    prepare_env(env_args)
    env = make_env(env_args)

-    model_path = argv[0] if len(argv) >= 1 else 'models/latest.pth'
+    model_paths = argv[0].split(':') if len(argv) >= 1 else ['models/latest.pth']
    num_games = int(argv[1]) if len(argv) >= 2 else 100
    num_process = int(argv[2]) if len(argv) >= 3 else 1

-    agent1 = build_agent(model_path, env)
-    if agent1 is None:
-        model = load_model(model_path, env.net())
-        agent1 = Agent(model)
+    def resolve_agent(model_path):
+        agent = build_agent(model_path, env)
+        if agent is None:
+            model = load_model(model_path, env.net())
+            agent = Agent(model)
+        return agent
+
+    main_agent = resolve_agent(model_paths[0])
    critic = None

    print('%d process, %d games' % (num_process, num_games))

    seed = random.randrange(1e8)
    print('seed = %d' % seed)

-    agents = [agent1] + [RandomAgent() for _ in range(len(env.players()) - 1)]
+    opponent = model_paths[1] if len(model_paths) > 1 else 'random'
+    agents = [main_agent] + [resolve_agent(opponent) for _ in range(len(env.players()) - 1)]

    evaluate_mp(env, agents, critic, env_args, {'default': {}}, num_process, num_games, seed)
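Note on the evaluation.py changes above: `exec_match` and `exec_network_match` now return a dict (`{'result': outcome}`) instead of a bare outcome, so callers read the outcome via `results.get('result')`, and `eval_main` accepts a colon-separated list of model paths whose second entry (when present) replaces the random opponent. A small standalone sketch of the new argument handling, mirroring the diff (the example paths are hypothetical):

```python
def parse_eval_argv(argv):
    # Mirrors the diff: 'a.pth:b.pth' -> main model a.pth, opponent b.pth.
    model_paths = argv[0].split(':') if len(argv) >= 1 else ['models/latest.pth']
    num_games = int(argv[1]) if len(argv) >= 2 else 100
    num_process = int(argv[2]) if len(argv) >= 3 else 1
    opponent = model_paths[1] if len(model_paths) > 1 else 'random'
    return model_paths[0], opponent, num_games, num_process


print(parse_eval_argv(['models/100.pth:models/50.pth', '200', '4']))
# -> ('models/100.pth', 'models/50.pth', 200, 4)
```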
27 changes: 16 additions & 11 deletions handyrl/losses.py
@@ -17,38 +17,40 @@ def monte_carlo(values, returns):
    return returns, returns - values


-def temporal_difference(values, returns, rewards, lmb, gamma):
+def temporal_difference(values, returns, rewards, lambda_, gamma):
    target_values = deque([returns[:, -1]])
    for i in range(values.size(1) - 2, -1, -1):
        reward = rewards[:, i] if rewards is not None else 0
-        target_values.appendleft(reward + gamma * ((1 - lmb) * values[:, i + 1] + lmb * target_values[0]))
+        lamb = lambda_[:, i + 1]
+        target_values.appendleft(reward + gamma * ((1 - lamb) * values[:, i + 1] + lamb * target_values[0]))

    target_values = torch.stack(tuple(target_values), dim=1)

    return target_values, target_values - values


-def upgo(values, returns, rewards, lmb, gamma):
+def upgo(values, returns, rewards, lambda_, gamma):
    target_values = deque([returns[:, -1]])
    for i in range(values.size(1) - 2, -1, -1):
        value = values[:, i + 1]
        reward = rewards[:, i] if rewards is not None else 0
-        target_values.appendleft(reward + gamma * torch.max(value, (1 - lmb) * value + lmb * target_values[0]))
+        lamb = lambda_[:, i + 1]
+        target_values.appendleft(reward + gamma * torch.max(value, (1 - lamb) * value + lamb * target_values[0]))

    target_values = torch.stack(tuple(target_values), dim=1)

    return target_values, target_values - values


-def vtrace(values, returns, rewards, lmb, gamma, rhos, cs):
+def vtrace(values, returns, rewards, lambda_, gamma, rhos, cs):
    rewards = rewards if rewards is not None else 0
    values_t_plus_1 = torch.cat([values[:, 1:], returns[:, -1:]], dim=1)
    deltas = rhos * (rewards + gamma * values_t_plus_1 - values)

    # compute Vtrace value target recursively
    vs_minus_v_xs = deque([deltas[:, -1]])
    for i in range(values.size(1) - 2, -1, -1):
-        vs_minus_v_xs.appendleft(deltas[:, i] + gamma * lmb * cs[:, i] * vs_minus_v_xs[0])
+        vs_minus_v_xs.appendleft(deltas[:, i] + gamma * lambda_[:, i + 1] * cs[:, i] * vs_minus_v_xs[0])

    vs_minus_v_xs = torch.stack(tuple(vs_minus_v_xs), dim=1)
    vs = vs_minus_v_xs + values
@@ -58,18 +60,21 @@ def vtrace(values, returns, rewards, lmb, gamma, rhos, cs):
    return vs, advantages


-def compute_target(algorithm, values, returns, rewards, lmb, gamma, rhos, cs):
+def compute_target(algorithm, values, returns, rewards, lmb, gamma, rhos, cs, masks):
    if values is None:
        # In the absence of a baseline, Monte Carlo returns are used.
        return returns, returns

    if algorithm == 'MC':
        return monte_carlo(values, returns)
-    elif algorithm == 'TD':
-        return temporal_difference(values, returns, rewards, lmb, gamma)
+
+    lambda_ = lmb + (1 - lmb) * (1 - masks)
+
+    if algorithm == 'TD':
+        return temporal_difference(values, returns, rewards, lambda_, gamma)
    elif algorithm == 'UPGO':
-        return upgo(values, returns, rewards, lmb, gamma)
+        return upgo(values, returns, rewards, lambda_, gamma)
    elif algorithm == 'VTRACE':
-        return vtrace(values, returns, rewards, lmb, gamma, rhos, cs)
+        return vtrace(values, returns, rewards, lambda_, gamma, rhos, cs)
    else:
        print('No algorithm named %s' % algorithm)
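Note on the losses.py change above: `compute_target` now takes a `masks` tensor and builds a per-step `lambda_ = lmb + (1 - lmb) * (1 - masks)`, so lambda is forced to 1 wherever a step is masked out and the TD-style targets bootstrap straight through those steps. A standalone sketch of that behaviour, re-implementing `temporal_difference` as it appears in the diff and feeding it a made-up mask:

```python
import torch
from collections import deque


def temporal_difference(values, returns, rewards, lambda_, gamma):
    # Same structure as the diff: lambda_ is a per-step tensor, not a scalar.
    target_values = deque([returns[:, -1]])
    for i in range(values.size(1) - 2, -1, -1):
        reward = rewards[:, i] if rewards is not None else 0
        lamb = lambda_[:, i + 1]
        target_values.appendleft(reward + gamma * ((1 - lamb) * values[:, i + 1] + lamb * target_values[0]))
    target_values = torch.stack(tuple(target_values), dim=1)
    return target_values, target_values - values


lmb, gamma = 0.7, 1.0
values  = torch.tensor([[0.1, 0.2, 0.3, 0.4]])   # (batch, time)
returns = torch.tensor([[1.0, 1.0, 1.0, 1.0]])
masks   = torch.tensor([[1.0, 0.0, 1.0, 1.0]])   # step 1 is unobserved
lambda_ = lmb + (1 - lmb) * (1 - masks)          # -> 1.0 where masks == 0
targets, advantages = temporal_difference(values, returns, None, lambda_, gamma)
print(targets)  # the target at step 0 bootstraps fully through the masked step
```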
2 changes: 1 addition & 1 deletion handyrl/model.py
@@ -70,5 +70,5 @@ def __init__(self, model, x):
        outputs = wrapped_model.inference(x, hidden)
        self.output_dict = {key: np.zeros_like(value) for key, value in outputs.items() if key != 'hidden'}

-    def inference(self, *args):
+    def inference(self, *args, **kwargs):
        return self.output_dict
49 changes: 29 additions & 20 deletions handyrl/train.py
@@ -63,24 +63,23 @@ def replace_none(a, b):

    # data that is changed by training configuration
    if args['turn_based_training'] and not args['observation']:
-        obs = [[m['observation'][m['turn'][0]]] for m in moments]
-        prob = np.array([[[m['selected_prob'][m['turn'][0]]]] for m in moments])
-        act = np.array([[m['action'][m['turn'][0]]] for m in moments], dtype=np.int64)[..., np.newaxis]
-        amask = np.array([[m['action_mask'][m['turn'][0]]] for m in moments])
+        players_list = [[m['turn'][0]] for m in moments]
    else:
-        obs = [[replace_none(m['observation'][player], obs_zeros) for player in players] for m in moments]
-        prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players] for m in moments])
-        act = np.array([[replace_none(m['action'][player], 0) for player in players] for m in moments], dtype=np.int64)[..., np.newaxis]
-        amask = np.array([[replace_none(m['action_mask'][player], amask_zeros + 1e32) for player in players] for m in moments])
+        players_list = [players for m in moments]
+
+    obs = [[replace_none(m['observation'][player], obs_zeros) for player in players_] for m, players_ in zip(moments, players_list)]
+    prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players_] for m, players_ in zip(moments, players_list)])
+    act = np.array([[replace_none(m['action'][player], 0) for player in players_] for m, players_ in zip(moments, players_list)], dtype=np.int64)[..., np.newaxis]
+    amask = np.array([[replace_none(m['action_mask'][player], amask_zeros + 1e32) for player in players_] for m, players_ in zip(moments, players_list)])

    # reshape observation
    obs = rotate(rotate(obs)) # (T, P, ..., ...) -> (P, ..., T, ...) -> (..., T, P, ...)
    obs = bimap_r(obs_zeros, obs, lambda _, o: np.array(o))

    # datum that is not changed by training configuration
-    v = np.array([[replace_none(m['value'][player], [0]) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
-    rew = np.array([[replace_none(m['reward'][player], [0]) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
-    ret = np.array([[replace_none(m['return'][player], [0]) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
+    v = np.array([[replace_none(m['value'][player], 0) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
+    rew = np.array([[replace_none(m['reward'][player], 0) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
+    ret = np.array([[replace_none(m['return'][player], 0) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
    oc = np.array([ep['outcome'][player] for player in players], dtype=np.float32).reshape(1, len(players), -1)

    emask = np.ones((len(moments), 1, 1), dtype=np.float32) # episode mask
@@ -224,6 +223,9 @@ def compute_loss(batch, model, hidden, args):

    actions = batch['action']
    emasks = batch['episode_mask']
+    omasks = batch['observation_mask']
+    value_target_masks, return_target_masks = omasks, omasks
+
    clip_rho_threshold, clip_c_threshold = 1.0, 1.0

    log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks
@@ -239,16 +241,18 @@ def compute_loss(batch, model, hidden, args):
    if 'value' in outputs_nograd:
        values_nograd = outputs_nograd['value']
        if args['turn_based_training'] and values_nograd.size(2) == 2: # two player zerosum game
-            values_nograd_opponent = -torch.stack([values_nograd[:, :, 1], values_nograd[:, :, 0]], dim=2)
-            values_nograd = (values_nograd + values_nograd_opponent) / (batch['observation_mask'].sum(dim=2, keepdim=True) + 1e-8)
+            values_nograd_opponent = -torch.flip(values_nograd, dims=[2])
+            omasks_opponent = torch.flip(omasks, dims=[2])
+            values_nograd = (values_nograd * omasks + values_nograd_opponent * omasks_opponent) / (omasks + omasks_opponent + 1e-8)
+            value_target_masks = torch.clamp(omasks + omasks_opponent, 0, 1)
        outputs_nograd['value'] = values_nograd * emasks + batch['outcome'] * (1 - emasks)

    # compute targets and advantage
    targets = {}
    advantages = {}

-    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs
-    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs
+    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs, value_target_masks
+    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs, return_target_masks

    targets['value'], advantages['value'] = compute_target(args['value_target'], *value_args)
    targets['return'], advantages['return'] = compute_target(args['value_target'], *return_args)
@@ -286,11 +290,16 @@ def run(self):

    def select_episode(self):
        while True:
-            ep_idx = random.randrange(min(len(self.episodes), self.args['maximum_episodes']))
-            accept_rate = 1 - (len(self.episodes) - 1 - ep_idx) / self.args['maximum_episodes']
-            if random.random() < accept_rate:
+            ep_count = min(len(self.episodes), self.args['maximum_episodes'])
+            ep_idx = random.randrange(ep_count)
+            accept_rate = 1 - (ep_count - 1 - ep_idx) / ep_count
+            if random.random() >= accept_rate:
+                continue
+            try:
+                ep = self.episodes[ep_idx]
                break
-        ep = self.episodes[ep_idx]
+            except IndexError:
+                continue
        turn_candidates = 1 + max(0, ep['steps'] - self.args['forward_steps']) # change start turn by sequence length
        train_st = random.randrange(turn_candidates)
        st = max(0, train_st - self.args['burn_in_steps'])
@@ -427,7 +436,7 @@ def __init__(self, args, net=None, remote=False):
        self.worker = WorkerServer(args) if remote else WorkerCluster(args)

        # thread connection
-        self.trainer = Trainer(args, self.model)
+        self.trainer = Trainer(args, copy.deepcopy(self.model))

    def model_path(self, model_id):
        return os.path.join('models', str(model_id) + '.pth')
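Note on the `select_episode` change above: the loop is a rejection sampler that favours recent episodes. An index is accepted with probability `1 - (ep_count - 1 - ep_idx) / ep_count`, which grows linearly from about `1 / ep_count` for the oldest stored episode to `1` for the newest, and the `try/except IndexError` guards against the buffer shrinking between the draw and the lookup. A standalone sketch of just the acceptance rule (the buffer size and sample count are made up):

```python
import random
from collections import Counter


def sample_index(ep_count):
    # Mirrors the diff: resample until the linearly increasing acceptance test passes.
    while True:
        ep_idx = random.randrange(ep_count)
        accept_rate = 1 - (ep_count - 1 - ep_idx) / ep_count
        if random.random() < accept_rate:
            return ep_idx


counts = Counter(sample_index(5) for _ in range(100_000))
print([counts[i] for i in range(5)])  # roughly proportional to 1 : 2 : 3 : 4 : 5
```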
5 changes: 3 additions & 2 deletions scripts/win_rate_plot.py
@@ -87,8 +87,9 @@ def get_wp_list(path):
for opponent in opponents:
    wp_list = averaged_wp_lists[opponent]
    start = start_epoch[opponent]
-    # ax.plot(clipped_epoch_list[start:], wp_list[start:], label=opponent)
-    ax.plot(clipped_game_list[start:], wp_list[start:], label=opponent)
+    end = min(min(len(clipped_epoch_list), len(clipped_game_list)), len(wp_list))
+    # ax.plot(clipped_epoch_list[start:end], wp_list[start:end], label=opponent)
+    ax.plot(clipped_game_list[start:end], wp_list[start:end], label=opponent)
    last_win_rate[opponent] = wp_list[-1]

ax.set_xlabel('Games', size=14)