# gridworld.py — stochastic gridworld environment (originally listed as 171 lines / 3.9 KB).
# NOTE: web-scrape chrome and gutter line numbers removed from the top of this file.
import numpy as np
import pandas as pd
# Notebook convenience: render every expression result in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, HTML
# Show floats with two decimal places in displayed DataFrames.
pd.set_option("display.precision", 2)
# Action index -> direction name.
action_map = {0: "up", 1:"down", 2:"right", 3:"left"}
# Arrow glyphs (up, down, right, left) used when rendering a policy grid.
symbol_map = [u"\u2b06", u"\u2b07", u"\u27a1", u"\u2b05"]
class gridworld:
    """Stochastic M x N gridworld environment.

    The agent starts at ``start`` and tries to reach ``goal`` while avoiding
    the cells in ``error_states``.  Each action moves in the intended
    direction with probability ``p`` and in each of the three other
    directions with probability ``(1 - p) / 3``.  Moves off the edge of the
    grid leave the state unchanged.
    """

    # Action index -> direction name (mirrors the module-level action_map so
    # the class does not depend on module globals).
    _ACTIONS = {0: "up", 1: "down", 2: "right", 3: "left"}
    # Action index -> arrow glyph for policy display (mirrors symbol_map).
    _SYMBOLS = ["\u2b06", "\u2b07", "\u27a1", "\u2b05"]
    # Intended move -> the three unintended moves, in the order their
    # probability intervals are stacked in step().  Hoisted to a class
    # constant: the original rebuilt this dict on every step() call.
    _ALTERNATES = {
        "up":    ["down", "right", "left"],
        "down":  ["up", "right", "left"],
        "left":  ["down", "right", "up"],
        "right": ["down", "up", "left"],
    }
    # Move name -> (row delta, column delta).
    _DELTAS = {"up": (-1, 0), "down": (1, 0), "left": (0, -1), "right": (0, 1)}

    def __init__(self, dims, start, goal, error_states, p, gen_reward, goal_reward, error_reward):
        """Build the grid.

        dims          -- (M, N): number of rows and columns
        start         -- (row, col) start state
        goal          -- (row, col) goal state (terminal)
        error_states  -- iterable of (row, col) error states (terminal)
        p             -- probability of moving in the commanded direction
        gen_reward    -- reward for entering an ordinary state
        goal_reward   -- reward for entering the goal state
        error_reward  -- reward for entering an error state
        """
        # M - rows, N - columns
        self.M = dims[0]
        self.N = dims[1]
        self.start = start
        # Current state is initialised to the start state.
        self.current_state = start
        self.goal = goal
        self.error_states = error_states
        # p  - probability of moving in the direction of the action.
        self.p = p
        # np - probability of each of the three other directions.
        # NOTE(review): attribute name kept for backward compatibility even
        # though it shadows the conventional numpy alias.
        self.np = (1.0 - p) / 3
        self.gen_reward = gen_reward
        self.error_reward = error_reward
        self.goal_reward = goal_reward
        # Visual representation of the gridworld: "-" everywhere, then mark
        # start (S), goal (G) and error (E) cells.
        self.visual_gw = np.array([["-" for i in range(self.N)] for j in range(self.M)])
        self.visual_gw[self.start] = "S"
        self.visual_gw[self.goal] = "G"
        for es in self.error_states:
            self.visual_gw[es] = "E"

    def reset(self):
        """Reset the current state to the start state and return it."""
        self.current_state = self.start
        return self.current_state

    def step(self, action_num):
        """Take one step.

        action_num -- action index (0 up, 1 down, 2 right, 3 left)

        Returns (new_state, reward, done) where done is True when the new
        state is the goal or an error state.
        """
        action = self._ACTIONS[action_num]
        alternates = self._ALTERNATES[action]
        # Sample the actual move: the intended direction occupies [0, p),
        # each alternate occupies a slice of width (1 - p) / 3 after it.
        rand_num = np.random.rand(1)[0]
        if rand_num < self.p:
            move = action
        elif rand_num < self.p + self.np:
            move = alternates[0]
        elif rand_num < self.p + 2 * self.np:
            move = alternates[1]
        else:
            move = alternates[2]
        # Apply the move and clamp to the grid: stepping off an edge leaves
        # the state unchanged (replaces four repetitive boundary branches).
        dr, dc = self._DELTAS[move]
        row = min(max(self.current_state[0] + dr, 0), self.M - 1)
        col = min(max(self.current_state[1] + dc, 0), self.N - 1)
        new_state = (row, col)
        self.current_state = new_state
        if new_state == self.goal:
            return new_state, self.goal_reward, True
        if new_state in self.error_states:
            return new_state, self.error_reward, True
        return new_state, self.gen_reward, False

    def print_gw(self):
        """Display the static grid layout (S/G/E markers) as a DataFrame."""
        display(pd.DataFrame(self.visual_gw))

    def print_v(self, v):
        """Display a value function, overwriting goal/error cells with G/E.

        v -- numpy array of shape (M, N) of state values.
        """
        vis_v = v.tolist()
        g = self.goal
        vis_v[g[0]][g[1]] = "G"
        for e in self.error_states:
            vis_v[e[0]][e[1]] = "E"
        display(pd.DataFrame(vis_v))

    def print_policy(self, pi):
        """Display a policy as arrows, overwriting goal/error cells with G/E.

        pi -- numpy integer array of shape (M, N) of action indices.
        """
        vis_pi = pi.tolist()
        for i in range(pi.shape[0]):
            for j in range(pi.shape[1]):
                vis_pi[i][j] = self._SYMBOLS[pi[i, j]]
        g = self.goal
        vis_pi[g[0]][g[1]] = "G"
        for e in self.error_states:
            vis_pi[e[0]][e[1]] = "E"
        display(pd.DataFrame(vis_pi))
# gw = gridworld((2, 2), (0, 0), (1, 1), [(0, 1)], 0.5, 0, 100, -100)
# gw.reset()
# gw.print_gw()
# gw.step(1)
# gw.print_gw()
# gw.step(1)
# gw.print_gw()