qmc.py
import gym
import pandas as pd
import numpy as np
import random

MAX_STEPS = 50000             # number of training episodes
MAX_EPSILON_RED_STEPS = 5000  # steps over which epsilon decays from init to final
class Agent(object):
    def __init__(self, eta, gamma, env, num_states, num_actions):
        self.eta = eta      # learning rate
        self.gamma = gamma  # discount factor
        self.actions = list(range(num_actions))
        self.env = env
        # self.q_table = np.random.uniform(low=-1, high=1, size=(num_states, num_actions))
        self.q_table = np.zeros(shape=(num_states, num_actions))
        self.epsilon = 0.5  # current exploration rate
        self.epsilon_init = 0.5
        self.epsilon_final = 0.3
    def execute_action(self, action):
        return self.env.step(action)  # observation, reward, done, info

    def maxQ(self, s):
        return max(self.q_table[s])

    def argmaxQ(self, s):
        return np.argmax(self.q_table[s])
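    # Tabular Q-learning update:
    #   Q(s, a) <- Q(s, a) + eta * (r + gamma * max_a' Q(s', a') - Q(s, a))
    # The TD target bootstraps from the greedy value of the successor state.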
    def updateQ(self, s, a, r, new_s):
        self.q_table[s][a] += self.eta * (r + self.gamma * self.maxQ(new_s) - self.q_table[s][a])

    def update_epsilon(self):
        # linear annealing from epsilon_init down to epsilon_final
        if self.epsilon > self.epsilon_final:
            self.epsilon -= (self.epsilon_init - self.epsilon_final) / MAX_EPSILON_RED_STEPS
    def policy(self, s):
        # epsilon-greedy: explore with probability self.epsilon, otherwise act greedily
        if random.uniform(0, 1) <= self.epsilon:
            action = random.choice(self.actions)
        else:
            action = self.argmaxQ(s)
        return action
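    # Discretize the continuous (position, velocity) observation into a single
    # integer state id by concatenating the two bin indices as decimal digits,
    # e.g. position bin 3 and velocity bin 7 -> state 37.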
    def build_state(self, o):
        position, velocity = o
        features = [self.to_bin(position, position_bins),
                    self.to_bin(velocity, velocity_bins)]
        return int("".join(map(lambda feature: str(int(feature)), features)))

    def to_bin(self, value, bins):
        return np.digitize(x=[value], bins=bins)[0]
env = gym.make('MountainCar-v0')
experiment_filename = './mountaincar-experiment-1'
# gym.wrappers.Monitor replaces the long-deprecated env.monitor interface
env = gym.wrappers.Monitor(env, experiment_filename, force=True)

n_features = env.observation_space.shape[0]
bins = 10
n_states = bins ** n_features  # 10 bins per feature -> 100 discrete states
n_actions = env.action_space.n

# Interior bin edges spanning the observation bounds of MountainCar-v0:
# position in [-1.2, 0.6], velocity in [-0.07, 0.07].
position_bins = pd.cut([-1.2, 0.6], bins=bins, retbins=True)[1][1:-1]
velocity_bins = pd.cut([-0.07, 0.07], bins=bins, retbins=True)[1][1:-1]
q_agent = Agent(0.05, 0.99, env, n_states, n_actions)  # eta=0.05, gamma=0.99
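# Training loop: one Q-update per environment step; epsilon is annealed after
# every step so exploration decreases as training progresses.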
step = 0
for e in range(MAX_STEPS):
    o = env.reset()
    s = q_agent.build_state(o)
    accum_r = 0
    for t in range(200):  # MountainCar-v0 episodes are capped at 200 steps
        a = q_agent.policy(s)
        o, r, done, info = q_agent.execute_action(a)
        # env.render()
        new_s = q_agent.build_state(o)
        q_agent.updateQ(s, a, r, new_s)
        q_agent.update_epsilon()
        s = new_s
        accum_r += r
        step += 1
        if done:
            print("Episode=" + str(e) + ", Total episodic reward=" + str(accum_r))
            break

env.close()
print(q_agent.q_table)