td_pong_learn.py
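# TD(0) actor-critic learner for the Pong training environment.
#
# Usage: python td_pong_learn.py <iterations>
#
# Loads (or randomly initialises) a per-state value function and a
# per-state/per-action policy, trains them for the given number of
# iterations against pong_environment_training, and writes the results
# back to pong_policy.dat and pong_values.dat as JSON.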
import random
import numpy
import sys
import time
import os
import json
#import pong_environment as env
import pong_environment_training as env
policy_filename = "pong_policy.dat"
values_filename = "pong_values.dat"
alpha = 0.1 # values / critic learning parameter
beta = 0.01 # actor learning parameter
gamma = 0.5 # discount factor: weight of the next state's value in the TD error
world_dim = env.getWorldDim()
num_possible_moves = env.getActionDim()
state = env.getState()
iterations = int(sys.argv[1])
pol_file = None
val_file = None
if os.path.exists(policy_filename):
    pol_file = open(policy_filename, 'r+')
    policy = numpy.array(json.loads(pol_file.read()))
    # keep the file open: it is rewritten with the trained policy after the loop
else:
    # create a random initial policy (one preference per state/action pair)
    policy = numpy.random.rand(world_dim[1], world_dim[0], num_possible_moves)
    pol_file = open(policy_filename, 'w+')
if os.path.exists(values_filename):
    val_file = open(values_filename, 'r+')
    values = numpy.array(json.loads(val_file.read()))
    # keep the file open: it is rewritten with the trained values after the loop
else:
    # create an empty value function
    values = numpy.zeros([world_dim[1], world_dim[0]])
    val_file = open(values_filename, 'w+')
def cum_softmax_direction_prop(state):
    # cumulative softmax probability over every possible action in this state
    current_policy = policy[state[1], state[0], :]  # action preferences at the agent's position
    softmax_prop = numpy.exp(current_policy)
    softmax_prop = softmax_prop / numpy.sum(softmax_prop)  # softmax: e^pref / sum(e^pref)
    cum_softmax_prop = numpy.cumsum(softmax_prop)  # cumulative distribution for sampling
    return cum_softmax_prop
def pick_action(state):
    # sample an action index from the softmax distribution
    cum_softmax_prop = cum_softmax_direction_prop(state)
    r = numpy.random.rand()
    for i in range(len(cum_softmax_prop)):
        if cum_softmax_prop[i] > r:
            return i
    return len(cum_softmax_prop) - 1  # fallback for floating point round-off
def critic(state, last_state, reward):
    # TD error: reward + gamma * V(state) - V(last_state)
    error = reward + gamma * values[state[1], state[0]] - values[last_state[1], last_state[0]]
    return error
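# Per-step updates (standard TD(0) actor-critic form, as applied in the loop below):
#   delta                     = reward + gamma * V(state) - V(last_state)
#   V(last_state)            += alpha * delta   (critic)
#   pref(last_state, action) += beta  * delta   (actor)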
i = 0
while i < iterations:
    # time.sleep(0.9)
    i += 1
    sys.stdout.write(str(float(i) / iterations) + "\r")
    sys.stdout.flush()
    direction = pick_action(state)
    last_state = state[:]  # remember the state before moving
    state, outcome = env.move(direction)
    error = critic(state, last_state, outcome)
    if outcome != 0 or state != last_state:
        # print("error ", error)
        # critic update: move V(last_state) toward the TD target
        values[last_state[1], last_state[0]] += alpha * error
        # actor update: strengthen or weaken the chosen action's preference
        policy[last_state[1], last_state[0], direction] += beta * error
    # if outcome != 0:
    #     for row in values:
    #         print(numpy.array(row, dtype=int))

# overwrite the saved files with the trained policy and value function
pol_file.seek(0)
val_file.seek(0)
pol_file.write(json.dumps(policy.tolist()))
val_file.write(json.dumps(values.tolist()))
pol_file.truncate()
val_file.truncate()
print(values)
pol_file.close()
val_file.close()
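# Sketch (illustrative helper names, not part of this script): reading the saved
# policy back and choosing the greedy action for a state, using the same
# [x, y] indexing as above.
#
# def load_policy(filename=policy_filename):
#     with open(filename) as f:
#         return numpy.array(json.loads(f.read()))
#
# def greedy_action(pol, state):
#     return int(numpy.argmax(pol[state[1], state[0], :]))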