-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathModel.py
More file actions
90 lines (77 loc) · 4.62 KB
/
Model.py
File metadata and controls
90 lines (77 loc) · 4.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import numpy as np
import random
import tensorflow as tf
import tensorflow.contrib.slim as slim
class Qnetwork:
    """TF1 graph for a Dueling Deep Recurrent Q-Network (DRQN).

    Builds, in the default graph: a 4-layer convolutional encoder over
    84x84x3 frames, a recurrent layer run over traces of consecutive
    frames, dueling Value/Advantage output streams combined into
    Q-values, and an Adam training op over a masked TD-error loss.
    """

    def __init__(self, h_size, rnn_cell, myScope):
        """Construct all placeholders, layers, and ops for this network.

        Args:
            h_size: Number of units out of the final conv layer (and fed
                through the RNN). NOTE(review): must be even — the RNN
                output is split in half for the dueling streams below.
            rnn_cell: A TF1 RNN cell (supplies `zero_state` and is passed
                to `tf.nn.dynamic_rnn`).
            myScope: String prefix that keeps this network's variables
                distinct from a second network's (e.g. main vs. target).
        """
        # The network receives a frame from the game, flattened into an array.
        # It then resizes it and processes it through four convolutional layers.
        # 21168 = 84 * 84 * 3, i.e. one flattened RGB frame per row.
        self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
        self.conv1 = slim.convolution2d(
            inputs=self.imageIn, num_outputs=32,
            kernel_size=[8, 8], stride=[4, 4], padding='VALID',
            biases_initializer=None, scope=myScope + '_conv1')
        self.conv2 = slim.convolution2d(
            inputs=self.conv1, num_outputs=64,
            kernel_size=[4, 4], stride=[2, 2], padding='VALID',
            biases_initializer=None, scope=myScope + '_conv2')
        self.conv3 = slim.convolution2d(
            inputs=self.conv2, num_outputs=64,
            kernel_size=[3, 3], stride=[1, 1], padding='VALID',
            biases_initializer=None, scope=myScope + '_conv3')
        # Final conv collapses the remaining 7x7 spatial extent to 1x1,
        # leaving h_size features per frame.
        self.conv4 = slim.convolution2d(
            inputs=self.conv3, num_outputs=h_size,
            kernel_size=[7, 7], stride=[1, 1], padding='VALID',
            biases_initializer=None, scope=myScope + '_conv4')
        # Number of consecutive frames per trace, fed at run time.
        self.trainLength = tf.placeholder(dtype=tf.int32)
        # We take the output from the final convolutional layer and send it to a recurrent layer.
        # The input must be reshaped into [batch x trace x units] for rnn processing,
        # and then returned to [batch x units] when sent through the upper levels.
        self.batch_size = tf.placeholder(dtype=tf.int32, shape=[])
        self.convFlat = tf.reshape(slim.flatten(self.conv4), [self.batch_size, self.trainLength, h_size])
        self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32)
        # rnn_state is returned so the caller can thread recurrent state
        # across steps of an episode.
        self.rnn, self.rnn_state = tf.nn.dynamic_rnn(
            inputs=self.convFlat, cell=rnn_cell, dtype=tf.float32, initial_state=self.state_in, scope=myScope + '_rnn')
        self.rnn = tf.reshape(self.rnn, shape=[-1, h_size])
        # The output from the recurrent player is then split into separate Value and Advantage streams
        # (dueling architecture); each stream gets h_size // 2 features.
        self.streamA, self.streamV = tf.split(self.rnn, 2, 1)
        # 4 = size of the environment's action space — TODO confirm against caller.
        self.AW = tf.Variable(tf.random_normal([h_size // 2, 4]))
        self.VW = tf.Variable(tf.random_normal([h_size // 2, 1]))
        self.Advantage = tf.matmul(self.streamA, self.AW)
        self.Value = tf.matmul(self.streamV, self.VW)
        # Gradient of Advantage w.r.t. the input image, exposed for
        # visualizing which pixels drive the action choice.
        self.salience = tf.gradients(self.Advantage, self.imageIn)
        # Then combine them together to get our final Q-values.
        # NOTE(review): keep_dims is the TF1 spelling (keepdims in TF2).
        self.Qout = self.Value + tf.subtract(self.Advantage, tf.reduce_mean(self.Advantage, axis=1, keep_dims=True))
        self.predict = tf.argmax(self.Qout, 1)
        # Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, 4, dtype=tf.float32)
        # Q-value of the action actually taken, per (frame, trace) row.
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)
        self.td_error = tf.square(self.targetQ - self.Q)
        # In order to only propagate accurate gradients through the network, we will mask the first
        # half of the losses for each trace as per Lample & Chatlot 2016
        # NOTE(review): the mask has 2 * (trainLength // 2) columns, so
        # trainLength must be even or the elementwise product below fails.
        self.maskA = tf.zeros([self.batch_size, self.trainLength // 2])
        self.maskB = tf.ones([self.batch_size, self.trainLength // 2])
        self.mask = tf.concat([self.maskA, self.maskB], 1)
        self.mask = tf.reshape(self.mask, [-1])
        self.loss = tf.reduce_mean(self.td_error * self.mask)
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)
class experience_buffer():
    """Episode replay buffer for the recurrent Q-network.

    Stores whole episodes (sequences of 5-field experience rows) and
    samples fixed-length contiguous traces from randomly chosen episodes.
    """

    def __init__(self, buffer_size=1000):
        # Each buffer entry is one complete episode; the oldest episodes
        # are evicted first once capacity is reached.
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        """Append one episode, evicting the oldest if at capacity."""
        overflow = (len(self.buffer) + 1) - self.buffer_size
        if overflow >= 0:
            # overflow == 0 deletes nothing; > 0 drops the oldest episodes.
            del self.buffer[:overflow]
        self.buffer.append(experience)

    def sample(self, batch_size, trace_length):
        """Return `batch_size` random traces of `trace_length` steps each.

        Picks `batch_size` distinct episodes, then a uniform random start
        within each, and flattens the result to shape
        [batch_size * trace_length, 5].
        NOTE(review): assumes every stored episode has at least
        `trace_length` steps and rows of width 5.
        """
        chosen = random.sample(self.buffer, batch_size)
        traces = []
        for episode in chosen:
            start = np.random.randint(0, len(episode) + 1 - trace_length)
            traces.append(episode[start:start + trace_length])
        return np.reshape(np.array(traces), [batch_size * trace_length, 5])