# DRL.py


def part_0():
    """Train a single-layer Q-network on gym's ``FrozenLake-v0`` and print results.

    A TensorFlow 1.x graph maps a one-hot state row of width 16 through one
    16x4 weight matrix to four Q-values.  The network is trained with
    epsilon-greedy Q-learning for 2000 episodes of at most 99 steps each.
    Afterwards the function prints the summed reward divided by the number
    of episodes and, for every state, the index of the largest weight (the
    greedy action).

    Takes no arguments and returns nothing; all results go to stdout.
    """
    import gym
    import numpy as np
    import tensorflow as tf
    import matplotlib.pyplot as plt

    env = gym.make('FrozenLake-v0')
    env.render()

    tf.reset_default_graph()

    # Feed-forward part of the network used to choose actions: a one-hot
    # state row multiplied by W gives one Q-value per action.
    inputs1 = tf.placeholder(shape=[1, 16], dtype=tf.float32)
    W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
    Qout = tf.matmul(inputs1, W)
    predict = tf.argmax(Qout, 1)

    # The loss is the sum of squared differences between the target and the
    # predicted Q-values.
    nextQ = tf.placeholder(shape=[1, 4], dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(nextQ - Qout))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    updateModel = trainer.minimize(loss)

    # tf.initialize_all_variables() is deprecated; use the global initializer.
    init = tf.global_variables_initializer()
    print(init)

    # Learning parameters.
    y = .99                 # discount factor applied to the next state's max Q
    e = 0.1                 # initial chance of taking a random action
    num_episodes = 2000     # default: 2000

    # Row s of the identity matrix is the one-hot encoding of state s.
    # Build the matrix once instead of on every network call.
    one_hot_states = np.identity(16)

    # Steps taken and total reward collected, one entry per episode.
    jList = []
    rList = []
    with tf.Session() as sess:
        sess.run(init)
        for i in range(num_episodes):
            if i % 100 == 0:
                print('episode:', i)

            # Reset environment and get first new observation.
            s = env.reset()
            rAll = 0
            d = False
            j = 0
            # The Q-Network
            while j < 99:
                j += 1
                # Choose an action greedily (with e chance of a random
                # action) from the Q-network.
                a, allQ = sess.run([predict, Qout],
                                   feed_dict={inputs1: one_hot_states[s:s + 1]})
                if np.random.rand(1) < e:
                    a[0] = env.action_space.sample()

                # Get new state and reward from environment.
                s1, r, d, _ = env.step(a[0])
                # Obtain the Q' values by feeding the new state through our network.
                Q1 = sess.run(Qout, feed_dict={inputs1: one_hot_states[s1:s1 + 1]})
                # Obtain maxQ' and set our target value for the chosen action;
                # the other actions keep their current prediction so they
                # contribute nothing to the loss.
                maxQ1 = np.max(Q1)
                targetQ = allQ
                targetQ[0, a[0]] = r + y * maxQ1
                # Train our network using target and predicted Q values.
                sess.run(updateModel,
                         feed_dict={inputs1: one_hot_states[s:s + 1], nextQ: targetQ})
                rAll += r
                s = s1
                if d:
                    # Reduce chance of random action as we train the model.
                    e = 1. / ((i / 50) + 10)
                    break

            jList.append(j)
            rList.append(rAll)

        # Read the trained weights once, after all updates have been applied.
        # Fetching W in the same run() call as the update op leaves it
        # unspecified whether the value is from before or after the update.
        W1 = sess.run(W)

    # The printed value is sum(rList) / num_episodes, i.e. a fraction, not a
    # number scaled to 0-100.
    print("Percent of succesful episodes: " + str(sum(rList) / num_episodes))

    # Greedy action per state according to the learned weights.
    print(np.argmax(W1, axis=1))

    # Optional plots of reward and episode length:
    # plt.plot(rList)
    # plt.show()
    #
    # plt.plot(jList)
    # plt.show()


def part_4():
    """Train a dueling double-DQN agent on the 5x5 ``gridworld`` environment.

    Builds a primary and a target ``Qnetwork`` (TensorFlow 1.x / slim), fills
    an experience replay buffer, and trains the primary network with
    double-DQN targets while softly moving the target network toward it.
    Checkpoints are written under ``./dqn``.  After training, the summed
    reward divided by the number of episodes is printed and the mean reward
    per block of 100 episodes is plotted.

    Takes no arguments and returns nothing.
    """
    import gym
    import numpy as np
    import random
    import tensorflow as tf
    import tensorflow.contrib.slim as slim
    import matplotlib.pyplot as plt
    import scipy.misc
    import os

    from gridworld import gameEnv

    env = gameEnv(partial=False, size=5)
    # print(env)
    # print(env.renderEnv())

    class Qnetwork():
        """Dueling Q-network over flattened 84x84x3 frames."""

        def __init__(self, h_size):
            """Build the graph.

            h_size is the channel count of the last convolution; it is split
            evenly between the advantage and the value stream.
            """
            # The network receives a frame from the game, flattened into an array.
            # It then resizes it and processes it through four convolutional layers.
            # With VALID padding the spatial size shrinks 84 -> 20 -> 9 -> 7 -> 1.
            self.scalarInput = tf.placeholder(shape=[None, 21168], dtype=tf.float32)
            self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 84, 84, 3])
            self.conv1 = slim.conv2d(inputs=self.imageIn, num_outputs=32, kernel_size=[8, 8], stride=[4, 4],
                                     padding='VALID', biases_initializer=None)
            self.conv2 = slim.conv2d(inputs=self.conv1, num_outputs=64, kernel_size=[4, 4], stride=[2, 2],
                                     padding='VALID', biases_initializer=None)
            self.conv3 = slim.conv2d(inputs=self.conv2, num_outputs=64, kernel_size=[3, 3], stride=[1, 1],
                                     padding='VALID', biases_initializer=None)
            self.conv4 = slim.conv2d(inputs=self.conv3, num_outputs=h_size, kernel_size=[7, 7], stride=[1, 1],
                                     padding='VALID', biases_initializer=None)

            # We take the output from the final convolutional layer and split it
            # (along the channel axis) into separate advantage and value streams.
            self.streamAC, self.streamVC = tf.split(self.conv4, 2, 3)
            self.streamA = slim.flatten(self.streamAC)
            self.streamV = slim.flatten(self.streamVC)
            xavier_init = tf.contrib.layers.xavier_initializer()
            self.AW = tf.Variable(xavier_init([h_size // 2, env.actions]))
            self.VW = tf.Variable(xavier_init([h_size // 2, 1]))
            self.Advantage = tf.matmul(self.streamA, self.AW)
            self.Value = tf.matmul(self.streamV, self.VW)

            # Then combine them together to get our final Q-values:
            # Q = V + (A - mean(A)).
            self.Qout = self.Value + tf.subtract(self.Advantage,
                                                 tf.reduce_mean(self.Advantage, axis=1, keep_dims=True))
            self.predict = tf.argmax(self.Qout, 1)

            # The loss is the mean squared difference between the target
            # Q-value and the predicted Q-value of the action actually taken.
            self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.actions_onehot = tf.one_hot(self.actions, env.actions, dtype=tf.float32)

            # Select the Q-value of the taken action by masking with the one-hot.
            self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)

            self.td_error = tf.square(self.targetQ - self.Q)
            self.loss = tf.reduce_mean(self.td_error)
            self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
            self.updateModel = self.trainer.minimize(self.loss)

    class experience_buffer():
        """Bounded list of transitions; the oldest entries are dropped first."""

        def __init__(self, buffer_size=50000):
            self.buffer = []
            self.buffer_size = buffer_size

        def add(self, experience):
            """Append every row of ``experience``, evicting old rows to make room."""
            if len(self.buffer) + len(experience) >= self.buffer_size:
                # Drop as many of the oldest rows as the new rows overflow by.
                self.buffer[0:(len(experience) + len(self.buffer)) - self.buffer_size] = []
            self.buffer.extend(experience)

        def sample(self, size):
            """Return ``size`` distinct random transitions as a [size, 5] array."""
            return np.reshape(np.array(random.sample(self.buffer, size)), [size, 5])

    def processState(states):
        """Flatten a frame into the 21168-element vector the network expects."""
        return np.reshape(states, [21168])

    def updateTargetGraph(tfVars, tau):
        """Create ops that move the second half of ``tfVars`` toward the first half.

        Each op assigns ``tau * source + (1 - tau) * destination``.
        NOTE(review): this relies on ``tfVars`` listing the primary network's
        variables first and the target network's variables second, in the
        same order -- confirm if the graph construction changes.
        """
        total_vars = len(tfVars)
        op_holder = []
        for idx, var in enumerate(tfVars[0:total_vars // 2]):
            op_holder.append(tfVars[idx + total_vars // 2].assign(
                (var.value() * tau) + ((1 - tau) * tfVars[idx + total_vars // 2].value())))
        return op_holder

    def updateTarget(op_holder, sess):
        """Run every assignment op in ``op_holder`` in the given session."""
        for op in op_holder:
            sess.run(op)

    batch_size = 32  # How many experiences to use for each training step.
    update_freq = 4  # How often to perform a training step.
    y = .99  # Discount factor on the target Q-values
    startE = 1  # Starting chance of random action
    endE = 0.1  # Final chance of random action
    annealing_steps = 10000.  # How many steps of training to reduce startE to endE.
    num_episodes = 220    # How many episodes of game environment to train network with.    ##### default = 10000
    pre_train_steps = 10000  # How many steps of random actions before training begins.
    max_epLength = 50  # The max allowed length of our episode.
    load_model = False  # Whether to load a saved model.
    path = "./dqn"  # The path to save our model to.
    h_size = 512  # The size of the final convolutional layer before splitting it into Advantage and Value streams.
    tau = 0.001  # Rate to update target network toward primary network

    tf.reset_default_graph()
    mainQN = Qnetwork(h_size)
    targetQN = Qnetwork(h_size)

    init = tf.global_variables_initializer()

    saver = tf.train.Saver()

    trainables = tf.trainable_variables()

    targetOps = updateTargetGraph(trainables, tau)

    myBuffer = experience_buffer()

    # Set the rate of random action decrease.
    e = startE
    stepDrop = (startE - endE) / annealing_steps

    # create lists to contain total rewards and steps per episode
    jList = []
    rList = []
    total_steps = 0

    # Make a path for our model to be saved in.
    if not os.path.exists(path):
        os.makedirs(path)

    with tf.Session() as sess:
        sess.run(init)
        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        for i in range(num_episodes):
            print('episode:', i)

            episodeBuffer = experience_buffer()
            # Reset environment and get first new observation
            s = env.reset()
            s = processState(s)
            d = False
            rAll = 0
            j = 0
            # The Q-Network
            while j < max_epLength:  # End the trial once the agent has taken max_epLength moves.
                j += 1
                # Choose an action greedily (with e chance of random action) from the Q-network.
                # During the first pre_train_steps steps every action is random.
                if np.random.rand(1) < e or total_steps < pre_train_steps:
                    # Draw from the same action count the network was built with.
                    a = np.random.randint(0, env.actions)
                else:
                    a = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: [s]})[0]
                s1, r, d = env.step(a)
                s1 = processState(s1)
                total_steps += 1
                # Save the experience to our episode buffer.  The transition mixes
                # arrays and scalars, so it must be an explicit object array;
                # newer NumPy refuses to build such a ragged array implicitly.
                episodeBuffer.add(
                    np.reshape(np.array([s, a, r, s1, d], dtype=object), [1, 5]))

                if total_steps > pre_train_steps:
                    if e > endE:
                        e -= stepDrop

                    # Only train when the replay buffer can supply a full batch;
                    # random.sample raises ValueError otherwise.
                    if total_steps % update_freq == 0 and len(myBuffer.buffer) >= batch_size:
                        trainBatch = myBuffer.sample(batch_size)  # Get a random batch of experiences.
                        # Below we perform the Double-DQN update to the target Q-values:
                        # the primary network picks the next action, the target
                        # network supplies that action's value.
                        Q1 = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 3])})
                        Q2 = sess.run(targetQN.Qout, feed_dict={targetQN.scalarInput: np.vstack(trainBatch[:, 3])})
                        # 1 - done: zero for terminal transitions, so they get no bootstrap term.
                        end_multiplier = -(trainBatch[:, 4] - 1)
                        doubleQ = Q2[range(batch_size), Q1]
                        targetQ = trainBatch[:, 2] + (y * doubleQ * end_multiplier)
                        # Update the network with our target values.
                        _ = sess.run(mainQN.updateModel, feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:, 0]),
                                                                    mainQN.targetQ: targetQ,
                                                                    mainQN.actions: trainBatch[:, 1]})

                        updateTarget(targetOps, sess)  # Update the target network toward the primary network.
                rAll += r
                s = s1

                if d:
                    break

            # Experiences only reach the replay buffer once the episode is over.
            myBuffer.add(episodeBuffer.buffer)
            jList.append(j)
            rList.append(rAll)
            # Periodically save the model.
            if i % 1000 == 0:
                saver.save(sess, path + '/model-' + str(i) + '.ckpt')
                print("Saved Model")
            if len(rList) % 10 == 0:
                print(total_steps, np.mean(rList[-10:]), e)
        saver.save(sess, path + '/model-' + str(i) + '.ckpt')
    # The printed value is sum(rList) / num_episodes (total reward per episode).
    print("Percent of successful episodes: " + str(sum(rList) / num_episodes) + "%")

    # One row per complete block of 100 episodes; a trailing partial block is dropped.
    rMat = np.resize(np.array(rList), [len(rList) // 100, 100])
    rMean = np.average(rMat, 1)
    plt.plot(rMean)
    plt.show()


# Note: with num_episodes = 220 in part_4, rMean.shape == (2,) -- one mean per complete block of 100 episodes.


# Script entry point: run the gridworld experiment when executed directly.
if "__main__" == __name__:
    part_4()