-
Notifications
You must be signed in to change notification settings - Fork 0
/
ML.py
106 lines (73 loc) · 2.74 KB
/
ML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pickle
import random
import pandas as pd
import numpy as np
# NumPy is a Python library that supports large multidimensional arrays and matrices.
# Reward matrix R: entry R[s, a] is the immediate reward for taking action a
# (i.e. moving to state a) from state s.  Negative entries mark moves that are
# not available, 0 marks an available move, and 100 marks a move into the
# goal state (column 5).
R = np.matrix(
    [
        [-1, -1, -1, -1, 0, -1],
        [-1, -1, -1, 0, -1, 100],
        [-1, -1, -1, 0, -1, -1],
        [-1, 0, 0, -1, 0, -1],
        [0, -1, -1, 0, -1, 100],
        [-1, 0, -1, -1, 0, 100],
    ]
)

# Q matrix: learned action values, same shape as R, all zeros before training.
Q = np.matrix(np.zeros(R.shape))

# Gamma: discount factor applied to the best value of the next state.
gamma = 0.8

# Initial state, drawn at random from the non-goal states 0..4.
initial_state = random.choice(range(5))
# List every action that may be taken from the given state.
def available_actions(state):
    """Return the column indices of R's row `state` whose reward is >= 0.

    R is an np.matrix, so the selected row is still two-dimensional and
    np.where yields a (row_indices, column_indices) pair; only the column
    indices (the action numbers) are returned.
    """
    state_rewards = R[state, :]
    _, action_columns = np.where(state_rewards >= 0)
    return action_columns
# Actions that can be taken from the randomly chosen initial state
# (column indices into R, as returned by available_actions).
available_act = available_actions(initial_state)
# Choose, uniformly at random, one action out of the supplied candidates.
def sample_next_actions(available_actions_range):
    """Return one randomly selected action as a plain Python int.

    Args:
        available_actions_range: non-empty 1-D sequence or array of
            candidate action indices.

    Returns:
        int: one element of `available_actions_range`.

    Raises:
        ValueError: raised by np.random.choice when the sequence is empty.
    """
    # Sample from the argument itself rather than from the module-level
    # `available_act`, so the result always reflects what the caller passed.
    # Calling np.random.choice without `size` returns a scalar, which avoids
    # the deprecated int() conversion of a one-element array.
    return int(np.random.choice(available_actions_range))
# Randomly pick the first action to perform from the available ones.
action = sample_next_actions(available_act)
# Apply one step of the Q-learning rule to the module-level Q matrix.
def update(current_state, action, gamma):
    """Update Q[current_state, action] in place.

    The new value is the immediate reward R[current_state, action] plus
    `gamma` times the largest Q value in row `action`; the action index is
    used as the index of the next state.

    Args:
        current_state: row index of the state the action is taken from.
        action: column index of the action, also the next state's row index.
        gamma: discount factor applied to the next state's best Q value.

    Returns:
        None. The module-level matrix Q is modified in place.
    """
    # Every entry that ties for the row maximum holds the same value, so the
    # maximum can be taken directly; no random tie-break between indices (and
    # no int() conversion of a one-element index array) is needed.
    max_value = np.max(Q[action,])
    Q[current_state, action] = R[current_state, action] + gamma * max_value
# Apply one Q-learning update for the initial (state, action) pair before
# the training loop starts.
update(initial_state, action, gamma)
# _______________________________________________________
# Training
# Train over 10 000 iterations: repeat the sample-and-update step above from
# a random start state each time so that the goal reward propagates back
# through Q.  The iteration count is named so that it and this comment
# cannot drift apart.
n_iterations = 10000
for _ in range(n_iterations):
    current_state = np.random.randint(0, int(Q.shape[0]))
    # `available_act` is kept as a module-level name because other code in
    # this file refers to it.
    available_act = available_actions(current_state)
    action = sample_next_actions(available_act)
    update(current_state, action, gamma)

# Normalize the "trained" Q matrix so that its largest entry becomes 100.
print("Trained Q matrix:")
q_max = np.max(Q)
if q_max > 0:
    print(Q / q_max * 100)
else:
    # No reward was ever propagated; dividing by zero would only print NaNs.
    print(Q)
# -----------------------------------------------------
# Testing
# Goal state = 5
# Walk greedily from the start state, always moving to the action with the
# highest Q value.  According to R, the shortest routes from state 3 are
# 3 -> 1 -> 5 and 3 -> 4 -> 5 (from state 2 the only first move is 2 -> 3).
goal_state = 5
current_state = 3
steps = [current_state]
# A greedy walk over a trained Q reaches the goal within a few moves; the cap
# turns an endless loop on an under-trained Q into an explicit error.
max_steps = 10 * int(Q.shape[0])
while current_state != goal_state:
    if len(steps) > max_steps:
        raise RuntimeError(
            "greedy walk did not reach the goal state; Q is probably under-trained"
        )
    best_actions = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
    # np.random.choice without `size` returns a scalar, so this handles both
    # the single-maximum and the tied-maximum case without converting a
    # one-element array with int() (deprecated in NumPy).
    next_step_index = int(np.random.choice(best_actions))
    steps.append(next_step_index)
    current_state = next_step_index
# print selected sequence of steps
print("Selected path:")
# Debug output kept from the original script: shows the type of the final state.
print(type(current_state))
print(steps)