Commit ebadf44

Merge pull request #6 from josiahls/version_0_7_0

DDPG / Testing Init

2 parents: ed2d54f + 13e1227

19 files changed: +446 -349 lines changed

README.md

Lines changed: 8 additions & 2 deletions
@@ -1,3 +1,9 @@
+[![Build Status](https://dev.azure.com/jokellum/jokellum/_apis/build/status/josiahls.fast-reinforcement-learning?branchName=master)](https://dev.azure.com/jokellum/jokellum/_build/latest?definitionId=1&branchName=master)
+[![pypi fasti_rl version](https://img.shields.io/pypi/v/fast_rl)](https://pypi.python.org/pypi/fast_rl)
+[![github_master version](https://img.shields.io/github/v/release/josiahls/fast-reinforcement-learning?include_prereleases)](https://github.com/josiahls/fast-reinforcement-learning/releases)
+
+**Note: Test passing will not be a useful stability indicator until version 1.0+**
+
 # Fast Reinforcement Learning
 This repo is not affiliated with Jeremy Howard or his course which can be found here: [here](https://www.fast.ai/about/)
 We will be using components from the Fastai library however for building and training our reinforcement learning (RL)
@@ -221,8 +227,8 @@ learn.fit(5)
 ```

-- [ ] **Working On** 0.7.0 Full test suite using multi-processing. Connect to CI.
-- [ ] 0.8.0 Comprehensive model eval **debug/verify**. Each model should succeed at at least a few known environments.
+- [X] 0.7.0 Full test suite using multi-processing. Connect to CI.
+- [ ] **Working On** 0.8.0 Comprehensive model eval **debug/verify**. Each model should succeed at at least a few known environments.
 - [ ] 0.9.0 Notebook demonstrations of basic model usage
 - [ ] **1.0.0** Base version is completed with working model visualizations proving performance / expected failure. At
 this point, all models should have guaranteed environments they should succeed in.
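The README's training example (the `learn.fit(5)` context shown in the hunk header above) pairs an `MDPDataBunch` with an agent and an `AgentLearner`. A minimal, hypothetical sketch of how the DDPG agent added in this PR could slot into that pattern; the `MDPDataBunch.from_env` call, the `AgentLearner(data, model)` signature, and the environment name are assumptions not shown in this diff, while the `DDPG` keyword arguments come from the constructor below:

```python
from fast_rl.agents.DDPG import DDPG
from fast_rl.core.Learner import AgentLearner
from fast_rl.core.MarkovDecisionProcess import MDPDataBunch

# Assumed data-bunch constructor; a continuous-control env suits DDPG.
data = MDPDataBunch.from_env('Pendulum-v0', render='rgb_array')
model = DDPG(data, batch=64, lr=1e-3, actor_lr=1e-4)  # kwargs per this PR's DDPG.__init__
learn = AgentLearner(data, model)                     # assumed learner signature
learn.fit(5)
```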

azure-pipelines.yml

Lines changed: 33 additions & 6 deletions
@@ -7,13 +7,40 @@ trigger:
 - master

 pool:
-  vmImage: 'ubuntu-latest'
+  vmImage: 'ubuntu-16.04'

 steps:
-- script: echo Hello, world!
-  displayName: 'Run a one-line script'
+
+- bash: "sudo apt-get install -y ffmpeg xvfb freeglut3-dev python-opengl"
+  displayName: 'Install ffmpeg, freeglut3-dev, and xvfb'
+
+- task: UsePythonVersion@0
+  inputs:
+    versionSpec: '3.6'
+
+- script: sh ./build/azure_pipeline_helper.sh
+  displayName: 'Complex Installs'

 - script: |
-    echo Add other tasks to build, test, and deploy your project.
-    echo See https://aka.ms/yaml
-  displayName: 'Run a multi-line script'
+    pip install Bottleneck
+    python setup.py install
+    pip install pytest
+    pip install pytest-cov
+    pip install pytest-xdist
+  displayName: 'Install Python Packages'
+
+- script: |
+    xvfb-run -s "-screen 0 1400x900x24" pytest -n 8 fast_rl/tests --doctest-modules --junitxml=junit/test-results.xml --cov=./ --cov-report=xml --cov-report=html
+  displayName: 'Test with pytest'
+
+- task: PublishTestResults@2
+  condition: succeededOrFailed()
+  inputs:
+    testResultsFiles: '**/test-*.xml'
+    testRunTitle: 'Publish test results for Python $(python.version)'
+
+- task: PublishCodeCoverageResults@1
+  inputs:
+    codeCoverageTool: Cobertura
+    summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml'
+    reportDirectory: '$(System.DefaultWorkingDirectory)/**/htmlcov'

build/azure_pipeline_helper.sh

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+# Install pybullet
+git clone https://github.com/benelot/pybullet-gym.git
+cd pybullet-gym
+pip install -e .
+cd ../
+
+# Install gym_maze
+git clone https://github.com/MattChanTK/gym-maze.git
+cd gym-maze
+python setup.py install
+cd ../
+

fast_rl/agents/BaseAgent.py

Lines changed: 30 additions & 19 deletions
@@ -45,13 +45,12 @@ def pick_action(self, x):
         with torch.no_grad():
             if len(x.shape) > 2: raise ValueError('The agent is outputting actions with more than 1 dimension...')

-            action, x, perturbed = self.exploration_strategy.perturb(x, x, self.data.train_ds.env.action_space)
-            x = np.clip(x, -1.0, 1.0)
+            if isinstance(self.data.train_ds.env.action_space, Discrete): action = x.argmax().numpy().item()
+            elif isinstance(self.data.train_ds.env.action_space, Box) and len(x.shape) != 1: action = x.squeeze(0).numpy()

-            if isinstance(self.data.train_ds.env.action_space, Discrete) and not perturbed: action = x.argmax().numpy().item()
-            elif isinstance(self.data.train_ds.env.action_space, Box): action = x.squeeze(0).numpy()
+            action = self.exploration_strategy.perturb(action, self.data.train_ds.env.action_space)

-            return action, x
+            return action

     def interpret_q(self, items):
         raise NotImplementedError
@@ -68,13 +67,20 @@ def forward(self, x):
         return x.long()


+class SwapImageChannel(nn.Module):
+    def forward(self, x):
+        return x.transpose(1, 3)
+
+
 class Flatten(nn.Module):
     def forward(self, x):
         return x.view(x.size(0), -1)


-def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use_embed=True,
-                    activation_function=None, final_activation_function=None):
+
+
+def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use_embed=False,
+                    activation_function=None, final_activation_function=None, action_val_to_dim=True):
     """Generates an nn module.

     Notes:
@@ -84,7 +90,8 @@ def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use

     """
     act = nn.LeakyReLU if activation_function is None else activation_function
-    action_size = action_size[0]  # For now the dimension of the action does not make a difference.
+    # For now the dimension of the action does not make a difference.
+    action_size = action_size[0] if not action_val_to_dim else action_size[1]
     # For now keep drop out as 0, test including dropout later
     ps = [0] * len(layer_list)
     sizes = [state_size] + layer_list + [action_size]
@@ -126,24 +133,25 @@ def get_conv(input_tuple, act, kernel_size, stride, n_conv_layers, layers):
         \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor


-    :param input_tuple:
-    :param act:
-    :param kernel_size:
-    :param stride:
-    :param n_conv_layers:
-    :param layers:
-    :return:
+    Args:
+        input_tuple:
+        act:
+        kernel_size:
+        stride:
+        n_conv_layers:
+        layers:
     """
     h, w = input_tuple[0], input_tuple[1]
-    conv_layers = []
+    conv_layers = [SwapImageChannel()]
     for i in range(n_conv_layers):
         h, w = get_next_conv_shape(h, w, stride, kernel_size)
         conv_layers.append(torch.nn.Conv2d(input_tuple[2], 3, kernel_size=kernel_size, stride=stride))
         conv_layers.append(act)
     return layers + conv_layers, 3 * (h + 1) * (w + 1)


-def create_cnn_model(layer_list: list, action_size, state_size, use_bn=False, kernel_size=5, stride=3, n_conv_layers=3):
+def create_cnn_model(layer_list: list, action_size, state_size, use_bn=False, kernel_size=5, stride=3, n_conv_layers=3,
+                     activation_function=None, final_activation_function=None, action_val_to_dim=True):
     """Generates an nn module.

     Notes:
@@ -152,15 +160,18 @@ def create_cnn_model(layer_list: list, action_size, state_size, use_bn=False, ke
     Returns:

     """
+    act = nn.LeakyReLU if activation_function is None else activation_function
     # For now keep drop out as 0, test including dropout later
     ps = [0] * len(layer_list)
-    sizes = [state_size] + layer_list + [action_size]
-    actns = [nn.ReLU() for _ in range(n_conv_layers + len(sizes) - 2)] + [None]
+    action_size = action_size[0] if not action_val_to_dim else action_size[1]
+    sizes = [state_size[0]] + layer_list + [action_size]
+    actns = [act() for _ in range(n_conv_layers + len(sizes) - 2)] + [None]
     layers = []
     for i, (n_in, n_out, dp, act) in enumerate(zip(sizes[:-1], sizes[1:], [0.] + ps, actns)):
         if type(n_in) == tuple:
             layers, n_in = get_conv(n_in, act, kernel_size, n_conv_layers=n_conv_layers, layers=layers, stride=stride)
             layers += [Flatten()]

         layers += bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act)
+    if final_activation_function is not None: layers += [final_activation_function()]
     return nn.Sequential(*layers)
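For reference, the output-size expression in the `get_conv` docstring above is the standard Conv2d shape rule with no padding and dilation 1, which is presumably what `get_next_conv_shape` computes. A small illustrative sketch of that arithmetic; `conv_output_hw` is a hypothetical helper, not part of this commit:

```python
import math

def conv_output_hw(h, w, kernel_size, stride):
    # floor((dim - (kernel_size - 1) - 1) / stride + 1), per the docstring formula
    h_out = math.floor((h - (kernel_size - 1) - 1) / stride) + 1
    w_out = math.floor((w - (kernel_size - 1) - 1) / stride) + 1
    return h_out, w_out

# Example: an 84x84 frame through a kernel_size=8, stride=2 convolution
print(conv_output_hw(84, 84, kernel_size=8, stride=2))  # -> (39, 39)
```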

fast_rl/agents/DDPG.py

Lines changed: 51 additions & 17 deletions
@@ -8,7 +8,8 @@
 from torch.nn import MSELoss
 from torch.optim import Adam

-from fast_rl.agents.BaseAgent import BaseAgent, create_nn_model
+from fast_rl.agents.BaseAgent import BaseAgent, create_nn_model, create_cnn_model, get_next_conv_shape, get_conv, \
+    Flatten
 from fast_rl.core.Learner import AgentLearner
 from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
 from fast_rl.core.agent_core import GreedyEpsilon, ExperienceReplay
@@ -27,6 +28,8 @@ def on_train_begin(self, n_epochs, **kwargs: Any):

     def on_epoch_begin(self, epoch, **kwargs: Any):
         self.episode = epoch
+        # if self.learn.model.training and self.iteration != 0:
+        #     self.learn.model.memory.update(item=self.learn.data.x.items[-1])
         self.iteration = 0

     def on_loss_begin(self, **kwargs: Any):
@@ -47,7 +50,7 @@ def on_loss_begin(self, **kwargs: Any):
         # self.learn.model.target_copy_over()


-class Critic(nn.Module):
+class NNCritic(nn.Module):
     def __init__(self, layer_list: list, action_size, state_size, use_bn=False, use_embed=True,
                  activation_function=None):
         super().__init__()
@@ -59,7 +62,7 @@ def __init__(self, layer_list: list, action_size, state_size, use_bn=False, use_
         self.fc3 = nn.Linear(layer_list[1], 1)

     def forward(self, x):
-        action, x = x[:, self.state_size:], x[:, :self.state_size]
+        x, action = x

         x = nn.LeakyReLU()(self.fc1(x))
         x = nn.LeakyReLU()(self.fc2(torch.cat((x, action), 1)))
@@ -68,17 +71,41 @@ def forward(self, x):
         return x


+class CNNCritic(nn.Module):
+    def __init__(self, layer_list: list, action_size, state_size, activation_function=None):
+        super().__init__()
+        self.action_size = action_size[0]
+        self.state_size = state_size[0]
+
+        layers = []
+        layers, input_size = get_conv(self.state_size, nn.LeakyReLU(), 8, 2, 3, layers)
+        layers += [Flatten()]
+        self.conv_layers = nn.Sequential(*layers)
+
+        self.fc1 = nn.Linear(input_size + self.action_size, 200)
+        self.fc2 = nn.Linear(200, 1)
+
+    def forward(self, x):
+        x, action = x
+
+        x = nn.LeakyReLU()(self.conv_layers(x))
+        x = nn.LeakyReLU()(self.fc1(torch.cat((x, action), 1)))
+        x = nn.LeakyReLU()(self.fc2(x))
+
+        return x
+
+
 class DDPG(BaseAgent):

     def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount=0.99,
-                 lr=1e-3, actor_lr=1e-4, exploration_strategy=None, env_was_discrete=False):
+                 lr=1e-3, actor_lr=1e-4, exploration_strategy=None):
         """
         Implementation of a continuous control algorithm using an actor/critic architecture.

         Notes:
             Uses 4 networks, 2 actors, 2 critics.
             All models use batch norm for feature invariance.
-            Critic simply predicts Q while the Actor proposes the actions to take given a state s.
+            NNCritic simply predicts Q while the Actor proposes the actions to take given a state s.

         References:
             [1] Lillicrap, Timothy P., et al. "Continuous control with deep reinforcement learning."
@@ -93,7 +120,6 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount
             lr: Rate that the opt will learn parameter gradients.
         """
         super().__init__(data)
-        self.env_was_discrete = env_was_discrete
         self.name = 'DDPG'
         self.lr = lr
         self.discount = discount
@@ -122,21 +148,30 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount
                                                                    do_exploration=self.training))

     def initialize_action_model(self, layers, data):
-        return create_nn_model(layers, *data.get_action_state_size(), False, use_embed=data.train_ds.embeddable,
-                               final_activation_function=nn.Tanh)
+        actions, state = data.get_action_state_size()
+        if type(state[0]) is tuple and len(state[0]) == 3:
+            # actions, state = actions[0], state[0]
+            # If the shape has 3 dimensions, we will try using cnn's instead.
+            return create_cnn_model([200, 200], actions, state, False, kernel_size=8,
+                                    final_activation_function=nn.Tanh, action_val_to_dim=False)
+        else:
+            return create_nn_model(layers, *data.get_action_state_size(), False, use_embed=data.train_ds.embeddable,
+                                   final_activation_function=nn.Tanh, action_val_to_dim=False)

     def initialize_critic_model(self, layers, data):
         """ Instead of state -> action, we are going state + action -> single expected reward. """
-        return Critic(layers, *data.get_action_state_size())
+        actions, state = data.get_action_state_size()
+        if type(state[0]) is tuple and len(state[0]) == 3:
+            return CNNCritic(layers, *data.get_action_state_size())
+        else:
+            return NNCritic(layers, *data.get_action_state_size())

     def pick_action(self, x):
         if self.training: self.action_model.eval()
         with torch.no_grad():
-            action, x = super(DDPG, self).pick_action(x)
+            action = super(DDPG, self).pick_action(x)
         if self.training: self.action_model.train()
-
-        if not self.env_was_discrete: action = np.clip(action, -1, 1)
-        return action, np.clip(x, -1, 1)
+        return np.clip(action, -1, 1)

     def optimize(self):
         """
@@ -160,12 +195,11 @@ def optimize(self):
         s_prime = torch.from_numpy(np.array([item.result_state for item in sampled])).float()
         s = torch.from_numpy(np.array([item.current_state for item in sampled])).float()
         a = torch.from_numpy(np.array([item.actions for item in sampled]).astype(float)).float()
-        if self.env_was_discrete: a = torch.from_numpy(np.array([item.raw_action for item in sampled]).astype(float)).float()

         with torch.no_grad():
-            y = r + self.discount * self.t_critic_model(torch.cat((s_prime, self.t_action_model(s_prime)), 1))
+            y = r + self.discount * self.t_critic_model((s_prime, self.t_action_model(s_prime)))

-        y_hat = self.critic_model(torch.cat((s, a), 1))
+        y_hat = self.critic_model((s, a))

         critic_loss = self.loss_func(y_hat, y)

@@ -175,7 +209,7 @@ def optimize(self):
         critic_loss.backward()
         self.critic_optimizer.step()

-        actor_loss = -self.critic_model(torch.cat((s, self.action_model(s)), 1)).mean()
+        actor_loss = -self.critic_model((s, self.action_model(s))).mean()

         self.loss = critic_loss.cpu().detach()
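To make the `optimize` changes above easier to follow: with the new tuple interface, the critic is called as `critic_model((state, action))` rather than on a concatenated tensor. Below is a condensed, hedged sketch of the DDPG update this method appears to implement (Lillicrap et al., cited in the docstring), including the soft target update implied by `tau`; the `ddpg_update` and `soft_update` names are illustrative, not the repo's API:

```python
import torch

def ddpg_update(critic, actor, t_critic, t_actor, critic_opt, actor_opt,
                s, a, r, s_prime, discount=0.99, tau=1e-3):
    """One DDPG step: critic regression toward the Bellman target, a policy
    gradient step on the actor, then Polyak averaging of the target networks."""
    # Critic: y = r + gamma * Q'(s', mu'(s')); minimize MSE(Q(s, a), y)
    with torch.no_grad():
        y = r + discount * t_critic((s_prime, t_actor(s_prime)))
    critic_loss = torch.nn.functional.mse_loss(critic((s, a)), y)
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()

    # Actor: maximize Q(s, mu(s)) by minimizing its negative mean
    actor_loss = -critic((s, actor(s))).mean()
    actor_opt.zero_grad()
    actor_loss.backward()
    actor_opt.step()

    # Soft target update: theta' <- tau * theta + (1 - tau) * theta'
    def soft_update(target, online):
        for tp, p in zip(target.parameters(), online.parameters()):
            tp.data.copy_(tau * p.data + (1 - tau) * tp.data)
    soft_update(t_critic, critic)
    soft_update(t_actor, actor)

    return critic_loss.detach(), actor_loss.detach()
```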
