Reinforcement Learning Guide
BulletLab exposes clean state/action interfaces for custom RL implementations without depending on PyTorch, Stable-Baselines3, or Gymnasium.
State and Action Interface
# Get state as a flat numpy array
state = robot.get_state()
# Shape: (13 + 2*N,) where N = num controllable joints
# [x, y, z, qx, qy, qz, qw, vx, vy, vz, wx, wy, wz, j0_pos..., j0_vel...]
# Apply action (target velocities for each controllable joint)
action = my_policy(state) # → numpy array of length N
robot.apply_action(action)
# Apply torques instead
robot.apply_torques(torques) # → numpy array of length N
Manual Q-Learning Example
import numpy as np
from bulletlab import Simulation, Robot
# Discretize state and action spaces yourself
def discretize(state: np.ndarray) -> int:
...
def select_action(q_table, state_idx, epsilon) -> int:
if np.random.random() < epsilon:
return np.random.randint(num_actions)
return np.argmax(q_table[state_idx])
# Training loop
sim = Simulation(mode="direct") # headless for speed
sim.start()
robot = Robot.load("cartpole.urdf", sim=sim)
Q = np.zeros((num_states, num_actions))
epsilon = 0.9
for episode in range(1000):
robot.reset()
state = discretize(robot.get_state())
total_reward = 0
for step in range(500):
action_idx = select_action(Q, state, epsilon)
action = action_space[action_idx]
robot.apply_action(action)
for _ in range(4): # 4 physics steps per action
sim.step()
next_state = discretize(robot.get_state())
reward = compute_reward(robot)
done = check_done(robot)
# Q-update
Q[state, action_idx] += alpha * (
reward + gamma * np.max(Q[next_state]) - Q[state, action_idx]
)
state = next_state
total_reward += reward
if done:
break
epsilon *= 0.99 # decay
sim.stop()
Evolutionary Algorithm Example
import numpy as np
def evaluate(policy_weights: np.ndarray, sim, robot) -> float:
"""Run one episode and return total reward."""
robot.reset()
total = 0.0
for _ in range(500):
state = robot.get_state()
action = np.tanh(policy_weights @ state) # simple linear policy
robot.apply_action(action)
sim.step()
total += compute_reward(robot)
return total
# Evolve
population = [np.random.randn(num_actions, 13 + 2*N) for _ in range(20)]
for generation in range(100):
fitness = [evaluate(w, sim, robot) for w in population]
survivors = [population[i] for i in np.argsort(fitness)[-10:]]
# Mutate survivors
new_pop = []
for w in survivors:
new_pop.append(w + 0.1 * np.random.randn(*w.shape))
population = survivors + new_pop
Tips for RL with BulletLab
- Use
mode="direct"for headless, fast training - Use
sim.reset()between episodes (callrobot.reset()too) - Scale the timestep to speed up training:
sim.timestep = 1/60 - Normalize state before passing to any policy
- Use
robot.speedandrobot.rollas reward signals for locomotion