diff --git a/checkpoints/dqn/3af88528-ca2d-4697-8bf9-136efa356bed/checkpoint.pkl b/checkpoints/dqn/3af88528-ca2d-4697-8bf9-136efa356bed/checkpoint.pkl new file mode 100644 index 0000000..7bd84d0 Binary files /dev/null and b/checkpoints/dqn/3af88528-ca2d-4697-8bf9-136efa356bed/checkpoint.pkl differ diff --git a/checkpoints/dqn/3af88528-ca2d-4697-8bf9-136efa356bed/params b/checkpoints/dqn/3af88528-ca2d-4697-8bf9-136efa356bed/params new file mode 100644 index 0000000..8fa5325 --- /dev/null +++ b/checkpoints/dqn/3af88528-ca2d-4697-8bf9-136efa356bed/params @@ -0,0 +1 @@ +{"max_env_steps": 200, "nn_type": "conv", "alpha": 0.001, "gamma": 0.98, "train_epsilon": 0.9, "test_epsilon": 0.05, "epsilon_decay": 0.999, "replay_buffer_size": 200, "batch_size": 32, "target_update_interval": 50, "numTraining": 2000, "verbose": false, "device": "cuda:4", "state_size": [10, 10], "action_size": 4} \ No newline at end of file diff --git a/checkpoints/dqn/8af2e326-b625-49a3-8a2d-55eb17272310/checkpoint.pkl b/checkpoints/dqn/8af2e326-b625-49a3-8a2d-55eb17272310/checkpoint.pkl new file mode 100644 index 0000000..554726c Binary files /dev/null and b/checkpoints/dqn/8af2e326-b625-49a3-8a2d-55eb17272310/checkpoint.pkl differ diff --git a/checkpoints/dqn/8af2e326-b625-49a3-8a2d-55eb17272310/params b/checkpoints/dqn/8af2e326-b625-49a3-8a2d-55eb17272310/params new file mode 100644 index 0000000..8fa5325 --- /dev/null +++ b/checkpoints/dqn/8af2e326-b625-49a3-8a2d-55eb17272310/params @@ -0,0 +1 @@ +{"max_env_steps": 200, "nn_type": "conv", "alpha": 0.001, "gamma": 0.98, "train_epsilon": 0.9, "test_epsilon": 0.05, "epsilon_decay": 0.999, "replay_buffer_size": 200, "batch_size": 32, "target_update_interval": 50, "numTraining": 2000, "verbose": false, "device": "cuda:4", "state_size": [10, 10], "action_size": 4} \ No newline at end of file diff --git a/cli.py b/cli.py index ff76911..5c286a0 100644 --- a/cli.py +++ b/cli.py @@ -15,6 +15,7 @@ def main(cfg): environment_args = { 'environment_name': cfg.environment.type, 'grid_size': cfg.grid_size, + 'max_steps': cfg.max_steps } drawer_args = { 'grid_size': cfg.grid_size, @@ -42,6 +43,24 @@ def main(cfg): 'max_iterations': cfg.controller.max_iterations, 'model_path': cfg.controller.model_path, }) + elif controller == 'dqn': + controller_args.update({ + 'max_env_steps': cfg.controller.max_env_steps, + 'model_path': cfg.controller.model_path, + 'nn_type': cfg.controller.nn_type, + 'alpha': cfg.controller.alpha, + 'gamma': cfg.controller.gamma, + 'train_epsilon': cfg.controller.train_epsilon, + 'test_epsilon': cfg.controller.test_epsilon, + 'epsilon_decay': cfg.controller.epsilon_decay, + 'replay_buffer_size': cfg.controller.replay_buffer_size, + 'batch_size': cfg.controller.batch_size, + 'target_update_interval': cfg.controller.target_update_interval, + 'numTraining': cfg.controller.numTraining, + 'verbose': cfg.controller.verbose, + 'max_env_steps': cfg.controller.max_env_steps, + 'device': cfg.controller.device + }) else: raise ValueError(f"Unknown controller: {controller}") diff --git a/config/config.yaml b/config/config.yaml index 3cb83bd..54fb46c 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -9,3 +9,4 @@ cell_size: 40 framerate: 2 eval: true num_episodes: 1000 +max_steps: 200 \ No newline at end of file diff --git a/config/controller/dqn.yaml b/config/controller/dqn.yaml new file mode 100644 index 0000000..7e40138 --- /dev/null +++ b/config/controller/dqn.yaml @@ -0,0 +1,21 @@ +type: dqn + +# common training params +alpha: 0.001 +batch_size: 32 +numTraining: 2000 +nn_type: conv # or use conv + +# RL training params +target_update_interval: 50 +replay_buffer_size: 200 +epsilon_decay: 0.999 +gamma: 0.98 +train_epsilon: 0.9 +test_epsilon: 0.05 + +# other params +verbose: False +max_env_steps: 200 +model_path: null +device: cuda:4 diff --git a/config/controller/dqn_ghosts.yaml b/config/controller/dqn_ghosts.yaml new file mode 100644 index 0000000..bd82f04 --- /dev/null +++ b/config/controller/dqn_ghosts.yaml @@ -0,0 +1,21 @@ +type: dqn + +# common training params +alpha: 0.001 +batch_size: 32 +numTraining: 4000 +nn_type: conv # or use conv + +# RL training params +target_update_interval: 50 +replay_buffer_size: 200 +epsilon_decay: 0.9995 +gamma: 0.98 +train_epsilon: 0.5 +test_epsilon: 0.05 + +# other params +verbose: False +max_env_steps: 200 +model_path: null +device: cuda:4 diff --git a/config/controller/qlearn_ghosts.yaml b/config/controller/qlearn_ghosts.yaml new file mode 100644 index 0000000..07ae91d --- /dev/null +++ b/config/controller/qlearn_ghosts.yaml @@ -0,0 +1,9 @@ +type: qlearn +alpha: 0.25 +train_epsilon: 0.9 +test_epsilon: 0.05 +gamma: 0.98 +gamma_eps: 0.99995 +numTraining: 100000 +verbose: False +model_path: null \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index b77f2e5..c511e77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,8 @@ numpy = "^2.2.2" pygame = "^2.6.1" click = "^8.1.7" hydra-core = "^1.3.2" +tqdm = "^4.67.1" +torch = "2.6.0" [build-system] requires = ["poetry-core"] diff --git a/src/controller/__init__.py b/src/controller/__init__.py index 3b93077..e37cf76 100644 --- a/src/controller/__init__.py +++ b/src/controller/__init__.py @@ -1,4 +1,5 @@ from .controller import Controller from .basic import BasicController from .qlearn import QTable, QLearnAgent -from .value_iteration import ValueIterationAgent \ No newline at end of file +from .value_iteration import ValueIterationAgent +from .dqn import DQNAgent diff --git a/src/controller/dqn.py b/src/controller/dqn.py new file mode 100644 index 0000000..ffaacfd --- /dev/null +++ b/src/controller/dqn.py @@ -0,0 +1,464 @@ +import random +from collections import deque +import time + +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +from src.controller import Controller +from src.state import Map, Observation, ActionSpaceEnum +from src.environment import PacmanEnvironment + + +class QNetworkDense(nn.Module): + """ + A neural network for estimating Q-values of state-action pairs. + """ + + def __init__(self, state_size, action_size, hidden_size=128): + """ + Initializes the Q-Network. + + Args: + state_size (int): The size of the state space. + action_size (int): The size of the action space. + hidden_size (int): The number of units in the hidden layer. + """ + super(QNetworkDense, self).__init__() + self.sequantial = nn.Sequential( + nn.Flatten(), + nn.Linear(state_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, action_size) + ) + self.float() + + def forward(self, state: torch.Tensor) -> torch.Tensor: + state = state.float() + x = self.sequantial(state) + + return x + + +class QNetworkConv(nn.Module): + """ + A neural network for estimating Q-values of state-action pairs using convolutional layers. + """ + + def __init__(self, state_size, action_size, hidden_size=128): + """ + Initializes the Q-Network. + + Args: + state_size (int): The size of the state space. + action_size (int): The size of the action space. + hidden_size (int): The number of units in the hidden layer. + """ + super(QNetworkConv, self).__init__() + + # Слой свертки + self.conv_layers = nn.Sequential( + nn.Conv2d(1, 8, kernel_size=3, padding=1), + nn.ReLU(), + # nn.MaxPool2d(kernel_size=2), + + nn.Conv2d(8, 16, kernel_size=3, padding=1), + nn.ReLU(), + # nn.MaxPool2d(kernel_size=2), + + nn.Conv2d(16, 32, kernel_size=3, padding=1), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2) + ) + + # Полносвязные слои + self.fc_layers = nn.Sequential( + nn.Flatten(), + nn.Linear(32 * state_size[0] * state_size[1] // 4, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, action_size) + ) + self.state_size = state_size + + self.float() + + def forward(self, state: torch.Tensor) -> torch.Tensor: + state = state.unsqueeze(1) + state = state.float() + x = self.conv_layers(state) + x = self.fc_layers(x) + + return x + + +class DQNAgent(Controller): + """ + Deep Q-Network agent. + """ + + def __init__(self, state_size, action_size, nn_type='dense', alpha=0.01, + gamma=0.98, train_epsilon=0.9, test_epsilon=0.05, + epsilon_decay=0.99997, replay_buffer_size=1000, + batch_size=64, target_update_interval=1000, + numTraining=100000, verbose=False, max_env_steps=200, + device="cuda"): + """ + Initializes the DQN agent. + + Args: + state_size (int): The size of the state space. + action_size (int): The size of the action space. + nn_type (str): type of the network to use. + alpha (float): Learning rate. + gamma (float): Discount factor. + test_epsilon (float): Exploration rate during testing. + epsilon (float): Exploration rate during training. + epsilon_decay (float): Decay rate for epsilon. + replay_buffer_size (int): Size of the replay buffer. + batch_size (int): Batch size for training. + target_update_interval (int): Interval to update the target network. + numTraining (int): Number of training episodes. + """ + self.nn_type = nn_type + self.state_size = state_size + self.action_size = action_size + self.alpha = alpha + self.gamma = gamma + self.test_epsilon = test_epsilon + self.epsilon = train_epsilon + self.epsilon_decay = epsilon_decay + self.replay_buffer_size = replay_buffer_size + self.batch_size = batch_size + self.target_update_interval = target_update_interval + self.numTraining = numTraining + self.verbose = verbose + self.device = device + self.max_env_steps = max_env_steps + + # Q-Network and Target Network + if self.nn_type == 'dense': + self.q_network = QNetworkDense(state_size, action_size) + self.target_network = QNetworkDense(state_size, action_size) + if self.nn_type == 'conv': + self.q_network = QNetworkConv(state_size, action_size) + self.target_network = QNetworkConv(state_size, action_size) + self.q_network = self.q_network.to(self.device) + self.target_network = self.target_network.to(self.device) + + self.target_network.load_state_dict(self.q_network.state_dict()) # Initialize target network with Q-network weights + + for param in self.target_network.parameters(): + param.requires_grad = False + + self.optimizer = optim.Adam(self.q_network.parameters(), lr=self.alpha) + self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=self.numTraining * self.max_env_steps // self.target_update_interval) + + self.loss = nn.MSELoss() + + # Replay Buffer + self.replay_buffer = deque(maxlen=self.replay_buffer_size) + + # Training variables + self.episodesPassed = 0 + self.lastAction: ActionSpaceEnum | None = None + self.lastState: Map | None = None + self.train_ = True + self.step_count = 0 + + self.writer = SummaryWriter() + hyperparams = { + 'nn_type': self.nn_type, + 'state_size': self.state_size, + 'action_size': self.action_size, + 'alpha': self.alpha, + 'gamma': self.gamma, + 'test_epsilon': self.test_epsilon, + 'epsilon': self.epsilon, + 'epsilon_decay': self.epsilon_decay, + 'replay_buffer_size': self.replay_buffer_size, + 'batch_size': self.batch_size, + 'target_update_interval': self.target_update_interval, + 'numTraining': self.numTraining, + 'verbose': self.verbose, + 'device': self.device, + 'max_env_steps': self.max_env_steps, + } + + hyperparams_text = "\n".join([f"{key}: {value}" for key, value in hyperparams.items()]) + + # Сохранение гиперпараметров в TensorBoard + self.writer.add_text("Hyperparameters", hyperparams_text, global_step=0) + + def getQValue(self, state: torch.Tensor, action: int) -> float: + """ + Retrieves the Q-value for a given state-action pair. + + Args: + state (torch.Tensor): The current state of Pac-Man. + action (int): The action taken. + + Returns: + float: The Q-value associated with the given state-action pair. + """ + with torch.no_grad(): + state = state.to(self.device) + q_values = self.q_network(state[None, ...]) + q_values = q_values.squeeze(0) + return q_values[action].item() + + def best_action(self, state: torch.Tensor) -> ActionSpaceEnum | None: + """ + Determines the best action to take in a given state based on current Q-values. + + Args: + state (torch.Tensor): The current state of Pac-Man's environment. + + Returns: + ActionSpaceEnum | None: The best action to take; returns None if no legal actions are available. + """ + actions = list(Map.directions.keys()) + if ((self.train_ and random.random() < self.epsilon) or + (not self.train_ and random.random() < self.test_epsilon)): + return random.choice(actions) + + with torch.no_grad(): + state = state.to(self.device) + q_values = self.q_network(state[None, ...]) + q_values = q_values.squeeze(0) + best_action_index = torch.argmax(q_values).item() + best_action = actions[best_action_index] + return best_action + + def remember(self, state: torch.Tensor, action: ActionSpaceEnum, reward: float, next_state: torch.Tensor, done: bool): + """ + Adds a transition to the replay buffer. + + Args: + state (Map): The current state. + action (ActionSpaceEnum): The action taken. + reward (float): The reward received. + next_state (Map): The next state. + done (bool): Whether the episode is done. + """ + self.replay_buffer.append((state, action, reward, next_state, done)) + + def learning_step(self): + """ + Samples a minibatch from the replay buffer and performs a learning step. + """ + # full_time_start = time.perf_counter() + # measure_time = 0.0 + if len(self.replay_buffer) < self.batch_size: + return + + # Sample a minibatch from the replay buffer + minibatch = random.sample(self.replay_buffer, self.batch_size) + + # Convert the minibatch to tensors + states, actions, rewards, next_states, dones = zip(*minibatch) + + encodings = {k: i for i, k in enumerate(Map.directions.keys())} + + + state_tensors = torch.stack(states).to(self.device) + action_tensors = torch.tensor([encodings[action] for action in actions], dtype=torch.int64).to(self.device) + reward_tensors = torch.tensor(rewards, dtype=torch.float).to(self.device) + next_state_tensors = torch.stack(next_states).to(self.device) + done_tensors = torch.tensor(dones, dtype=torch.bool).to(self.device) + + # Compute Q(s, a) and Q(s', a') + outputs = self.q_network(state_tensors) + q_values = outputs[torch.arange(outputs.size(0)), action_tensors] + # q_values = self.q_network(state_tensors).gather(1, action_tensors.unsqueeze(1)).squeeze() + next_q_values, _ = torch.max(self.target_network(next_state_tensors), dim=-1) + next_q_values[done_tensors] = 0.0 # Zero out terminal states + + # Compute the expected Q values + expected_q_values = reward_tensors + self.gamma * next_q_values + + # Compute the loss + loss = self.loss(q_values, expected_q_values) + + # Optimize the model + # measure_start_time = time.perf_counter() + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + # measure_end_time = time.perf_counter() + # measure_time += measure_end_time - measure_start_time + + # Update the target network + self.step_count += 1 + if self.step_count % self.target_update_interval == 0: + self.target_network.load_state_dict(self.q_network.state_dict()) + + for param in self.target_network.parameters(): + param.requires_grad = False + + self.scheduler.step() + + # Log the loss and learning rate + current_lr = self.optimizer.param_groups[0]['lr'] + self.writer.add_scalar("Loss", loss.item(), self.step_count) + self.writer.add_scalar("Learning rate", current_lr, self.step_count) + + # full_time_end = time.perf_counter() + # print(full_time_end - full_time_start, measure_time, measure_time / (full_time_end - full_time_start)) + + def run_episode(self, env: PacmanEnvironment) -> int: + """ + Runs a single episode of training in the specified environment. + + Args: + env (PacmanEnvironment): The environment in which Pac-Man operates. + + Returns: score + """ + observation = env.reset() + self.lastAction = None + self.lastState = observation.map.to_tensor() + total_reward = 0 + + while not observation.done: + action = self.best_action(self.lastState) + next_observation = env.step(action) + reward = next_observation.reward + next_state = next_observation.map.to_tensor() + done = next_observation.done + + self.remember(self.lastState, action, reward, next_state, done) + self.learning_step() + + self.lastState = next_state + self.lastAction = action + total_reward += reward + observation = next_observation + + self.episodesPassed += 1 + if self.train_: + self.epsilon = max(self.test_epsilon, self.epsilon * self.epsilon_decay) + return total_reward, observation.score + + def train(self, env: PacmanEnvironment) -> None: + """ + Trains the agent over a specified number of episodes in the given environment. + + Args: + env (PacmanEnvironment): The environment in which Pac-Man operates. + """ + self.episodesPassed = 0 + self.train_ = True + mean_score = 0.0 + mean_reward = 0.0 + + pbar = tqdm(range(self.numTraining), total=self.numTraining) + for i in pbar: + total_reward, score = self.run_episode(env) + mean_score += score + mean_reward += total_reward + if (i + 1) % 20 == 0: + pbar.set_description(f"Last 20 episodes -- Mean score: {mean_score / 20:.0f}, Mean reward: {mean_reward / 20:.0f}, Epsilon: {self.epsilon:.4f}") + self.writer.add_scalar("Mean score", mean_score / 20, self.step_count) + self.writer.add_scalar("Mean reward", mean_reward / 20, self.step_count) + self.writer.add_scalar("Epsilon", self.epsilon, self.step_count) + mean_score = 0 + mean_reward = 0 + + self.lastAction = None + self.lastState = None + + def get_action(self, observation: Observation) -> ActionSpaceEnum | None: + """ + Retrieves the next action to be taken based on the current observation. + + Args: + observation (Observation): The current observation from the environment. + + Returns: + ActionSpaceEnum | None: The selected action; returns None if no valid action is available. + """ + self.train_ = False + return self.best_action(observation.map.to_tensor()) + + def save_model(self, filename: str) -> None: + """ + Saves the Q-network, target network, optimizer state, replay buffer, and other relevant parameters to a file. + + Args: + filename (str): The path to the file where the agent's state will be saved. + """ + torch.save({ + 'q_network_state_dict': self.q_network.state_dict(), + 'target_network_state_dict': self.target_network.state_dict(), + 'optimizer_state_dict': self.optimizer.state_dict(), + 'scheduler_state_dict': self.scheduler.state_dict(), + 'replay_buffer': self.replay_buffer, + 'epsilon': self.epsilon, + 'episodes_passed': self.episodesPassed, + 'step_count': self.step_count, + 'alpha': self.alpha, + 'gamma': self.gamma, + 'test_epsilon': self.test_epsilon, + 'epsilon_decay': self.epsilon_decay, + 'batch_size': self.batch_size, + 'target_update_interval': self.target_update_interval, + 'num_training': self.numTraining, + 'nn_type': self.nn_type + }, filename) + + def load_model(self, filename: str) -> None: + """ + Loads the Q-network, target network, optimizer state, replay buffer, and other relevant parameters from a file. + + Args: + filename (str): The path to the file from which the agent's state will be loaded. + """ + checkpoint = torch.load(filename, map_location=self.device, weights_only=False) + + # Load network type + self.nn_type = checkpoint.get('nn_type', 'dense') + + # Initialize networks based on loaded type + if self.nn_type == 'dense': + self.q_network = QNetworkDense(self.state_size, self.action_size).to(self.device) + self.target_network = QNetworkDense(self.state_size, self.action_size).to(self.device) + elif self.nn_type == 'conv': + self.q_network = QNetworkConv(self.state_size, self.action_size).to(self.device) + self.target_network = QNetworkConv(self.state_size, self.action_size).to(self.device) + else: + raise ValueError(f"Unknown network type: {self.nn_type}") + + # Load state dictionaries + self.q_network.load_state_dict(checkpoint['q_network_state_dict']) + self.target_network.load_state_dict(checkpoint['target_network_state_dict']) + + # Load optimizer state + self.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + # Manually move optimizer's state to the correct device + for state in self.optimizer.state.values(): + for k, v in state.items(): + if isinstance(v, torch.Tensor): + state[k] = v.to(self.device) + + # Load scheduler + self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=self.numTraining * self.max_env_steps // self.target_update_interval) + self.scheduler.load_state_dict(checkpoint['scheduler_state_dict']) + + # Load other parameters + self.replay_buffer = checkpoint['replay_buffer'] + self.epsilon = checkpoint['epsilon'] + self.episodesPassed = checkpoint['episodes_passed'] + self.step_count = checkpoint['step_count'] + self.alpha = checkpoint['alpha'] + self.gamma = checkpoint['gamma'] + self.test_epsilon = checkpoint['test_epsilon'] + self.epsilon_decay = checkpoint['epsilon_decay'] + self.batch_size = checkpoint['batch_size'] + self.target_update_interval = checkpoint['target_update_interval'] + self.numTraining = checkpoint['num_training'] diff --git a/src/environment/basic_ghosts.py b/src/environment/basic_ghosts.py index bd190e1..b33a398 100644 --- a/src/environment/basic_ghosts.py +++ b/src/environment/basic_ghosts.py @@ -175,6 +175,8 @@ def step(self, action: ActionSpaceEnum) -> Observation: new_y = current_position.y + delta[1] new_position = Position(new_x, new_y) + reward1 = self.compute_reward(self.map, current_position, new_position) + # Check if new position is a wall. if new_position in self.map.walls: new_position = current_position # Remain in the same position. @@ -189,8 +191,6 @@ def step(self, action: ActionSpaceEnum) -> Observation: # Collision detection if self.map.pacman_position in self.map.ghost_positions: self.done = True - - reward1 = self.compute_reward(self.map, current_position, new_position) for ghost in self.ghosts: ghost.move(self.map) diff --git a/src/pacman_runner.py b/src/pacman_runner.py index 0232bb2..fa8ba04 100644 --- a/src/pacman_runner.py +++ b/src/pacman_runner.py @@ -2,9 +2,11 @@ import os import json from uuid import uuid4 +import numpy as np + from src.environment import BasicPacmanEnvironment, GhostsPacmanEnvironment from src.drawer import PygameDrawer -from src.controller import BasicController, QLearnAgent, ValueIterationAgent +from src.controller import BasicController, QLearnAgent, ValueIterationAgent, DQNAgent from src.evaluation import evaluate_algorithm @@ -96,7 +98,7 @@ def create_random_controller(): return BasicController() -def create_qlearn_controller(environment, model_path, **params): +def create_qlearn_controller(environment, model_path=None, **params): """ Create and optionally train a Q-learning controller. @@ -119,7 +121,7 @@ def create_qlearn_controller(environment, model_path, **params): return controller -def create_value_iteration_controller(environment, model_path, **params): +def create_value_iteration_controller(environment, model_path=None, **params): """ Create and optionally train a Value Iteration controller. @@ -136,9 +138,40 @@ def create_value_iteration_controller(environment, model_path, **params): print(f"Loading model from {model_path}...") controller.load_model(model_path) else: - print("Training ValueIterationAgent from scratch...") + print("Training Value Iteration Agent from scratch...") controller.train(environment) save_model(controller, params, 'value_iteration') + +def create_dqn_controller(environment, model_path=None, **params): + """ + Creates and trains an instance of a DQN controller. + + Args: + environment: Instance of the environment on which to train the controller. + + Returns: + DQNAgent: Instance of the controller. + """ + observation = environment.reset() + if "nn_type" in params: + if params["nn_type"] == "dense": + state_size = np.prod(observation.map.to_numpy().shape) + elif params["nn_type"] == "conv": + state_size = observation.map.to_numpy().shape + else: + raise AttributeError("Unknown nn_type") + action_size = len(observation.map.directions) + params["state_size"] = state_size + params["action_size"] = action_size + + controller = DQNAgent(**params) + if model_path is not None and os.path.exists(model_path): + print(f"Loading model from {model_path}...") + controller.load_model(model_path) + else: + print("Training DQN from scratch...") + controller.train(environment) + save_model(controller, params, 'dqn') return controller def save_model(controller, params, method): @@ -151,7 +184,7 @@ def save_model(controller, params, method): method (str): The training method ('qlearn' or 'value_iteration'). """ checkpoint_folder = os.path.join('checkpoints', method, str(uuid4())) - os.mkdir(checkpoint_folder) + os.makedirs(checkpoint_folder) model_path = os.path.join(checkpoint_folder, 'checkpoint.pkl') print(f"Saving best model to {model_path}") controller.save_model(model_path) @@ -180,6 +213,8 @@ def create_controller(environment, controller_type, **params): return create_qlearn_controller(environment, **params) elif controller_type == 'value_iteration': return create_value_iteration_controller(environment, **params) + elif controller_type == 'dqn': + return create_dqn_controller(environment, **params) else: raise ValueError(f"Invalid controller type: {controller_type}") @@ -204,6 +239,7 @@ def print_metrics(num_episodes, environment_args, controller_args): """ Evaluate a controller on an environment and print performance metrics. +<<<<<<< HEAD Args: num_episodes (int): The number of episodes to run for evaluation. environment_args (dict): Arguments for creating the game environment. diff --git a/src/state/state.py b/src/state/state.py index be10ff7..0e57d96 100644 --- a/src/state/state.py +++ b/src/state/state.py @@ -2,7 +2,8 @@ from enum import Enum from typing import Set, List, Dict import numpy as np - +from collections import OrderedDict +import torch class ActionSpaceEnum(int, Enum): """ @@ -80,12 +81,12 @@ class Map: ghost_position_to_color: Dict[Position, GhostColorEnum] pacman_position: Position - directions = { - ActionSpaceEnum.UP: (0, -1), - ActionSpaceEnum.RIGHT: (1, 0), - ActionSpaceEnum.DOWN: (0, 1), - ActionSpaceEnum.LEFT: (-1, 0) - } + directions = OrderedDict([ + (ActionSpaceEnum.UP, (0, -1)), + (ActionSpaceEnum.RIGHT, (1, 0)), + (ActionSpaceEnum.DOWN, (0, 1)), + (ActionSpaceEnum.LEFT, (-1, 0)) + ]) def __hash__(self): """ @@ -153,6 +154,11 @@ def to_numpy(self): for ghost in self.ghost_positions: state[ghost.x, ghost.y] = -2 + return state + + def to_tensor(self): + return torch.from_numpy(self.to_numpy()) + class MapFullHash(Map): def __hash__(self):