"""Convert mjx game logs into supervised-learning arrays.

Reads up to ``MAX_FILES`` log files from ``./json/json/``, hands every
non-blank line to ``mjx.State``, replays the decisions recorded in that state
and writes two aligned arrays to the current directory:

* ``obs.npy``     -- stacked "mjx-small-v0" feature arrays, one per decision
* ``actions.npy`` -- int32 action indices, one per decision
"""
import glob

import numpy as np
from mjx import Action, Observation, State

# Upper bound on the number of log files converted in one run.
MAX_FILES = 1000

# glob.glob() returns paths in arbitrary order; sorting makes both the chosen
# subset and the order of the produced samples reproducible between runs.
files = sorted(glob.glob("./json/json/*"))[:MAX_FILES]
obs_hist = []
action_hist = []
count = 0
for file in files:
    print("#" + str(count) + "Loading file....")
    with open(file) as f:
        # Stream the file instead of materialising it with readlines().
        for line in f:
            if not line.strip():
                # Tolerate blank / trailing lines instead of feeding them to State().
                continue
            state = State(line)
            # NOTE(review): relies on private mjx attributes (_cpp_obj,
            # _from_cpp_obj); confirm they still exist when upgrading mjx.
            for cpp_obs, cpp_act in state._cpp_obj.past_decisions():
                obs = Observation._from_cpp_obj(cpp_obs)
                obs_hist.append(obs.to_features(feature_name="mjx-small-v0"))
                action_hist.append(Action._from_cpp_obj(cpp_act).to_idx())
    count += 1

if not obs_hist:
    # np.stack([]) would fail with an opaque "need at least one array" error.
    raise SystemExit("No decisions found under ./json/json/ - nothing to save.")

np.save("obs.npy", np.stack(obs_hist))
np.save("actions.npy", np.array(action_hist, dtype=np.int32))
class MLP(pl.LightningModule):
    """Three-layer perceptron that maps a flattened observation to action logits.

    Args:
        obs_size: Length of the flattened observation vector.
        n_actions: Number of output logits (one per action index).
        hidden_size: Width of the two hidden layers.
    """

    def __init__(self, obs_size=544, n_actions=181, hidden_size=544):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them
        # stable so existing checkpoints still load.
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )
        self.loss_module = nn.CrossEntropyLoss()

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss of one (observations, labels) batch."""
        x, y = batch
        preds = self(x)
        loss = self.loss_module(preds, y)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        """Use Adam with a fixed learning rate of 1e-3."""
        return optim.Adam(self.parameters(), lr=1e-3)

    def forward(self, x):
        """Return raw (unnormalised) action logits for ``x``."""
        return self.net(x.float())


# ---------------------------------------------------------------------------
# Training script: fit the MLP on the arrays loaded from ./obs.npy and
# ./actions.npy, then save its weights to ./model_0.pth.
# ---------------------------------------------------------------------------
inps = np.load("./obs.npy")
tgts = np.load("./actions.npy")

# Flatten every observation to one row. The sample count is taken from the
# data instead of being hard-coded, so any dataset size works.
inps = inps.reshape(len(inps), -1)
if len(inps) != len(tgts):
    raise ValueError(
        f"obs.npy has {len(inps)} samples but actions.npy has {len(tgts)}"
    )

dataset = TensorDataset(torch.Tensor(inps), torch.LongTensor(tgts))
loader = DataLoader(dataset, batch_size=2)

model = MLP()
trainer = pl.Trainer(max_epochs=1)
trainer.fit(model=model, train_dataloaders=loader)
torch.save(model.state_dict(), './model_0.pth')
super().__init__() + + def act(self, obs: mjx.Observation) -> mjx.Action: + legal_actions = obs.legal_actions() + if len(legal_actions) == 1: + return legal_actions[0] + + # 予測 + feature = obs.to_features(feature_name="mjx-small-v0") + with torch.no_grad(): + action_logit = model(Tensor(feature.ravel())) + action_proba = torch.sigmoid(action_logit).numpy() + + # アクション決定 + mask = obs.action_mask() + action_idx = (mask * action_proba).argmax() + return mjx.Action.select_from(action_idx, legal_actions) + diff --git a/python/model_0.pth b/python/model_0.pth new file mode 100644 index 0000000..dea7f03 Binary files /dev/null and b/python/model_0.pth differ diff --git a/python/reinforce.py b/python/reinforce.py new file mode 100644 index 0000000..ce7b1af --- /dev/null +++ b/python/reinforce.py @@ -0,0 +1,236 @@ +# %% + +import random +from typing import Dict, List, Optional + +import gym + +import mjx +from mjx.agents import RandomAgent + +# gym must be 0.25.0+ to use reset(return_info=True) +gym_version = [int(x) for x in gym.__version__.split(".")] +assert ( + gym_version[0] > 0 or gym_version[1] >= 25 +), f"Gym version must be 0.25.0+ to use reset(infos=True): {gym.__version__}" + +# %% + + +class GymEnv(gym.Env): + def __init__( + self, opponent_agents: List[mjx.Agent], reward_type: str, done_type: str, feature_type: str + ) -> None: + super().__init__() + self.opponen_agents = {} + assert len(opponent_agents) == 3 + for i in range(3): + self.opponen_agents[f"player_{i+1}"] = opponent_agents[i] + self.reward_type = reward_type + self.done_type = done_type + self.feature_type = feature_type + + self.target_player = "player_0" + self.mjx_env = mjx.MjxEnv() + self.curr_obs_dict: Dict[str, mjx.Observation] = self.mjx_env.reset() + + def reset( + self, + *, + seed: Optional[int] = None, + return_info: bool = True, + options: Optional[dict] = None, + ): + assert return_info + if self.mjx_env.done("game"): + self.curr_obs_dict = self.mjx_env.reset() + + # skip other players' 
def take_random_action(action_mask) -> int:
    """Return a uniformly random legal action index.

    Args:
        action_mask: Sequence (list, tuple, NumPy array, ...) in which entry
            ``i`` is greater than 0.5 when action ``i`` is legal.

    Returns:
        The index of one randomly chosen legal action.

    Raises:
        IndexError: If the mask marks no action as legal.
    """
    legal_idxs = [i for i, flag in enumerate(action_mask) if flag > 0.5]
    if not legal_idxs:
        # random.choice([]) raises IndexError as well; keep the type but say why.
        raise IndexError("action_mask contains no legal action")
    return random.choice(legal_idxs)
class REINFORCE:
    """Minimal REINFORCE (Monte-Carlo policy-gradient) learner.

    ``act`` samples an action from the masked policy and accumulates its
    log-probability; ``update_gradient`` turns the accumulated sum into one
    optimizer step scaled by the episode return.

    Args:
        model: Policy network mapping a flat float observation to action logits.
        opt: Optimizer that updates ``model``'s parameters.
    """

    def __init__(self, model: nn.Module, opt: optim.Optimizer) -> None:
        self.model = model
        # Running sums for the current episode. They are the plain int 0 until
        # the first act() call adds a tensor to them.
        self.log_probs = 0
        self.entropy = 0
        self.opt: optim.Optimizer = opt

    def act(self, observation, action_mask):
        """Sample a legal action and record its log-probability.

        Args:
            observation: Array-like observation; it is flattened and cast to float.
            action_mask: Array-like with one entry per action; non-zero / True
                marks a legal action.

        Returns:
            The sampled action index as a Python ``int``.

        Raises:
            ValueError: If ``action_mask`` marks no action as legal.
        """
        obs_tensor = torch.as_tensor(observation).flatten().float()
        legal = torch.as_tensor(action_mask).bool()
        if not bool(legal.any()):
            raise ValueError("action_mask contains no legal action")

        logits = self.model(obs_tensor)
        # Out-of-place masking: the model output is left untouched (safe for
        # autograd) and boolean / integer / float masks all work. The most
        # negative finite value gives illegal actions probability 0 without
        # introducing -inf into the distribution.
        masked_logits = logits.masked_fill(~legal, torch.finfo(logits.dtype).min)
        dist = Categorical(logits=masked_logits)
        action = dist.sample()

        self.entropy += dist.entropy()
        self.log_probs += dist.log_prob(action)
        return int(action.item())

    def update_gradient(self, R):
        """Apply one policy-gradient step for the finished episode.

        Args:
            R: Episode return that scales the accumulated log-probabilities.

        Does nothing except reset the accumulators when no action was
        recorded since the previous update.
        """
        if torch.is_tensor(self.log_probs):
            self.opt.zero_grad()
            loss = -R * self.log_probs  # - self.entropy * 0.01
            loss.backward()
            self.opt.step()
        self.log_probs = 0
        self.entropy = 0
MyAgent(CustomAgentBase): + + def __init__(self) -> None: + super().__init__() + + def custom_act(self, obs: mjx.Observation) -> mjx.Action: + legal_actions = obs.legal_actions() + if len(legal_actions) == 1: + return legal_actions[0] + + # 予測 + feature = obs.to_features(feature_name="mjx-small-v0") + with torch.no_grad(): + action_logit = model(Tensor(feature.ravel())) + action_proba = torch.sigmoid(action_logit).numpy() + + # アクション決定 + mask = obs.action_mask() + action_idx = (mask * action_proba).argmax() + return mjx.Action.select_from(action_idx, legal_actions) + +# %% +agent1 = MyAgent() +agent2 = MyAgent() +agent3 = MyAgent() +env = GymEnv( + opponent_agents=[agent1, agent2, agent3], + reward_type="game_tenhou_7dan", + done_type="game", + feature_type="mjx-small-v0", +) + +opt = optim.Adam(model.parameters(), lr=1e-3) +agent0 = REINFORCE(model, opt) +counter = { + 90 : 0, + 45 : 0, + 0 : 0, + -135 : 0, +} +for i in range(100): + obs, info = env.reset() + done = False + R = 0 + while not done: + a = agent0.act(obs, info["action_mask"]) + obs, r, done, info = env.step(a) + R += r + counter[r] += 1 + agent0.update_gradient(R) +print(counter) \ No newline at end of file diff --git a/python/sample_client.py b/python/sample_client.py index 62dd582..d404257 100644 --- a/python/sample_client.py +++ b/python/sample_client.py @@ -16,10 +16,11 @@ def custom_act(self, obs: mjx.Observation) -> mjx.Action: Args: obs (mjx.Observation): 盤面情報と取れる行動(obs.legal_actions()) - Returns: mjx.Action: 実際に取る行動 """ + print(obs) + print(obs.legal_actions()) # ランダムに取れる行動をする return random.choice(obs.legal_actions()) diff --git a/python/sample_trial.py b/python/sample_trial.py index 5fa95d3..0ecf68e 100644 --- a/python/sample_trial.py +++ b/python/sample_trial.py @@ -6,31 +6,71 @@ from datetime import datetime import json import random - +import torch import mjx +from torch import optim, nn, utils, Tensor +import pytorch_lightning as pl import mjx.agents from server import convert_log from 
class MyAgent(CustomAgentBase):
    """Agent that combines a few hard-coded rules with the module-level ``model``.

    Decision order in ``custom_act``:
      1. If only one action is legal, take it.
      2. Otherwise take the first legal action of type TSUMO, RON or RIICHI.
      3. Otherwise take the legal action with the highest model logit.
    """

    # Action types that are taken immediately whenever they are legal.
    _PRIORITY_TYPES = (
        mjx.ActionType.TSUMO,
        mjx.ActionType.RON,
        mjx.ActionType.RIICHI,
    )

    def __init__(self) -> None:
        super().__init__()

    def custom_act(self, obs: mjx.Observation) -> mjx.Action:
        """Choose the action to play for ``obs``.

        Args:
            obs: Current observation, including the legal actions.

        Returns:
            One of ``obs.legal_actions()``.
        """
        legal_actions = obs.legal_actions()
        if len(legal_actions) == 1:
            return legal_actions[0]

        for action in legal_actions:
            if action.type() in self._PRIORITY_TYPES:
                return action

        feature = obs.to_features(feature_name="mjx-small-v0")
        with torch.no_grad():
            action_logit = model(Tensor(feature.ravel()))

        # Rank on the raw logits. sigmoid() saturates to exactly 1.0 (or 0.0) in
        # float32, which makes differently-scored actions tie and can let an
        # illegal index 0 win when every legal probability underflows to zero.
        legal = torch.as_tensor(obs.action_mask()).bool()
        masked_logit = action_logit.masked_fill(~legal, float("-inf"))
        action_idx = int(masked_logit.argmax())
        return mjx.Action.select_from(action_idx, legal_actions)