Add ppo_sp

366e5f3b · sbl1996@126.com · 598465e8 · 366e5f3b · 366e5f3b · 366e5f3b
Commit 366e5f3b authored Feb 25, 2024 by sbl1996@126.com
Showing with 503 additions and 4 deletions

docs/features.md docs/features.md +14 -0

scripts/eval.py scripts/eval.py +7 -3

scripts/ppo.py scripts/ppo.py +1 -1

scripts/ppo_sp.py scripts/ppo_sp.py +480 -0

ygoai/rl/agent.py ygoai/rl/agent.py +1 -0

No files found.
--- a/docs/features.md
+++ b/docs/features.md
@@ -18,6 +18,20 @@
 ## Global
 - lp: 2, max 65535 to 2 bytes
 - oppo_lp: 2, max 65535 to 2 bytes
+- n_my_decks: 1, int
+- n_my_extras:
+- n_my_hands:
+- n_my_graves:
+- n_my_removes:
+- n_my_monsters:
+- n_my_spell_traps:
+- n_op_decks:
+- n_op_extras:
+- n_op_hands:
+- n_op_graves:
+- n_op_removes:
+- n_op_monsters:
+- n_op_spell_traps:
 - turn: 1, int, trunc to 8
 - phase: 1, int, one-hot (10)
 - is_first: 1, int, 0: False, 1: True

--- a/scripts/eval.py
+++ b/scripts/eval.py
@@ -43,6 +43,8 @@ class Args:
    """the maximum number of options"""
    n_history_actions: int = 16
    """the number of history actions to use"""
+    num_embeddings: Optional[int] = None
+    """the number of embeddings of the agent"""
    player: int = -1
    """the player to play as, -1 means random, 0 is the first player, 1 is the second player"""
@@ -138,9 +140,11 @@ if __name__ == "__main__":
    if args.agent:
        # count lines of code_list
-        with open(args.code_list_file, "r") as f:
-            code_list = f.readlines()
-            embedding_shape = len(code_list)
+        embedding_shape = args.num_embeddings
+        if embedding_shape is None:
+            with open(args.code_list_file, "r") as f:
+                code_list = f.readlines()
+                embedding_shape = len(code_list)
        L = args.num_layers
        agent = Agent(args.num_channels, L, L, 1, embedding_shape).to(device)
        agent = agent.eval()

--- a/scripts/ppo.py
+++ b/scripts/ppo.py
@@ -375,7 +375,7 @@ def run(local_rank, world_size):
            delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
            advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
        returns = advantages + values
        _start = time.time()
        # flatten the batch
        b_obs = {

--- a/scripts/ppo_sp.py
+++ b/scripts/ppo_sp.py
--- a/ygoai/rl/agent.py
+++ b/ygoai/rl/agent.py
@@ -105,6 +105,7 @@ class Encoder(nn.Module):
        self.a_option_embed = nn.Embedding(6, c // divisor // 2)
        self.a_number_embed = nn.Embedding(13, c // divisor // 2)
        self.a_place_embed = nn.Embedding(31, c // divisor // 2)
+        # TODO: consider reusing attribute_embed here instead of a separate embedding
        self.a_attrib_embed = nn.Embedding(10, c // divisor // 2)
        self.a_feat_norm = nn.LayerNorm(c, elementwise_affine=affine)