Commit 4c0edbf8 authored by sbl1996@126.com

Add RND

parent d43e5903
@@ -107,15 +107,15 @@ def get_from_action(values, action):
return jnp.sum(distrax.multiply_no_nan(values, value_one_hot), axis=-1)
def mean_legal(values, axis=None):
def mean_legal(values, axis=None, keepdims=False):
# TODO: use real action mask
no_nan_mask = values > -1e12
no_nan = jnp.where(no_nan_mask, values, 0)
count = jnp.sum(no_nan_mask, axis=axis)
return jnp.sum(no_nan, axis=axis) / jnp.maximum(count, 1)
count = jnp.sum(no_nan_mask, axis=axis, keepdims=keepdims)
return jnp.sum(no_nan, axis=axis, keepdims=keepdims) / jnp.maximum(count, 1)
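For reference, a minimal usage sketch of the updated helper (hypothetical values, relying on this module's existing jnp import): entries at or below -1e12 mark illegal actions and are excluded from the mean, and the new keepdims flag mirrors the corresponding jnp.sum argument.
logits = jnp.array([1.0, 3.0, -1e15, 5.0])    # third slot marks an illegal action
mean_legal(logits, axis=-1)                   # (1 + 3 + 5) / 3 = 3.0
mean_legal(logits, axis=-1, keepdims=True)    # same value, but with shape (1,)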
def neurd_loss(actions, logits, new_logits, advantages, logits_threshold):
def neurd_loss_2(actions, logits, new_logits, advantages, logits_threshold):
# Neural Replicator Dynamics
# Differences from the original implementation:
# - all actions vs. sampled actions
@@ -136,6 +136,27 @@ def neurd_loss(actions, logits, new_logits, advantages, logits_threshold):
return pg_loss
def neurd_loss(new_logits, advantages, logits_threshold=2.0, adv_threshold=1000.0):
advs = jax.lax.stop_gradient(advantages)
legal_mask = new_logits > -1e12
legal_logits = jnp.where(legal_mask, new_logits, 0)
count = jnp.sum(legal_mask, axis=-1, keepdims=True)
new_logits_ = new_logits - jnp.sum(legal_logits, axis=-1, keepdims=True) / jnp.maximum(count, 1)
can_increase = new_logits_ < logits_threshold
can_decrease = new_logits_ > -logits_threshold
c = jnp.where(
advs >= 0, can_increase, can_decrease).astype(jnp.float32)
c = jax.lax.stop_gradient(c)
advs = jnp.clip(advs, -adv_threshold, adv_threshold)
# TODO: renormalize with player
pg_loss = -c * new_logits_ * advs
pg_loss = jnp.where(legal_mask, pg_loss, 0)
pg_loss = jnp.sum(pg_loss, axis=-1)
return pg_loss
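A minimal usage sketch of the new all-actions NeuRD variant (hypothetical logits and advantages): gradients are gated per action, so a mean-centered logit that already exceeds +logits_threshold receives no further positive push, and symmetrically on the negative side.
new_logits = jnp.array([[0.5, -0.5, -1e15],     # third action illegal
                        [2.5, -2.5,  0.0]])
advantages = jnp.array([[1.0, -1.0,  0.0],
                        [1.0, -1.0,  0.0]])
loss = neurd_loss(new_logits, advantages, logits_threshold=2.0)   # shape (2,)
# In the second row the centered logit 2.5 already exceeds the threshold,
# so its positive advantage is masked out by the can_increase gate.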
def ach_loss(actions, logits, new_logits, advantages, logits_threshold, clip_coef, dual_clip_coef=None):
# Actor-Critic Hedge loss from Actor-Critic Policy Optimization in a Large-Scale Imperfect-Information Game
# note: an entropy term is required but not included here
@@ -158,7 +179,83 @@ def ach_loss(actions, logits, new_logits, advantages, logits_threshold, clip_coe
return pg_loss
def vtrace_loop(carry, inp, gamma, rho_min, rho_max, c_min, c_max):
def vtrace_rnad_loop(carry, inp, gamma, rho_min, rho_max, c_min, c_max):
v1, v2, next_values1, next_values2, reward1, reward2, xi1, xi2, reward_u1, reward_u2 = carry
ratio, cur_values, r_t, eta_reg_entropy, probs, a_t, eta_log_policy, next_done, main = inp
v1 = jnp.where(next_done, 0, v1)
v2 = jnp.where(next_done, 0, v2)
next_values1 = jnp.where(next_done, 0, next_values1)
next_values2 = jnp.where(next_done, 0, next_values2)
reward1 = jnp.where(next_done, 0, reward1)
reward2 = jnp.where(next_done, 0, reward2)
xi1 = jnp.where(next_done, 1, xi1)
xi2 = jnp.where(next_done, 1, xi2)
reward_u1 = jnp.where(next_done, 0, reward_u1)
reward_u2 = jnp.where(next_done, 0, reward_u2)
discount = gamma * (1.0 - next_done)
next_v = jnp.where(main, v1, v2)
next_values = jnp.where(main, next_values1, next_values2)
reward = jnp.where(main, reward1, reward2)
xi = jnp.where(main, xi1, xi2)
reward_u = jnp.where(main, reward_u1, reward_u2)
reward_u = r_t + discount * reward_u + eta_reg_entropy
discounted_reward = r_t + discount * reward
rho_t = jnp.clip(ratio * xi, rho_min, rho_max)
c_t = jnp.clip(ratio * xi, c_min, c_max)
sig_v = rho_t * (reward_u + discount * next_values - cur_values)
v = cur_values + sig_v + c_t * discount * (next_v - next_values)
q_t = cur_values[:, None] + eta_log_policy
n_actions = eta_log_policy.shape[-1]
q_t2 = discounted_reward + discount * xi * next_v - cur_values
q_t = q_t + q_t2[:, None] * distrax.multiply_no_nan(
1.0 / jnp.maximum(probs, 1e-3), jax.nn.one_hot(a_t, n_actions))
v1 = jnp.where(main, v, discount * v1)
v2 = jnp.where(main, discount * v2, v)
next_values1 = jnp.where(main, cur_values, discount * next_values1)
next_values2 = jnp.where(main, discount * next_values2, cur_values)
reward1 = jnp.where(main, 0, ratio * (discount * reward1 - r_t) - eta_reg_entropy)
reward2 = jnp.where(main, ratio * (discount * reward2 - r_t) - eta_reg_entropy, 0)
xi1 = jnp.where(main, 1, ratio * xi1)
xi2 = jnp.where(main, ratio * xi2, 1)
reward_u1 = jnp.where(main, 0, discount * reward_u1 - r_t - eta_reg_entropy)
reward_u2 = jnp.where(main, discount * reward_u2 - r_t - eta_reg_entropy, 0)
carry = v1, v2, next_values1, next_values2, reward1, reward2, xi1, xi2, reward_u1, reward_u2
return carry, (v, q_t)
def vtrace_rnad(
next_value, ratios, logits, new_logits, actions,
log_policy_reg, values, rewards, next_dones, mains,
gamma, rho_min=0.001, rho_max=1.0, c_min=0.001, c_max=1.0, eta=0.2,
):
probs = jax.nn.softmax(logits)
new_probs = jax.nn.softmax(new_logits)
eta_reg_entropy = -eta * jnp.sum(new_probs * log_policy_reg, axis=-1)
eta_log_policy = -eta * log_policy_reg
next_value1 = next_value
next_value2 = -next_value1
v1 = next_value1
v2 = next_value2
reward1 = reward2 = reward_u1 = reward_u2 = jnp.zeros_like(next_value)
xi1 = xi2 = jnp.ones_like(next_value)
carry = v1, v2, next_value1, next_value2, reward1, reward2, xi1, xi2, reward_u1, reward_u2
_, (targets, q_estimate) = jax.lax.scan(
partial(vtrace_rnad_loop, gamma=gamma, rho_min=rho_min, rho_max=rho_max, c_min=c_min, c_max=c_max),
carry, (ratios, values, rewards, eta_reg_entropy, probs, actions, eta_log_policy, next_dones, mains), reverse=True
)
targets = jax.lax.stop_gradient(targets)
return targets, q_estimate
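A shape-level sketch of driving vtrace_rnad (hypothetical sizes and dummy arrays, time-major layout as implied by the scan over the leading axis; the module's existing jax/jnp imports are assumed):
T, B, A = 8, 4, 5                            # timesteps, parallel envs, actions
logits = jnp.zeros((T, B, A))                # behavior logits
new_logits = jnp.zeros((T, B, A))            # current-policy logits
ratios = jnp.ones((T, B))                    # pi_new(a_t) / pi_behavior(a_t)
actions = jnp.zeros((T, B), jnp.int32)
log_policy_reg = jnp.zeros((T, B, A))        # log pi - log pi_reg (RNAD regularizer)
values = jnp.zeros((T, B))
rewards = jnp.zeros((T, B))
next_dones = jnp.zeros((T, B), jnp.bool_)
mains = jnp.ones((T, B), jnp.bool_)          # which steps belong to the main player
next_value = jnp.zeros((B,))                 # bootstrap value after the last step
targets, q_estimate = vtrace_rnad(
    next_value, ratios, logits, new_logits, actions,
    log_policy_reg, values, rewards, next_dones, mains,
    gamma=1.0, eta=0.2)                      # targets: (T, B), q_estimate: (T, B, A)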
def vtrace_2p0s_loop(carry, inp, gamma, rho_min, rho_max, c_min, c_max):
v1, v2, next_values1, next_values2, reward1, reward2, xi1, xi2, \
last_return1, last_return2, next_q1, next_q2 = carry
ratio, cur_values, next_done, r_t, main = inp
@@ -229,7 +326,7 @@ def vtrace_2p0s(
return1, return2, next_q1, next_q2
_, (targets, q_estimate, return_t) = jax.lax.scan(
partial(vtrace_loop, gamma=gamma, rho_min=rho_min, rho_max=rho_max, c_min=c_min, c_max=c_max),
partial(vtrace_2p0s_loop, gamma=gamma, rho_min=rho_min, rho_max=rho_max, c_min=c_min, c_max=c_max),
carry, (ratios, values, next_dones, rewards, mains), reverse=True
)
advantages = q_estimate - values
@@ -314,6 +411,29 @@ def truncated_gae_2p0s(
return targets, advantages
def truncated_gae_loop(carry, inp, gamma, gae_lambda):
lastgaelam, next_value = carry
cur_value, next_done, reward = inp
nextnonterminal = 1.0 - next_done
delta = reward + gamma * next_value * nextnonterminal - cur_value
lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
carry = lastgaelam, cur_value
return carry, lastgaelam
def truncated_gae(next_value, values, rewards, next_dones, gamma, gae_lambda):
lastgaelam = jnp.zeros_like(next_value)
carry = lastgaelam, next_value
_, advantages = jax.lax.scan(
partial(truncated_gae_loop, gamma=gamma, gae_lambda=gae_lambda),
carry, (values, next_dones, rewards), reverse=True
)
targets = values + advantages
targets = jax.lax.stop_gradient(targets)
return targets, advantages
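A minimal usage sketch of the single-player GAE helper (hypothetical numbers): delta_t = r_t + gamma * (1 - d_{t+1}) * V_{t+1} - V_t is accumulated backwards as A_t = delta_t + gamma * lambda * (1 - d_{t+1}) * A_{t+1}, and the targets are V_t + A_t with the gradient stopped.
values = jnp.zeros((3, 1))                    # V(s_t), time-major (T=3, B=1)
rewards = jnp.ones((3, 1))
next_dones = jnp.zeros((3, 1))
next_value = jnp.zeros((1,))                  # bootstrap value after the last step
targets, advantages = truncated_gae(
    next_value, values, rewards, next_dones, gamma=0.9, gae_lambda=0.95)
# advantages[-1] == 1.0, advantages[-2] == 1.855, advantages[-3] ~= 2.586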
def simple_policy_loss(ratios, logits, new_logits, advantages, kld_max, eps=1e-12):
advs = jax.lax.stop_gradient(advantages)
probs = jax.nn.softmax(logits)
@@ -506,40 +506,39 @@ def rnn_forward_2p(rnn_layer, rstate, f_state, done, switch_or_main, switch=True
@dataclass
class ModelArgs:
class EncoderArgs:
num_layers: int = 2
"""the number of layers for the agent"""
num_channels: int = 128
"""the number of channels for the agent"""
rnn_channels: int = 512
"""the number of channels for the RNN in the agent"""
use_history: bool = True
"""whether to use history actions as input for agent"""
card_mask: bool = False
"""whether to mask the padding card as ignored in the transformer"""
rnn_type: Optional[Literal['lstm', 'gru', 'rwkv', 'none']] = "lstm"
"""the type of RNN to use, None for no RNN"""
film: bool = False
"""whether to use FiLM for the actor"""
noam: bool = False
"""whether to use Noam architecture for the transformer layer"""
rwkv_head_size: int = 32
"""the head size for the RWKV"""
action_feats: bool = True
"""whether to use action features for the global state"""
version: int = 0
"""the version of the environment and the agent"""
@dataclass
class ModelArgs(EncoderArgs):
rnn_channels: int = 512
"""the number of channels for the RNN in the agent"""
rnn_type: Optional[Literal['lstm', 'gru', 'rwkv', 'none']] = "lstm"
"""the type of RNN to use, None for no RNN"""
film: bool = False
"""whether to use FiLM for the actor"""
rwkv_head_size: int = 32
"""the head size for the RWKV"""
class RNNAgent(nn.Module):
num_layers: int = 2
num_channels: int = 128
rnn_channels: int = 512
embedding_shape: Optional[Union[int, Tuple[int, int]]] = None
dtype: jnp.dtype = jnp.float32
param_dtype: jnp.dtype = jnp.float32
switch: bool = True
freeze_id: bool = False
use_history: bool = True
card_mask: bool = False
rnn_type: str = 'lstm'
@@ -549,6 +548,13 @@ class RNNAgent(nn.Module):
action_feats: bool = True
version: int = 0
switch: bool = True
freeze_id: bool = False
int_head: bool = False
embedding_shape: Optional[Union[int, Tuple[int, int]]] = None
dtype: jnp.dtype = jnp.float32
param_dtype: jnp.dtype = jnp.float32
@nn.compact
def __call__(self, x, rstate, done=None, switch_or_main=None):
c = self.num_channels
@@ -618,6 +624,11 @@ class RNNAgent(nn.Module):
logits = actor(f_state_r, f_actions, mask)
value = critic(f_state_r)
if self.int_head:
critic_int = Critic(
channels=[c, c, c], dtype=self.dtype, param_dtype=self.param_dtype)
value_int = critic_int(f_state_r)
value = (value, value_int)
return rstate, logits, value, valid
def init_rnn_state(self, batch_size):
@@ -637,3 +648,61 @@ class RNNAgent(nn.Module):
)
else:
return None
default_rnd_args = EncoderArgs(
num_layers=1,
num_channels=128,
use_history=True,
card_mask=False,
noam=True,
action_feats=True,
version=2,
)
class RNDModel(nn.Module):
is_predictor: bool = False
num_layers: int = 1
num_channels: int = 128
use_history: bool = True
card_mask: bool = False
noam: bool = True
action_feats: bool = True
version: int = 2
out_channels: Optional[int] = None
freeze_id: bool = True
embedding_shape: Optional[Union[int, Tuple[int, int]]] = None
dtype: jnp.dtype = jnp.float32
param_dtype: jnp.dtype = jnp.float32
@nn.compact
def __call__(self, x):
c = self.num_channels
oc = self.out_channels or c * 2
encoder = Encoder(
channels=c,
out_channels=oc,
num_layers=self.num_layers,
embedding_shape=self.embedding_shape,
dtype=self.dtype,
param_dtype=self.param_dtype,
freeze_id=self.freeze_id,
use_history=self.use_history,
card_mask=self.card_mask,
noam=self.noam,
action_feats=self.action_feats,
version=self.version,
)
f_actions, f_state, mask, valid = encoder(x)
c = f_state.shape[-1]
if self.is_predictor:
predictor = MLP([oc, oc], dtype=self.dtype, param_dtype=self.param_dtype)
f_state = predictor(f_state)
else:
f_state = nn.Dense(
oc, dtype=self.dtype, param_dtype=self.param_dtype,
kernel_init=nn.initializers.orthogonal(np.sqrt(2)))(f_state)
return f_state
\ No newline at end of file
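A minimal sketch of how the two RNDModel instances are typically wired for Random Network Distillation (hypothetical training-side code, not part of this hunk; obs, rng, target_params and predictor_params are placeholder names): the target network stays at its random initialization, the predictor is trained to match it, and the prediction error serves as the intrinsic reward. Note that the RNDModel defaults mirror default_rnd_args.
target_net = RNDModel(is_predictor=False)     # frozen random target (Dense head)
predictor_net = RNDModel(is_predictor=True)   # trained predictor (MLP head)
target_params = target_net.init(rng, obs)     # never updated after init
predictor_params = predictor_net.init(rng, obs)

f_target = jax.lax.stop_gradient(target_net.apply(target_params, obs))
f_pred = predictor_net.apply(predictor_params, obs)
rnd_error = jnp.mean(jnp.square(f_pred - f_target), axis=-1)
# rnd_error is both the predictor's training loss (after reduction) and the
# per-state intrinsic reward, usually scaled with RunningMeanStd (see below).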
import jax
import jax.numpy as jnp
from flax import struct
from ygoai.rl.env import RecordEpisodeStatistics
@@ -24,3 +26,50 @@ def categorical_sample(logits, key):
u = jax.random.uniform(subkey, shape=logits.shape)
action = jnp.argmax(logits - jnp.log(-jnp.log(u)), axis=-1)
return action, key
class RunningMeanStd(struct.PyTreeNode):
"""Tracks the mean, variance and count of values."""
mean: jnp.ndarray = struct.field(pytree_node=True)
var: jnp.ndarray = struct.field(pytree_node=True)
count: jnp.ndarray = struct.field(pytree_node=True)
@classmethod
def create(cls, shape=()):
return cls(
mean=jnp.zeros(shape, "float64"),
var=jnp.ones(shape, "float64"),
count=jnp.full(shape, 1e-4, "float64"),
)
def update(self, x):
"""Updates the mean, var and count from a batch of samples."""
batch_mean = jnp.mean(x, axis=0)
batch_var = jnp.var(x, axis=0)
batch_count = x.shape[0]
return self.update_from_moments(batch_mean, batch_var, batch_count)
def update_from_moments(self, batch_mean, batch_var, batch_count):
"""Updates from batch mean, variance and count moments."""
mean, var, count = update_mean_var_count_from_moments(
self.mean, self.var, self.count, batch_mean, batch_var, batch_count
)
return self.replace(mean=mean, var=var, count=count)
def update_mean_var_count_from_moments(
mean, var, count, batch_mean, batch_var, batch_count
):
"""Updates the mean, var and count using the previous mean, var, count and batch values."""
delta = batch_mean - mean
tot_count = count + batch_count
new_mean = mean + delta * batch_count / tot_count
m_a = var * count
m_b = batch_var * batch_count
M2 = m_a + m_b + jnp.square(delta) * count * batch_count / tot_count
new_var = M2 / tot_count
new_count = tot_count
return new_mean, new_var, new_count
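A minimal usage sketch of RunningMeanStd for normalizing intrinsic rewards (hypothetical batch; being a struct.PyTreeNode, update returns a new instance and is safe to carry through jitted code):
rms = RunningMeanStd.create(shape=())
batch_rewards = jnp.array([0.5, 1.5, 2.0, 4.0])
rms = rms.update(batch_rewards)                        # folds in the batch moments
normalized = batch_rewards / jnp.sqrt(rms.var + 1e-8)  # RND-style: divide by running std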