Commit 1c9d3a31 authored by novelailab

ugh

parent 3a2e4799
@@ -10,12 +10,6 @@ except ImportError:
import os
from pathlib import Path
import math
+import lm_arch.gpt_arch as gpt_arch
-#TODO: Might change with non-einsum functions?
-def get_logits(x, embedding):
-    return embedding(x)
-def gelu_new(x):
-    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
@@ -84,7 +78,7 @@ def _attn(query, key, value, causal_mask, masked_bias,
class SelfAttention(nn.Module):
    # Code copied from HF, might want to sanity check later.
-   def __init__(self, hidden_dim, n_head, device="cuda", dtype=torch.float16):
+   def __init__(self, hidden_dim, n_head, device, dtype):
        super(SelfAttention, self).__init__()
        max_positions = 2049
        bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8, requires_grad=False)).view(
@@ -148,7 +142,7 @@ class SelfAttention(nn.Module):
        return x

class FeedForward(nn.Module):
-   def __init__(self, dim=768, hidden_dim=768*4, activation=nn.GELU(), device="cuda", dtype=torch.float16):
+   def __init__(self, dim, hidden_dim, activation, device, dtype):
        super(FeedForward, self).__init__()
        self.ff1 = nn.Linear(dim, hidden_dim, device=device, dtype=dtype)
        self.ff2 = nn.Linear(hidden_dim, dim, device=device, dtype=dtype)
@@ -164,7 +158,7 @@ class FeedForward(nn.Module):
        return x

class GPTJLayer(nn.Module):
-   def __init__(self, attn=SelfAttention, ff=FeedForward, hidden_dim=768, n_head=4, eps=1e-6, activation=nn.GELU(), device="cuda", dtype=torch.float16):
+   def __init__(self, attn, ff, hidden_dim, n_head, eps, activation, device, dtype):
        super(GPTJLayer, self).__init__()
        self.hidden_dim = hidden_dim
        self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
@@ -189,61 +183,45 @@ class GPTJLayer(nn.Module):
        return x

class GPTJModel(gpt_arch.GPTModel):
    def __init__(self, hidden_dim=512, n_layer=12, n_head=4, vocab_dim=50400, eps=1e-4, activation=nn.GELU(), Layer=GPTJLayer, SelfAttention=SelfAttention, FeedForward=FeedForward, device="cuda", dtype=torch.float16):
        super(GPTJModel, self).__init__(hidden_dim=hidden_dim, n_layer=n_layer, n_head=n_head, vocab_dim=vocab_dim, eps=eps, activation=activation, Layer=Layer, SelfAttention=SelfAttention, FeedForward=FeedForward, device=device, dtype=dtype)

def load_gpt_j(path="models/6b", state_dict=None):
    config = {
        "n_layer": 28,
        "n_head": 16,
        "hidden_dim": 4096,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTJLayer
    }
    model = GPTJModel.load(config, path, state_dict)
    return model
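# Hypothetical usage, assuming a split checkpoint lives at models/6b and that
# GPTJModel still exposes a BaseLM-style load classmethod:
#   model = load_gpt_j("models/6b")
#   tokens = torch.randint(0, 50257, (1, 32), device="cuda")
#   logits = model(tokens)  # -> (1, 32, 50400) float32 logits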
def init_6b():
    config = {
        "n_layer": 28,
        "n_head": 16,
        "hidden_dim": 4096,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTJLayer
    }
    model = GPTJModel.init(config)
    return model

def init_125m():
    config = {
        "n_layer": 12,
        "n_head": 12,
        "hidden_dim": 768,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTJLayer
    }
    model = GPTJModel.init(config)
    return model
def init_1_3b():
    config = {
        "n_layer": 24,
        "n_head": 16,
        "hidden_dim": 2048,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTJLayer
    }
    # Use .init like the other constructors; GPTJModel(**config) would skip weight init.
    model = GPTJModel.init(config)
    return model
\ No newline at end of file
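Sanity check on these configs: a GPT-J-style block holds about 12·hidden_dim² weights (4·d² for the q/k/v/out projections plus 8·d² for the 4x MLP), and the embedding and lm_head each add vocab_dim·hidden_dim. For init_6b that gives 28·12·4096² ≈ 5.64B plus 2·50400·4096 ≈ 0.41B, i.e. ≈ 6.05B parameters, matching GPT-J 6B.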
class GPTModel(nn.Module):
    def __init__(self, hidden_dim, n_layer, n_head, vocab_dim, eps, activation, Layer, SelfAttention, FeedForward, device, dtype):
        super(GPTModel, self).__init__()
        self.n_layer = n_layer
        self.hidden_dim = hidden_dim
        self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim, device=device, dtype=dtype)
        self.ln_final = nn.LayerNorm(self.hidden_dim, eps=eps, device=device, dtype=dtype)
        self.layers = nn.ModuleList([])
        self.lm_head = nn.Linear(hidden_dim, vocab_dim, bias=True, device=device, dtype=dtype)
        for _ in range(n_layer):
            self.layers.append(Layer(attn=SelfAttention, ff=FeedForward, hidden_dim=hidden_dim, n_head=n_head, eps=eps, activation=activation, device=device, dtype=dtype))
        #TODO: Decouple more, maybe even init everything here, not sure. Not modular enough yet.
        #TODO: Do we want to pass a config object everywhere? I don't exactly like that, but passing a lot of variables is a bit ugly too.

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        for name, p in module.named_parameters():
            if ("ff2" in name or "out_proj" in name) and "weight" in name:
                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                p.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * self.n_layer)))
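                # With the 6B config (n_layer=28) this is 0.02 / sqrt(56) ≈ 0.0027:
                # the GPT-2-style trick of shrinking the init of residual-branch
                # output projections so the residual stream's variance stays
                # roughly constant as depth grows.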

    def forward(self, x, hypernetwork=None, act_ck=False):
        x = self.get_embeds(x, hypernetwork=hypernetwork, act_ck=act_ck)
        x = self.lm_head(x)
        return x.float()

    def get_embeds(self, x, hypernetwork=None, act_ck=False):
        x = self.vocab_embed(x)
        for layer_id, layer in enumerate(self.layers):
            x = layer(x, layer_id=layer_id, hypernetwork=hypernetwork, act_ck=act_ck)
        x = self.ln_final(x)
        return x
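A minimal smoke test of the refactored class (a sketch with hypothetical small sizes, on CPU/float32 so it runs anywhere, and assuming GPTJLayer/SelfAttention/FeedForward are importable here):

model = GPTModel(hidden_dim=64, n_layer=2, n_head=4, vocab_dim=100, eps=1e-5,
                 activation=nn.GELU(), Layer=GPTJLayer, SelfAttention=SelfAttention,
                 FeedForward=FeedForward, device="cpu", dtype=torch.float32)
logits = model(torch.randint(0, 100, (1, 16)))  # -> (1, 16, 100), float32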
from lm_arch import utils
import math
import torch
from torch import nn
import os

class BaseLM(nn.Module):
    def __init__(self, config=None, lm=None):
        super().__init__()
        self.config = config
        self.lm = lm

    def init_weights(self):
        for module in self.lm.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=0.02)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            for name, p in module.named_parameters():
                if ("ff2" in name or "out_proj" in name) and "weight" in name:
                    # Special Scaled Initialization: there are 2 LayerNorms per transformer block.
                    p.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * self.config.n_layer)))

    @classmethod
    def init(cls, config):
        # config is assumed to behave like a mapping (e.g. a dotmap), so ** unpacking works.
        lm = config.model_class(**config)
        model = cls(config, lm)
        model.init_weights()
        #make this modular later
        return model

    @classmethod
    def no_init(cls, config):
        lm = utils.no_init(lambda: config.model_class(**config))
        model = cls(config, lm)
        return model

    @classmethod
    def load(cls, config, path=None, state_dict=None, strict=False):
        # I am kinda sad that we will not have a load function in the lm object itself.
        # Might be better to add load functions to that as well, but not sure.
        if path:
            state_dict = utils.SplitCheckpoint(path, device="cuda")
        lm = config.model_class(**config)
        model = cls(config, lm)
        model.lm.load_state_dict(state_dict, strict=strict)
        return model

    def save(self, path):
        if self.lm is None:
            print("No LM object to save. Please first init a model.")
            return
        os.makedirs(path, exist_ok=True)
        checkpoint = {}
        for i, (name, tensor) in enumerate(self.lm.state_dict().items()):
            checkpoint[name] = f"{path}/b{i}.pt"
            torch.save(tensor, f"{path}/b{i}.pt")
        torch.save(checkpoint, f"{path}/m.pt")
\ No newline at end of file
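The split checkpoint written by save() is one b{i}.pt shard per tensor plus an m.pt manifest mapping parameter names to shard paths. Reading it back takes a few lines; a minimal sketch, assuming utils.SplitCheckpoint does something equivalent:

def load_split_checkpoint(path, device="cpu"):
    manifest = torch.load(f"{path}/m.pt")  # {param_name: shard_path}
    return {name: torch.load(shard, map_location=device)
            for name, shard in manifest.items()}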
@@ -75,6 +75,11 @@ def get_logits(x, embedding):
def gelu_new(x):
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

+def gelu_jax(x):
+    sqrt_2_over_pi = math.sqrt(2.0 / math.pi)
+    cdf = 0.5 * (1.0 + torch.tanh(sqrt_2_over_pi * (x + 0.044715 * (x ** 3))))
+    return x * cdf
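# Both approximations should agree with PyTorch's built-in tanh GELU; a quick
# sanity check (assumes torch >= 1.12, where F.gelu gained the `approximate` kwarg):
#   import torch.nn.functional as F
#   x = torch.randn(8)
#   assert torch.allclose(gelu_new(x), F.gelu(x, approximate="tanh"), atol=1e-6)
#   assert torch.allclose(gelu_jax(x), F.gelu(x, approximate="tanh"), atol=1e-6)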

def fixed_pos_embedding(dim=None, seq_len=None, x=None):
    if x is None:
        x = torch.empty(0)
......
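The hunk is cut off above. For orientation, GPT-J-style rotary helpers usually compute per-pair sinusoid tables like the sketch below (the standard formulation, not necessarily this repo's exact code):

def fixed_pos_embedding_sketch(dim, seq_len, device="cpu", dtype=torch.float32):
    # One inverse frequency per pair of channels, as in RoPE.
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device, dtype=dtype) / dim))
    sinusoid = torch.einsum("i,j->ij", torch.arange(seq_len, device=device, dtype=dtype), inv_freq)
    return torch.sin(sinusoid), torch.cos(sinusoid)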
# run bash: -b
# run command: default
# kill: -k name
# start pod: -s name
# gpu type: -g --gpu
# gpu amount: -n
# cpu cores: -c
# amount of ram: -r
# image: -i
from novelutils.novelfra import *
from pyfra import *
import argparse
import sys
parser = argparse.ArgumentParser(description='Novelfra utility tool for launching pods and deployments on kubernetes with pyfra.')
parser.add_argument('name', nargs="?", type=str, help='Deployment name')
# Make the default the last one we used.
parser.add_argument('--service', action="store_true", help="""Create a service with the deployment. If a service is not
created, you won't be able to access the pod from outside the kube network.""")
parser.add_argument('-b', '--bash', action="store_true", help='Run bash instead of python3.')
parser.add_argument('-k', '--kill', action="store_true", help='Kill a pod given the name.')
parser.add_argument('-s', '--start', action="store_true", help='Start a pod given the name.')
parser.add_argument('-g', '--gpu', default="RTX_A4000", type=str, help='GPU type to use.')
parser.add_argument('-n', '--amount', type=int, default=1, help='Amount of gpus to use.')
parser.add_argument('-c', '--cpu', type=int, default=4, help='Amount of cpu cores to use.')
parser.add_argument('-r', '--ram', type=int, default=8, help='Amount of ram to use.')
parser.add_argument('-i', '--image', type=str, default='novelai/kube-ssh:13', help='Docker image to use.')
parser.add_argument('command', type=str, nargs="*", default=["bash"], help='Commands to run.')
args = parser.parse_args()
print(args)
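# Example invocations (script and pod names are hypothetical):
#   python novelfra_cli.py mypod -s --service -g RTX_A4000 -n 2 -c 8 -r 16
#   python novelfra_cli.py mypod -k            # tear the deployment down again
#   python novelfra_cli.py mypod -b            # drop into bash in the workspace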
name = args.name
dry = False
bash = args.bash
config_obj = KubeConfig()
config_obj.set_name(name)

if args.start:
    config_obj.set_gpu(gpu_name=args.gpu, amount=args.amount)
    config_obj.set_ram(args.ram)
    config_obj.set_cpu(args.cpu)
    config_obj.dry_run(dry)
    config_obj.print_information()
    config_obj.create_deployment(overwrite=False)
    if args.service:
        config_obj.create_service(overwrite=False)

if args.kill:
    config_obj.kill_deployment()
    config_obj.kill_service()
    sys.exit(0)
remote = config_obj.get_pyfra_remote()
env1 = remote.env('noname', python_version=None)
path = env1.path('/home/xuser/diffusionstorage/workspace/kuru/basedformer')
env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
env1.sh('pip install einops numpy')
env1.sh('pip install tqdm')
env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
env1.sh('pip3 install einops==0.4.1 pyyaml wandb')
env1.sh('wandb login 21a9442d42a35e15ce421f2b702ec58508b9adc4')
env1.sh('pip3 install dotmap')
with always_rerun():
    if args.bash:
        path.sh("bash")
    else:
        if args.command:
            path.sh(" ".join(args.command))
        else:
            print("No command given.")
            sys.exit(0)
\ No newline at end of file
from novelutils.novelfra import *
from pyfra import *
import sys
import argparse
# run bash: -b
# run command: default
# kill
name = 'pyfra-basedformer'
dry = False
......