Commit 1c9d3a31 authored by novelailab

ugh

parent 3a2e4799
@@ -10,12 +10,6 @@ except ImportError:
import os
from pathlib import Path
import math
+import lm_arch.gpt_arch as gpt_arch
-#TODO: Might change with non-einsum functions?
-def get_logits(x, embedding):
-    return embedding(x)
-def gelu_new(x):
-    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
@@ -84,7 +78,7 @@ def _attn(query, key, value, causal_mask, masked_bias,
class SelfAttention(nn.Module):
    # Code copied from HF, might want to sanity check later.
-   def __init__(self, hidden_dim, n_head, device="cuda", dtype=torch.float16):
+   def __init__(self, hidden_dim, n_head, device, dtype):
        super(SelfAttention, self).__init__()
        max_positions = 2049
        bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8, requires_grad=False)).view(
@@ -148,7 +142,7 @@ class SelfAttention(nn.Module):
        return x

class FeedForward(nn.Module):
-   def __init__(self, dim=768, hidden_dim=768*4, activation=nn.GELU(), device="cuda", dtype=torch.float16):
+   def __init__(self, dim, hidden_dim, activation, device, dtype):
        super(FeedForward, self).__init__()
        self.ff1 = nn.Linear(dim, hidden_dim, device=device, dtype=dtype)
        self.ff2 = nn.Linear(hidden_dim, dim, device=device, dtype=dtype)
@@ -164,7 +158,7 @@ class FeedForward(nn.Module):
        return x

class GPTJLayer(nn.Module):
-   def __init__(self, attn=SelfAttention, ff=FeedForward, hidden_dim=768, n_head=4, eps=1e-6, activation=nn.GELU(), device="cuda", dtype=torch.float16):
+   def __init__(self, attn, ff, hidden_dim, n_head, eps, activation, device, dtype):
        super(GPTJLayer, self).__init__()
        self.hidden_dim = hidden_dim
        self.ln_preattn = nn.LayerNorm(hidden_dim, eps=eps, device=device, dtype=dtype)
@@ -189,61 +183,45 @@ class GPTJLayer(nn.Module):
        return x

class GPTJModel(gpt_arch.GPTModel):
    def __init__(self, hidden_dim=512, n_layer=12, n_head=4, vocab_dim=50400, eps=1e-4, activation=nn.GELU(), Layer=GPTJLayer, SelfAttention=SelfAttention, FeedForward=FeedForward, device="cuda", dtype=torch.float16):
        super(GPTJModel, self).__init__(hidden_dim=hidden_dim, n_layer=n_layer, n_head=n_head, vocab_dim=vocab_dim, eps=eps, activation=activation, Layer=Layer, SelfAttention=SelfAttention, FeedForward=FeedForward, device=device, dtype=dtype)

def load_gpt_j(path="models/6b", state_dict=None):
    config = {
        "n_layer": 28,
        "n_head": 16,
        "hidden_dim": 4096,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTJLayer
    }
    model = GPTJModel.load(config, path, state_dict)
    return model
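# Hypothetical usage, assuming a split checkpoint lives at models/6b and that
# GPTJModel still exposes a BaseLM-style load classmethod:
#   model = load_gpt_j("models/6b")
#   tokens = torch.randint(0, 50257, (1, 32), device="cuda")
#   logits = model(tokens)  # -> (1, 32, 50400) float32 logits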
def init_6b():
    config = {
        "n_layer": 28,
        "n_head": 16,
        "hidden_dim": 4096,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTJLayer
    }
    model = GPTJModel.init(config)
    return model

def init_125m():
    config = {
        "n_layer": 12,
        "n_head": 12,
        "hidden_dim": 768,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTJLayer
    }
    model = GPTJModel.init(config)
    return model
def init_1_3b():
    config = {
        "n_layer": 24,
        "n_head": 16,
        "hidden_dim": 2048,
        "vocab_dim": 50400,
        "eps": 1e-5,
        "activation": gelu_new,
        "Layer": GPTJLayer
    }
    # Use .init like the other constructors; GPTJModel(**config) would skip weight init.
    model = GPTJModel.init(config)
    return model
\ No newline at end of file
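Sanity check on these configs: a GPT-J-style block holds about 12·hidden_dim² weights (4·d² for the q/k/v/out projections plus 8·d² for the 4x MLP), and the embedding and lm_head each add vocab_dim·hidden_dim. For init_6b that gives 28·12·4096² ≈ 5.64B plus 2·50400·4096 ≈ 0.41B, i.e. ≈ 6.05B parameters, matching GPT-J 6B.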
class GPTModel(nn.Module):
    def __init__(self, hidden_dim, n_layer, n_head, vocab_dim, eps, activation, Layer, SelfAttention, FeedForward, device, dtype):
        super(GPTModel, self).__init__()
        self.n_layer = n_layer
        self.hidden_dim = hidden_dim
        self.vocab_embed = nn.Embedding(vocab_dim, self.hidden_dim, device=device, dtype=dtype)
        self.ln_final = nn.LayerNorm(self.hidden_dim, eps=eps, device=device, dtype=dtype)
        self.layers = nn.ModuleList([])
        self.lm_head = nn.Linear(hidden_dim, vocab_dim, bias=True, device=device, dtype=dtype)
        for _ in range(n_layer):
            self.layers.append(Layer(attn=SelfAttention, ff=FeedForward, hidden_dim=hidden_dim, n_head=n_head, eps=eps, activation=activation, device=device, dtype=dtype))
        #TODO: Decouple more, maybe even init everything here, not sure. Not modular enough yet.
        #TODO: Do we want to pass a config object everywhere? I don't exactly like that, but passing a lot of variables is a bit ugly too.

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        for name, p in module.named_parameters():
            if ("ff2" in name or "out_proj" in name) and "weight" in name:
                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                p.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * self.n_layer)))
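                # With the 6B config (n_layer=28) this is 0.02 / sqrt(56) ≈ 0.0027:
                # the GPT-2-style trick of shrinking the init of residual-branch
                # output projections so the residual stream's variance stays
                # roughly constant as depth grows.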

    def forward(self, x, hypernetwork=None, act_ck=False):
        x = self.get_embeds(x, hypernetwork=hypernetwork, act_ck=act_ck)
        x = self.lm_head(x)
        return x.float()

    def get_embeds(self, x, hypernetwork=None, act_ck=False):
        x = self.vocab_embed(x)
        for layer_id, layer in enumerate(self.layers):
            x = layer(x, layer_id=layer_id, hypernetwork=hypernetwork, act_ck=act_ck)
        x = self.ln_final(x)
        return x
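A minimal smoke test of the refactored class (a sketch with hypothetical small sizes, on CPU/float32 so it runs anywhere, and assuming GPTJLayer/SelfAttention/FeedForward are importable here):

model = GPTModel(hidden_dim=64, n_layer=2, n_head=4, vocab_dim=100, eps=1e-5,
                 activation=nn.GELU(), Layer=GPTJLayer, SelfAttention=SelfAttention,
                 FeedForward=FeedForward, device="cpu", dtype=torch.float32)
logits = model(torch.randint(0, 100, (1, 16)))  # -> (1, 16, 100), float32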
from lm_arch import utils
import math
import torch
from torch import nn
import os

class BaseLM(nn.Module):
    def __init__(self, config=None, lm=None):
        super().__init__()
        self.config = config
        self.lm = lm

    def init_weights(self):
        for module in self.lm.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=0.02)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            for name, p in module.named_parameters():
                if ("ff2" in name or "out_proj" in name) and "weight" in name:
                    # Special Scaled Initialization: there are 2 LayerNorms per transformer block.
                    p.data.normal_(mean=0.0, std=(0.02 / math.sqrt(2 * self.config.n_layer)))

    @classmethod
    def init(cls, config):
        # config is assumed to behave like a mapping (e.g. a dotmap), so ** unpacking works.
        lm = config.model_class(**config)
        model = cls(config, lm)
        model.init_weights()
        #make this modular later
        return model

    @classmethod
    def no_init(cls, config):
        lm = utils.no_init(lambda: config.model_class(**config))
        model = cls(config, lm)
        return model

    @classmethod
    def load(cls, config, path=None, state_dict=None, strict=False):
        # I am kinda sad that we will not have a load function in the lm object itself.
        # Might be better to add load functions to that as well, but not sure.
        if path:
            state_dict = utils.SplitCheckpoint(path, device="cuda")
        lm = config.model_class(**config)
        model = cls(config, lm)
        model.lm.load_state_dict(state_dict, strict=strict)
        return model

    def save(self, path):
        if self.lm is None:
            print("No LM object to save. Please first init a model.")
            return
        os.makedirs(path, exist_ok=True)
        checkpoint = {}
        for i, (name, tensor) in enumerate(self.lm.state_dict().items()):
            checkpoint[name] = f"{path}/b{i}.pt"
            torch.save(tensor, f"{path}/b{i}.pt")
        torch.save(checkpoint, f"{path}/m.pt")
\ No newline at end of file
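The split checkpoint written by save() is one b{i}.pt shard per tensor plus an m.pt manifest mapping parameter names to shard paths. Reading it back takes a few lines; a minimal sketch, assuming utils.SplitCheckpoint does something equivalent:

def load_split_checkpoint(path, device="cpu"):
    manifest = torch.load(f"{path}/m.pt")  # {param_name: shard_path}
    return {name: torch.load(shard, map_location=device)
            for name, shard in manifest.items()}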
@@ -75,6 +75,11 @@ def get_logits(x, embedding):
def gelu_new(x):
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

+def gelu_jax(x):
+    sqrt_2_over_pi = math.sqrt(2.0 / math.pi)
+    cdf = 0.5 * (1.0 + torch.tanh(sqrt_2_over_pi * (x + 0.044715 * (x ** 3))))
+    return x * cdf
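# Both approximations should agree with PyTorch's built-in tanh GELU; a quick
# sanity check (assumes torch >= 1.12, where F.gelu gained the `approximate` kwarg):
#   import torch.nn.functional as F
#   x = torch.randn(8)
#   assert torch.allclose(gelu_new(x), F.gelu(x, approximate="tanh"), atol=1e-6)
#   assert torch.allclose(gelu_jax(x), F.gelu(x, approximate="tanh"), atol=1e-6)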

def fixed_pos_embedding(dim=None, seq_len=None, x=None):
    if x is None:
        x = torch.empty(0)
......
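The hunk is cut off above. For orientation, GPT-J-style rotary helpers usually compute per-pair sinusoid tables like the sketch below (the standard formulation, not necessarily this repo's exact code):

def fixed_pos_embedding_sketch(dim, seq_len, device="cpu", dtype=torch.float32):
    # One inverse frequency per pair of channels, as in RoPE.
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device, dtype=dtype) / dim))
    sinusoid = torch.einsum("i,j->ij", torch.arange(seq_len, device=device, dtype=dtype), inv_freq)
    return torch.sin(sinusoid), torch.cos(sinusoid)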
# run bash: -b
# run command: default
# kill: -k name
# start pod: -s name
# gpu type: -g --gpu
# gpu amount: -n
# cpu cores: -c
# amount of ram: -r
# image: -i
from novelutils.novelfra import *
from pyfra import *
import argparse
import sys
parser = argparse.ArgumentParser(description='Novelfra utility tool for launching pods and deployments on kubernetes with pyfra.')
parser.add_argument('name', nargs="?", type=str, help='Deployment name')
# Make the default the last one we used.
parser.add_argument('--service', action="store_true", help="""Create a service with the deployment. If a service is not
created, you won't be able to access the pod from outside the kube network.""")
parser.add_argument('-b', '--bash', action="store_true", help='Run bash instead of python3.')
parser.add_argument('-k', '--kill', action="store_true", help='Kill a pod given the name.')
parser.add_argument('-s', '--start', action="store_true", help='Start a pod given the name.')
parser.add_argument('-g', '--gpu', default="RTX_A4000", type=str, help='GPU type to use.')
parser.add_argument('-n', '--amount', type=int, default=1, help='Amount of gpus to use.')
parser.add_argument('-c', '--cpu', type=int, default=4, help='Amount of cpu cores to use.')
parser.add_argument('-r', '--ram', type=int, default=8, help='Amount of ram to use.')
parser.add_argument('-i', '--image', type=str, default='novelai/kube-ssh:13', help='Docker image to use.')
parser.add_argument('command', type=str, nargs="*", default=["bash"], help='Commands to run.')
args = parser.parse_args()
print(args)
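# Example invocations (script and pod names are hypothetical):
#   python novelfra_cli.py mypod -s --service -g RTX_A4000 -n 2 -c 8 -r 16
#   python novelfra_cli.py mypod -k            # tear the deployment down again
#   python novelfra_cli.py mypod -b            # drop into bash in the workspace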
name = args.name
dry = False
bash = args.bash
config_obj = KubeConfig()
config_obj.set_name(name)

if args.start:
    config_obj.set_gpu(gpu_name=args.gpu, amount=args.amount)
    config_obj.set_ram(args.ram)
    config_obj.set_cpu(args.cpu)
    config_obj.dry_run(dry)
    config_obj.print_information()
    config_obj.create_deployment(overwrite=False)
    if args.service:
        config_obj.create_service(overwrite=False)

if args.kill:
    config_obj.kill_deployment()
    config_obj.kill_service()
    sys.exit(0)
remote = config_obj.get_pyfra_remote()
env1 = remote.env('noname', python_version=None)
path = env1.path('/home/xuser/diffusionstorage/workspace/kuru/basedformer')
env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
env1.sh('pip install einops numpy')
env1.sh('pip install tqdm')
env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
env1.sh('pip3 install einops==0.4.1 pyyaml wandb')
env1.sh('wandb login 21a9442d42a35e15ce421f2b702ec58508b9adc4')
env1.sh('pip3 install dotmap')
with always_rerun():
    if args.bash:
        path.sh("bash")
    else:
        if args.command:
            path.sh(" ".join(args.command))
        else:
            print("No command given.")
            sys.exit(0)
\ No newline at end of file
from novelutils.novelfra import *
from pyfra import *
import sys
import argparse
# run bash: -b
# run command: default
# kill
name = 'pyfra-basedformer'
dry = False
......