Commit 4141d527 authored by novelailab

a

parent 34739983
@@ -202,7 +202,7 @@ class GPTJLayer(nn.Module):
         return x
 
 class GPTJModel(nn.Module):
-    def __init__(self, hidden_dim, n_layer, n_head, vocab_dim, eps, activation=gelu_new, Layer=GPTJLayer, device="cuda", dtype=torch.float16):
+    def __init__(self, hidden_dim, n_layer, n_head, vocab_dim, eps, activation=gelu_new, Layer=GPTJLayer, device="cuda", dtype=torch.float16, **kwargs):
         nn.Module.__init__(self)
         self.n_layer = n_layer
         self.hidden_dim = hidden_dim
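The only change above is the new **kwargs catch-all on GPTJModel.__init__. The hunks below add a "model_class" entry to the config dict and then splat the whole dict into the constructor, so any key that is not an explicit parameter needs somewhere to land. A minimal, self-contained illustration of that pattern (the Model class here is a stand-in, not the real GPTJModel):

class Model:
    def __init__(self, hidden_dim, n_layer, **kwargs):
        # **kwargs silently absorbs config keys that are not constructor
        # parameters, such as "model_class" below.
        self.hidden_dim = hidden_dim
        self.n_layer = n_layer

config = {"model_class": Model, "hidden_dim": 4096, "n_layer": 28}
model = config["model_class"](**config)  # would raise TypeError without **kwargs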
@@ -4,6 +4,7 @@ import torch
 from torch import nn
 from basedformer import gptj
 import os
+import json
 
 #Having common BaseLM functionality in this class instead of the torch LM itself makes sense.
 class BaseLM(nn.Module):
@@ -32,7 +33,7 @@ class BaseLM(nn.Module):
 
     @classmethod
     def init(cls, config):
-        lm = config.model_class(**config)
+        lm = config["model_class"](**config)
         model = cls(config, lm)
         model.init_weights()
         #make this modular later
@@ -46,13 +47,13 @@ class BaseLM(nn.Module):
         return model
 
     @classmethod
-    def load(cls, model_class, config, path=None, state_dict=None, strict=False):
+    def load(cls, config, path=None, state_dict=None, strict=False):
         # I am kinda sad that we will not have a load function in lm object itself.
-        # might be better to add load functions to that as well but not sure.
+        # might be better to add load functions -- actually nope.
         if path:
             state_dict = utils.SplitCheckpoint(path, device="cuda")
 
-        lm = model_class(**config)
+        lm = config["model_class"](**config)
         model = cls(config, lm)
         model.lm.load_state_dict(state_dict, strict=strict)
         return model
@@ -73,11 +74,12 @@ class BaseLM(nn.Module):
 
 def load_gpt_j(path="models/6b", state_dict=None):
     config = {
+        "model_class": gptj.GPTJModel,
         "n_layer": 28,
         "n_head": 16,
         "hidden_dim": 4096,
         "vocab_dim": 50400,
         "eps": 1e-5
     }
-    model = BaseLM.load(gptj.GPTJModel, config, path, state_dict)
+    model = BaseLM.load(config, path, state_dict)
     return model
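With model_class folded into the config dict, callers no longer pass the class separately to BaseLM.load. A hedged usage sketch of the refactored API; the import path for load_gpt_j is an assumption, since the file name is not visible in this view:

from basedformer import lm  # assumed module path; not shown in the diff

model = lm.load_gpt_j(path="models/6b")  # builds the config above, then BaseLM.load(config, path)
gptj_core = model.lm                     # the wrapped GPTJModel, per the model.lm.load_state_dict call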
@@ -10,6 +10,7 @@ def lr_schedule(step, warmup_steps, anneal_steps, lr, end_lr):
     anneal_percent = np.clip(step - warmup_steps, 0, anneal_steps) / anneal_steps
     #cosine schedule for annealing
     return lr * warmup_percent - (lr - end_lr) * (1 - np.cos(np.pi * anneal_percent)) / 2
+#kinda broken. doesn't start from 0
 
 
 class BasedOptimizer:
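For context, a self-contained sketch of the schedule the last hunk touches. Only the anneal and return lines are visible, so the warmup_percent definition below is an assumption (a plain linear ramp). If the training loop counts steps from 1, the first learning rate is lr / warmup_steps rather than 0, which is one reading of the new "doesn't start from 0" comment.

import numpy as np

def lr_schedule(step, warmup_steps, anneal_steps, lr, end_lr):
    # Assumed linear warmup; only the three lines below it appear in the diff.
    warmup_percent = np.clip(step, 0, warmup_steps) / warmup_steps
    anneal_percent = np.clip(step - warmup_steps, 0, anneal_steps) / anneal_steps
    #cosine schedule for annealing
    return lr * warmup_percent - (lr - end_lr) * (1 - np.cos(np.pi * anneal_percent)) / 2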