Commit 40e90836 authored by novelailab

fix fairseq by taking out eot and newline mapping

parent aa6444e9
@@ -171,7 +171,7 @@ class GPTNeoModel(base_lm.BaseModel):
         base_lm.BaseModel.__init__(self, user_config, **kwargs)
         self.pos_embed = nn.Embedding(self.config.n_tokens, self.config.hidden_dim)
         self.lm_head = nn.Linear(self.config.hidden_dim, self.config.vocab_dim, bias=False)
-        #bias=False for fairseq models
+        #bias=False for neo models
     def get_embeds(self, x, hypernetwork=None, act_ck=False, kv=None, cache=False):
         if kv is None:
...
This diff is collapsed.
@@ -14,7 +14,7 @@ bash = False
 config_obj = KubeConfig()
 config_obj.set_name(name)
 config_obj.set_gpu(gpu_name=GPU.RTX_A6000, amount=1)
-config_obj.set_ram(16)
+config_obj.set_ram(64)
 config_obj.set_cpu(4)
 config_obj.dry_run(dry)
 config_obj.print_information()
@@ -31,18 +31,23 @@ if False:
     env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
     env1.sh('pip install einops numpy')
     env1.sh('pip install tqdm')
-    env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
+    #env1.sh('pip install /home/xuser/diffusionstorage/workspace/finetune/pokepls/transformers-repo')
     env1.sh('pip3 install einops==0.4.1 pyyaml wandb')
     env1.sh('wandb login 21a9442d42a35e15ce421f2b702ec58508b9adc4')
     env1.sh('pip3 install dotmap icecream')
     path.sh("pip3 install --editable .")
+    #path.sh("pip3 uninstall torch")
+    #path.sh("pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113")

 with always_rerun():
     if True:
+        #env1.sh('pip3 uninstall transformers')
         #env1.sh('pip3 install transformers')
-        path.sh("python3 ../lm-evaluation-harness/main.py --model basedformer --batch_size 8 --model_args pretrained=/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/gpt-neo-125m-ported --device 0 --tasks lambada --no_cache")
+        #path.sh('pip3 install --editable ../lm-evaluation-harness/.')
+        #env1.sh('pip3 install pytest')
+        #env1.sh('pip install /home/xuser/hugessd/pytorch/torch-1.10.1+cu113-cp38-cp38-linux_x86_64.whl')
+        path.sh('pip3 uninstall huggingface_hub')
+        path.sh('pip3 install huggingface_hub')
+        #path.sh('pip3 uninstall transformers')
+        #path.sh('pip3 install transformers')
+        #path.sh("python3 ../lm-evaluation-harness/main.py --model gpt2 --batch_size 8 --model_args pretrained=EleutherAI/gpt-neo-125M --device 0 --tasks lambada --no_cache")
+        path.sh("python3 ../lm-evaluation-harness/main.py --model basedformer --batch_size 8 --model_args pretrained=/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/fairseq_125m --device 0 --tasks lambada --no_cache")
         #path.sh("python3 ../lm-evaluation-harness/main.py --batch_size 8")
     else:
...
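Editor's note: the run step above shells out to lm-evaluation-harness through path.sh. As a side note, the same invocation can be scripted directly; the following is a minimal sketch (the arguments are copied verbatim from the command above, the subprocess wrapper itself is illustrative):

# Minimal sketch: run the lambada eval from Python rather than a shell string.
import subprocess

cmd = [
    "python3", "../lm-evaluation-harness/main.py",
    "--model", "basedformer",
    "--batch_size", "8",
    "--model_args",
    "pretrained=/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/fairseq_125m",
    "--device", "0",
    "--tasks", "lambada",
    "--no_cache",
]
subprocess.run(cmd, check=True)  # check=True raises if the harness exits nonzero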
@@ -11,67 +11,15 @@ from contextlib import contextmanager
 import torch.nn.functional as F
 from transformers import GPTNeoForCausalLM
 from icecream import ic
-#replicating timeit magic function of ipython
-def timeit(func, r=1, n=5, quiet=False, function=None, do_tqdm=False, first=True):
-    precision = 'ns'
-    r_arr = np.empty([2, r]) # [0] = mean, [1] = std
-    if function:
-        func.__name__ = function.__name__
-    for i in tqdm(range(r)) if do_tqdm else range(r):
-        n_arr = np.empty(n)
-        for k in range(n):
-            start = perf_counter_ns()
-            func()
-            n_arr[k] = perf_counter_ns() - start
-        if not first:
-            # delete the first element from n_arr numpy array
-            n_arr = np.delete(n_arr, 0)
-        r_arr[0, i] = np.mean(n_arr)
-        r_arr[1, i] = np.std(n_arr)
-    best = r_arr[:, np.argmin(r_arr[0])] # [0] = mean, [1] = std
-    #check if best[0] bigger than 1ms in numpy
-    if best[0] < 1e3:
-        precision = 'ns'
-    elif best[0] >= 1e9:
-        print('b')
-        best[0] = best[0] * 1e-9
-        best[1] = best[1] * 1e-9
-        precision = 's'
-    elif best[0] >= 1e6:
-        best[0] = best[0] * 1e-6
-        best[1] = best[1] * 1e-6
-        precision = 'ms'
-    elif best[0] >= 1e3:
-        precision = 'μs'
-        best[0] = best[0] * 1e-3
-        best[1] = best[1] * 1e-3
-    if not quiet:
-        if precision == 'ns':
-            print(f"{func.__name__}: {best[0]:.0f}{precision} ± {best[1]:.0f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
-        if precision == 'μs':
-            print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
-        elif precision == 'ms':
-            print(f"{func.__name__}: {best[0]:.2f}{precision} ± {best[1]:.2f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")
-        elif precision == 's':
-            print(f"{func.__name__}: {best[0]:.4f}{precision} ± {best[1]:.4f}{precision} per loop (mean ± std. dev. of {str(r)} runs, {str(n)} loops each)")

 with torch.no_grad():
-    model_dir = '/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/hf_125m/'
+    model_dir = '/home/xuser/diffusionstorage/models/fairseq/converted/en_dense_lm_125m/'
     hf_model = no_init(lambda: GPTNeoForCausalLM.from_pretrained(model_dir)).cuda().half().eval()
     print("Loaded hf model")
     path = "/home/xuser/diffusionstorage/workspace/kuru/basedformer/pretrained/fairseq_125m"
     based_model = lmu.load_from_path(path).cuda().half().eval()
     print("Loaded based model")
-    x = torch.randint(0, 50256, (1, 2048)).cuda().long()
+    x = torch.randint(0, 51200, (1, 300)).cuda().long()
     assert torch.allclose(hf_model.transformer.wte(x), based_model.vocab_embed(x))
     hidden = hf_model.transformer.wte(x)
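Editor's note: the block deleted above was a hand-rolled replica of IPython's %timeit magic. For reference, a hypothetical call (assuming the helper and its numpy/tqdm/perf_counter_ns imports are present, and that the model's forward accepts the token batch directly) would look like:

# Hypothetical usage of the removed timeit() helper: best of 5 runs of
# 10 forward passes each, printing mean ± std of the fastest run.
timeit(lambda: based_model(x), r=5, n=10)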
@@ -85,7 +33,7 @@ with torch.no_grad():
         ic(hf_model.transformer.h[layer].attn(hidden)[0].abs().mean())
         ic(based_model.layers[layer].attn(hidden)[0].abs().mean())
         ic((hf_model.transformer.h[layer].attn(hidden)[0] - based_model.layers[layer].attn(hidden)[0]).abs().mean())
-        #assert torch.allclose(hf_model.transformer.h[layer].attn(hidden)[0], based_model.layers[layer].attn(hidden)[0], rtol=1e-6)
+        assert torch.allclose(hf_model.transformer.h[layer].attn(hidden)[0], based_model.layers[layer].attn(hidden)[0], rtol=1e-6)
         attn_out = hf_model.transformer.h[layer].attn(hidden)[0]
         hidden = residual + attn_out
         residual = hidden
...
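Editor's note: the change above promotes the logged per-layer diff into a hard assert. A generalized sketch of the same check over every layer pair (hypothetical; it assumes the attribute names used in this script, runs under the same torch.no_grad() context, and probes each layer in isolation with one shared hidden state rather than propagating residuals):

# Sketch: compare attention outputs of matching layers in both models.
for layer in range(len(based_model.layers)):
    hf_out = hf_model.transformer.h[layer].attn(hidden)[0]
    based_out = based_model.layers[layer].attn(hidden)[0]
    print(f"layer {layer}: max abs diff = {(hf_out - based_out).abs().max().item():.3e}")
    assert torch.allclose(hf_out, based_out, rtol=1e-6), f"layer {layer} diverged"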
@@ -110,12 +110,12 @@ with torch.no_grad():
     wte = fairdict["decoder.embed_tokens.weight"].clone()
     for i in range(50260):
         wte[mapping[i]] = fairdict["decoder.embed_tokens.weight"][i]
-    hack_embs(wte)
+    #hack_embs(wte)
     save(wte.half(), "vocab_embed.weight")
     lm_head = fairdict["decoder.output_projection.weight"].clone()
     for i in range(50260):
         lm_head[mapping[i]] = fairdict["decoder.output_projection.weight"][i]
-    hack_embs(lm_head)
+    #hack_embs(lm_head)
     save(lm_head.half(), "lm_head.weight")
     save(torch.FloatTensor(1), "pos_embed._float_tensor")
...
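Editor's note: the two loops above reorder embedding and output-projection rows through mapping, the fairseq-to-GPT-2 token-id table built earlier in this script (not shown in this hunk). A vectorized sketch of the same permutation, assuming mapping is a length-50260 sequence of target ids with no duplicate entries:

# Sketch: vectorized equivalent of the row-remapping loops above.
import torch

idx = torch.as_tensor(mapping, dtype=torch.long)  # length 50260
wte = fairdict["decoder.embed_tokens.weight"].clone()
wte[idx] = fairdict["decoder.embed_tokens.weight"][:50260]
lm_head = fairdict["decoder.output_projection.weight"].clone()
lm_head[idx] = fairdict["decoder.output_projection.weight"][:50260]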