Add epoch support, and mask `<|endoftext|>` tokens out of the training loss

55a00bbf · Wes Brown · eebb1fa8 · 55a00bbf
Commit 55a00bbf authored Jul 17, 2022 by Wes Brown
Show whitespace changes
Inline Side-by-side

Showing with 77 additions and 65 deletions

hypertrain.py hypertrain.py +77 -65

No files found.
--- a/hypertrain.py
+++ b/hypertrain.py
@@ -28,7 +28,8 @@ prompts = ["<|endoftext|>",
           "The mercurial and beautiful",
           "<|endoftext|>[ Author:",
           "<|endoftext|>[ Genre:",
-           "***"]
+           "***",
+           "----"]
 def _init_weights(module):
@@ -285,6 +286,7 @@ parser.add_argument("--logs", type=str, help="log directory location",
 parser.add_argument("--masked", type=bool, help="masked softmax fusion")
 parser.add_argument("--sample_vanilla", type=bool, help="sample vanilla model")
 parser.add_argument("--shuffle", type=bool, help="shuffle dataset contexts")
+parser.add_argument("--epochs", type=int, help="number of epochs to train for")
 parser.set_defaults(loss_scale=False, amp=False, no_resume=False, masked=False,
                    sample_vanilla=False, shuffle=False)
 args = parser.parse_args()
@@ -312,6 +314,7 @@ train_config = {
    "context_size": args.context_size,
    "sample_vanilla": args.sample_vanilla,
    "shuffle": args.shuffle,
+    "epochs": args.epochs,
 }
 torch.manual_seed(train_config["seed"])
 bs = train_config["bs"]
@@ -368,9 +371,12 @@ if last_cp:
 else:
    curr_step = 0
-t = tqdm(train_loader, initial=curr_step)
+epoch_steps = len(train_loader)
+total_steps = epoch_steps * train_config['epochs']
-for input_ids, labels in t:
+with tqdm(total=total_steps, initial=curr_step) as t:
+    for epoch in range(train_config['epochs']):
+        for input_ids, labels in train_loader:
            timex = time.perf_counter()
            input_ids = input_ids.to(gpu)
            labels = labels.to(gpu)
@@ -384,6 +390,7 @@ for input_ids, labels in t:
                    logits = logits.view(-1, logits.shape[-1])
                    gas_labels = labels[x * bs:(x + 1) * bs, :].contiguous()
                    gas_labels = gas_labels.view(-1)
+                    gas_labels[gas_labels == 50256] = -100
                    gas_loss = F.cross_entropy(logits, gas_labels)
                if train_config["loss_scale"]:
@@ -408,11 +415,15 @@ for input_ids, labels in t:
            opt.zero_grad()
            sec_per_step = (time.perf_counter() - timex)
            step_per_sec = (1. / sec_per_step)
-    tokens_per_sec = (step_per_sec * train_config["context_size"]) * bs * gas
+            tokens_per_sec = (step_per_sec * train_config["context_size"]) * \
-    t.set_description(f"{step_per_sec:.2f} steps/s, {sec_per_step:.2f}s/step,"
+                             bs * gas
-                      + f"{tokens_per_sec:.2f}tokens/s, loss={loss:.4f}")
+            t.set_description(f"{step_per_sec:.2f} steps/s, "
+                              f"{sec_per_step:.2f}s/step, "
+                              f"{tokens_per_sec:.2f}tokens/s, "
+                              f"loss={loss:.4f}")
            wandb.log(
                {
+                    "train/epoch": float(curr_step) / float(epoch_steps),
                    "train/loss": loss,
                    "train/tokens_per_sec": tokens_per_sec,
                    "train/sec_per_step": sec_per_step,
@@ -432,6 +443,7 @@ for input_ids, labels in t:
                eval_fn(curr_step)
            curr_step += 1
+            t.update(1)
 eval_fn(curr_step)
 hypernetwork_saver("final")