Commit 3b759040 authored by Wes Brown

First pass at argo workflow.

parent 0bde514b
FROM gooseai/torch-base:6cfdc11
RUN apt-get install -y cuda-nvcc-11-3 cuda-nvml-dev-11-3 libcurand-dev-11-3 \
        libcublas-dev-11-3 libcusparse-dev-11-3 \
        libcusolver-dev-11-3 cuda-nvprof-11-3 \
        ninja-build && \
    apt-get clean
RUN mkdir -p /app
WORKDIR /app
# Requirements
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
# Then our source
COPY . .
CMD [ "/usr/bin/python3", "hypertrain.py" ]
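# A rough sketch of how this image might be built and published so the workflow
# below can pull it; the registry, name, and tag here are assumptions and should
# match the hypertrainer_image and hypertrainer_tag workflow parameters:
#   docker build -t docker.io/gooseai/basedformer:0bde514 .
#   docker push docker.io/gooseai/basedformer:0bde514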
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: "hypertrain-"
spec:
  entrypoint: main
  arguments:
    parameters:
      # run_name should be unique on a per-run basis, especially if reporting
      # to wandb or sharing PVCs between runs.
      - name: run_name
      - name: pvc
        value: 'finetune-data'
      # Training parameters. Model IDs are Hugging Face IDs to pull down, or
      # a path to your model relative to the PVC root.
      - name: model
        value: 'sigurdv4'
      # The directory to read your dataset from, and tokenize for training.
      - name: dataset
        value: 'dataset'
      # Whether to retokenize the data on each run. It is strongly recommended
      # NOT to enable this if you are making multiple runs on the same dataset.
      - name: retokenize
        value: 'false'
      # EOT and pad tokens. It is suggested not to change these values; usual
      # values are `<|padding|>` or `<|endoftext|>`.
      - name: eot_token
        value: ''
      - name: pad_token
        value: ''
      # The token that marks a boundary. If the end of a `context` is reached
      # and the rest of the range cannot fit in the remainder of that context,
      # the range is allowed to fill out the current `context`, and the entire
      # range is then used to start the next `context`.
      - name: boundary_token
        value: '\n'
      # The context size to train at. Most models have a default maximum of
      # `2048`, but this can be adjusted downwards. This is most useful when
      # the text you're training on is divided along logical boundaries and is
      # split using `boundary_token`.
      - name: context
        value: '2049'
      # Model training parameters
      # Batch size.
      - name: batch_size
        value: 4
      - name: gas
        value: 1
      - name: random_seed
        value: 42
      # Learning rate; increase or decrease as needed.
      - name: learn_rate
        value: '2e-4'
      - name: learn_rate_end
        value: '2e-4'
      # Number of steps to run warmup for.
      - name: warmup
        value: 10
      # Number of epochs, i.e. the number of times training passes over your
      # dataset.
      - name: epochs
        value: 1
      # The optimizer to use.
      - name: optimizer
        value: 'adamw'
      - name: save_steps
        value: 500
      - name: eval_every
        value: 500
      # Whether to *not* resume from checkpoints.
      - name: no_resume
        value: 'false'
      # Where we store our training logs on the PVC.
      - name: logs
        value: 'logs'
      # Project reporting/ID.
      - name: wandb_key
        value: ''
      - name: project_id
        value: 'hypernetwork-training'
      # CoreWeave region to default to; ORD1 has most of the GPUs.
      - name: region
        value: 'ORD1'
      # Training GPU: A40, 48GB VRAM.
      - name: trainer_gpu
        value: 'A40'
      # Container images; generally, don't alter these.
      - name: downloader_image
        value: 'ghcr.io/wbrown/gpt_bpe/model_downloader'
      - name: downloader_tag
        value: 'cfbacfe'
      - name: tokenizer_image
        value: 'ghcr.io/wbrown/gpt_bpe/dataset_tokenizer'
      - name: tokenizer_tag
        value: 'cfbacfe'
      - name: hypertrainer_image
        value: 'docker.io/gooseai/basedformer'
      - name: hypertrainer_tag
        value: '0bde514'
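  # With the defaults above (pvc=finetune-data, dataset=dataset, model=sigurdv4,
  # context=2049, tokenizer_tag=cfbacfe), the tokenizer step below writes
  # /finetune-data/dataset-sigurdv4-2049-cfbacfe.tokens and the trainer step
  # reads that same file; the sprig.replace calls only substitute '_' for '/',
  # '.', and '-' in the model name so it is safe to embed in the filename.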
  templates:
    - name: main
      steps:
        - - name: downloader
            template: model-downloader
            arguments:
              parameters:
                - name: model
                  value: "{{workflow.parameters.model}}"
                - name: dest
                  value: "/{{workflow.parameters.pvc}}/models/{{workflow.parameters.model}}"
        - - name: tokenizer
            template: model-tokenizer
            arguments:
              parameters:
                - name: input
                  value: "/{{workflow.parameters.pvc}}/{{workflow.parameters.dataset}}"
                - name: output
                  value: "/{{workflow.parameters.pvc}}/{{workflow.parameters.dataset}}-{{=sprig.replace('/', '_', sprig.replace('.','_', sprig.replace('-','_', workflow.parameters.model)))}}-{{workflow.parameters.context}}-{{workflow.parameters.tokenizer_tag}}.tokens"
                - name: model
                  value: "/{{workflow.parameters.pvc}}/models/{{workflow.parameters.model}}"
                - name: context
                  value: "{{workflow.parameters.context}}"
                - name: eot
                  value: "{{workflow.parameters.eot_token}}"
                - name: pad
                  value: "{{workflow.parameters.pad_token}}"
                - name: boundary
                  value: "{{workflow.parameters.boundary_token}}"
        - - name: hypertrainer
            template: model-hypertrainer
            arguments:
              parameters:
                - name: run_name
                  value: "{{workflow.parameters.run_name}}"
                - name: model
                  value: "/{{workflow.parameters.pvc}}/models/{{workflow.parameters.model}}"
                - name: dataset
                  value: "/{{workflow.parameters.pvc}}/{{workflow.parameters.dataset}}-{{=sprig.replace('/', '_', sprig.replace('.','_', sprig.replace('-','_', workflow.parameters.model)))}}-{{workflow.parameters.context}}-{{workflow.parameters.tokenizer_tag}}.tokens"
                - name: learn_rate
                  value: "{{workflow.parameters.learn_rate}}"
                - name: learn_rate_end
                  value: "{{workflow.parameters.learn_rate_end}}"
                - name: warmup
                  value: "{{workflow.parameters.warmup}}"
                - name: epochs
                  value: "{{workflow.parameters.epochs}}"
                - name: save_steps
                  value: "{{workflow.parameters.save_steps}}"
                - name: eval_every
                  value: "{{workflow.parameters.eval_every}}"
                - name: eot
                  value: "{{workflow.parameters.eot_token}}"
                - name: pad
                  value: "{{workflow.parameters.pad_token}}"
                - name: bs
                  value: "{{workflow.parameters.batch_size}}"
                - name: seed
                  value: "{{workflow.parameters.random_seed}}"
                - name: output_path
                  value: "/{{workflow.parameters.pvc}}/hypernets/"
                - name: cache
                  value: "/{{workflow.parameters.pvc}}/cache/"
                - name: torch_cache
                  value: "/{{workflow.parameters.pvc}}/torch/"
                - name: no_resume
                  value: "{{workflow.parameters.no_resume}}"
                - name: logs
                  value: "/{{workflow.parameters.pvc}}/{{workflow.parameters.logs}}"
                - name: wandb_key
                  value: "{{workflow.parameters.wandb_key}}"
                - name: context
                  value: "{{workflow.parameters.context}}"
                - name: gas
                  value: "{{workflow.parameters.gas}}"
                - name: project_id
                  value: "{{workflow.parameters.project_id}}"
                - name: optimizer
                  value: "{{workflow.parameters.optimizer}}"
    - name: model-downloader
      inputs:
        parameters:
          - name: model
          - name: dest
      retryStrategy:
        limit: 1
      container:
        image: "{{workflow.parameters.downloader_image}}:{{workflow.parameters.downloader_tag}}"
        command: [ "/ko-app/model_downloader" ]
        args: [ "-model", "{{inputs.parameters.model}}",
                "-dest", "{{inputs.parameters.dest}}" ]
        resources:
          requests:
            memory: 512Mi
            cpu: "2"
          limits:
            memory: 512Mi
            cpu: "2"
        volumeMounts:
          - mountPath: "/{{workflow.parameters.pvc}}"
            name: "{{workflow.parameters.pvc}}"
      volumes:
        - name: "{{workflow.parameters.pvc}}"
          persistentVolumeClaim:
            claimName: "{{workflow.parameters.pvc}}"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: topology.kubernetes.io/region
                    operator: In
                    values:
                      - "{{workflow.parameters.region}}"
    - name: model-tokenizer
      inputs:
        parameters:
          - name: input
          - name: model
          - name: context
          - name: eot
          - name: pad
          - name: output
          - name: boundary
      retryStrategy:
        limit: 1
      container:
        image: "{{workflow.parameters.tokenizer_image}}:{{workflow.parameters.tokenizer_tag}}"
        command: [ "/ko-app/dataset_tokenizer" ]
        args: [ "-tokenizer", "gpt2", # "{{inputs.parameters.model}}",
                "-context", "{{inputs.parameters.context}}",
                "-eot", "{{inputs.parameters.eot}}",
                "-pad", "{{inputs.parameters.pad}}",
                "-input", "{{inputs.parameters.input}}",
                "-output", "{{inputs.parameters.output}}",
                "-boundary", "{{inputs.parameters.boundary}}",
                "-sanitize" ]
        resources:
          requests:
            memory: 256Mi
            cpu: "4"
          limits:
            memory: 256Mi
            cpu: "4"
        volumeMounts:
          - mountPath: "/{{workflow.parameters.pvc}}"
            name: "{{workflow.parameters.pvc}}"
      volumes:
        - name: "{{workflow.parameters.pvc}}"
          persistentVolumeClaim:
            claimName: "{{workflow.parameters.pvc}}"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: topology.kubernetes.io/region
                    operator: In
                    values:
                      - "{{workflow.parameters.region}}"
    - name: model-hypertrainer
      inputs:
        parameters:
          - name: run_name
          - name: model
          - name: dataset
          - name: learn_rate
          - name: learn_rate_end
          - name: epochs
          - name: bs
          - name: gas
          - name: seed
          - name: output_path
          - name: no_resume
          - name: wandb_key
          - name: project_id
          - name: context
          - name: optimizer
          - name: torch_cache
          - name: save_steps
          - name: eval_every
          - name: warmup
      container:
        image: "{{workflow.parameters.hypertrainer_image}}:{{workflow.parameters.hypertrainer_tag}}"
        command: [ "/usr/bin/python3", "/usr/src/app/hypertrain.py" ]
        args: [ "--run_name", "{{inputs.parameters.run_name}}",
                "--model", "{{inputs.parameters.model}}",
                "--dataset", "{{inputs.parameters.dataset}}",
                "--seed", "{{inputs.parameters.seed}}",
                "--lr", "{{inputs.parameters.learn_rate}}",
                "--end_lr", "{{inputs.parameters.learn_rate_end}}",
                "--warmup", "{{inputs.parameters.warmup}}",
                "--bs", "{{inputs.parameters.bs}}",
                "--gas", "{{inputs.parameters.gas}}",
                "--save_steps", "{{inputs.parameters.save_steps}}",
                "--eval_every", "{{inputs.parameters.eval_every}}",
                "--output_path", "{{inputs.parameters.output_path}}",
                "--project_id", "{{inputs.parameters.project_id}}",
                "--epochs", "{{inputs.parameters.epochs}}",
                "--context_size", "{{inputs.parameters.context}}",
                "--optimizer", "{{inputs.parameters.optimizer}}" ]
        tty: true
        env:
          - name: WANDB_API_KEY
            value: "{{inputs.parameters.wandb_key}}"
          - name: PYTHONUNBUFFERED
            value: "1"
          - name: TORCH_EXTENSIONS_DIR
            value: "{{inputs.parameters.torch_cache}}"
        resources:
          requests:
            memory: 128Gi
            cpu: "8"
          limits:
            memory: 192Gi
            cpu: "8"
            nvidia.com/gpu: 1
        volumeMounts:
          - mountPath: "/{{workflow.parameters.pvc}}"
            name: "{{workflow.parameters.pvc}}"
      volumes:
        - name: "{{workflow.parameters.pvc}}"
          persistentVolumeClaim:
            claimName: "{{workflow.parameters.pvc}}"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: gpu.nvidia.com/class
                    operator: In
                    values:
                      - "{{workflow.parameters.trainer_gpu}}"
                  - key: topology.kubernetes.io/region
                    operator: In
                    values:
                      - "{{workflow.parameters.region}}"