add sd in

15dd396a · nanahira · 34d0ca62 · 15dd396a · 34d0ca62 · acdc20a6
Commit 15dd396a authored Oct 08, 2022 by nanahira
222 changed files
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+# Pipeline stages in execution order: the per-architecture image builds run
+# in `build`, the multi-arch manifest jobs run in `deploy`.
+stages:
+  - build
+  - deploy
+variables:
+  # Quoted so the value stays a string; asks the runner for a shallow clone.
+  GIT_DEPTH: "1"
+
+# Runs before every job's script: log the Docker CLI in to the GitLab
+# container registry so the later push / pull / manifest commands work.
+# The password is piped through --password-stdin instead of being passed
+# with -p, so it does not show up in the process argument list and Docker
+# does not emit its "password on the CLI is insecure" warning.
+before_script:
+  - echo "$CI_REGISTRY_PASSWORD" | docker login -u "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY"
+
+# Hidden template job (name starts with a dot): it is only used through
+# `extends`. The extending job must define TARGET_IMAGE.
+.build-image:
+  stage: build
+  script:
+    # NOTE(review): .gitmodules is deleted in this same commit, so this
+    # step presumably has nothing left to initialise — confirm before
+    # removing it.
+    - git submodule update --init
+    # --pull re-fetches the base image rather than trusting a cached copy.
+    - docker build --pull -t $TARGET_IMAGE .
+    - docker push $TARGET_IMAGE
+
+# x86 build: runs on runners tagged `docker` and pushes the
+# "<ref-slug>-x86" tag that the deploy jobs reference.
+build-x86:
+  extends: .build-image
+  tags:
+    - docker
+  variables:
+    TARGET_IMAGE: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG-x86
+
+# ARM build: same template as build-x86, but scheduled on runners tagged
+# `docker-arm` and pushed under the "<ref-slug>-arm" tag.
+build-arm:
+  extends: .build-image
+  tags:
+    - docker-arm
+  variables:
+    TARGET_IMAGE: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG-arm
+
+# Hidden template job: combines the two per-architecture images pushed by
+# the build stage into one manifest list tagged TARGET_IMAGE (supplied by
+# the extending job) and pushes it.
+.deploy:
+  stage: deploy
+  tags:
+    - docker
+  script:
+    - docker pull $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG-x86
+    - docker pull $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG-arm
+    # The next list item is a single command: the plain scalar continues on
+    # the following, more-indented line and YAML folds it into one line.
+    - docker manifest create $TARGET_IMAGE --amend $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG-x86 --amend
+      $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG-arm
+    - docker manifest push $TARGET_IMAGE
+
+# Publishes the multi-arch manifest as `:latest`; restricted to the
+# `master` ref by `only`.
+deploy_latest:
+  extends: .deploy
+  variables:
+    TARGET_IMAGE: $CI_REGISTRY_IMAGE:latest
+  only:
+    - master
+
+# Publishes the multi-arch manifest under the ref slug. There is no `only`
+# filter here, so this job is not limited to a particular ref.
+deploy_branch:
+  extends: .deploy
+  variables:
+    TARGET_IMAGE: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
--- a/.gitmodules
+++ b/.gitmodules
-[submodule "stable-diffusion-private"]
-	path = stable-diffusion-private
-	url = https://github.com/NovelAI/stable-diffusion-private.git
-	branch = buckets
--- a/stable-diffusion-private @ acdc20a6
+++ b/stable-diffusion-private @ acdc20a6
-Subproject commit acdc20a6de698156418ad20ee277ccc45fe6787b
--- a/stable-diffusion-private/.gitignore
+++ b/stable-diffusion-private/.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+wandb/
+output/
\ No newline at end of file
--- a/stable-diffusion-private/LICENSE
+++ b/stable-diffusion-private/LICENSE
+MIT License
+
+Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/stable-diffusion-private/README.md
+++ b/stable-diffusion-private/README.md
--- a/stable-diffusion-private/assets/a-painting-of-a-fire.png
+++ b/stable-diffusion-private/assets/a-painting-of-a-fire.png
--- a/stable-diffusion-private/assets/a-photograph-of-a-fire.png
+++ b/stable-diffusion-private/assets/a-photograph-of-a-fire.png
--- a/stable-diffusion-private/assets/a-shirt-with-a-fire-printed-on-it.png
+++ b/stable-diffusion-private/assets/a-shirt-with-a-fire-printed-on-it.png
--- a/stable-diffusion-private/assets/a-shirt-with-the-inscription-'fire'.png
+++ b/stable-diffusion-private/assets/a-shirt-with-the-inscription-'fire'.png
--- a/stable-diffusion-private/assets/a-watercolor-painting-of-a-fire.png
+++ b/stable-diffusion-private/assets/a-watercolor-painting-of-a-fire.png
--- a/stable-diffusion-private/assets/birdhouse.png
+++ b/stable-diffusion-private/assets/birdhouse.png
--- a/stable-diffusion-private/assets/fire.png
+++ b/stable-diffusion-private/assets/fire.png
--- a/stable-diffusion-private/assets/inpainting.png
+++ b/stable-diffusion-private/assets/inpainting.png
--- a/stable-diffusion-private/assets/modelfigure.png
+++ b/stable-diffusion-private/assets/modelfigure.png
--- a/stable-diffusion-private/assets/reconstruction1.png
+++ b/stable-diffusion-private/assets/reconstruction1.png
--- a/stable-diffusion-private/assets/reconstruction2.png
+++ b/stable-diffusion-private/assets/reconstruction2.png
--- a/stable-diffusion-private/assets/results.gif
+++ b/stable-diffusion-private/assets/results.gif
--- a/stable-diffusion-private/assets/samples/grid-0001.png
+++ b/stable-diffusion-private/assets/samples/grid-0001.png
--- a/stable-diffusion-private/assets/samples/grid-0006.png
+++ b/stable-diffusion-private/assets/samples/grid-0006.png
--- a/stable-diffusion-private/assets/samples/grid-0007.png
+++ b/stable-diffusion-private/assets/samples/grid-0007.png
--- a/stable-diffusion-private/assets/samples/grid-0008.png
+++ b/stable-diffusion-private/assets/samples/grid-0008.png
--- a/stable-diffusion-private/assets/the-earth-is-on-fire,-oil-on-canvas.png
+++ b/stable-diffusion-private/assets/the-earth-is-on-fire,-oil-on-canvas.png
--- a/stable-diffusion-private/assets/txt2img-convsample.png
+++ b/stable-diffusion-private/assets/txt2img-convsample.png
--- a/stable-diffusion-private/assets/txt2img-preview.png
+++ b/stable-diffusion-private/assets/txt2img-preview.png
--- a/stable-diffusion-private/configs/autoencoder/autoencoder_kl_16x16x16.yaml
+++ b/stable-diffusion-private/configs/autoencoder/autoencoder_kl_16x16x16.yaml
+# Model section. `target` is a dotted path naming the class and `params`
+# holds its settings (presumably passed as constructor arguments — confirm
+# against the config loader). This variant uses a 16-channel latent.
+model:
+  base_learning_rate: 4.5e-6
+  target: ldm.models.autoencoder.AutoencoderKL
+  params:
+    monitor: val/rec_loss
+    embed_dim: 16
+
+    lossconfig:
+      target: ldm.modules.losses.LPIPSWithDiscriminator
+      params:
+        disc_start: 50001
+        kl_weight: 0.000001
+        disc_weight: 0.5
+
+    ddconfig:
+      double_z: true
+      z_channels: 16
+      resolution: 256
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      # num_down = len(ch_mult) - 1
+      ch_mult: [1, 1, 2, 2, 4]
+      num_res_blocks: 2
+      attn_resolutions: [16]
+      dropout: 0.0
+
+
+# Data section: one dataset class for training and one for validation,
+# both given size 256 and the `pil_nearest` degradation.
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 12
+    wrap: true
+
+    train:
+      target: ldm.data.imagenet.ImageNetSRTrain
+      params:
+        size: 256
+        degradation: pil_nearest
+
+    validation:
+      target: ldm.data.imagenet.ImageNetSRValidation
+      params:
+        size: 256
+        degradation: pil_nearest
+
+# Training-loop settings: the image-logging callback and trainer flags.
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 1000
+        max_images: 8
+        increase_log_steps: true
+
+  trainer:
+    benchmark: true
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/autoencoder/autoencoder_kl_32x32x4.yaml
+++ b/stable-diffusion-private/configs/autoencoder/autoencoder_kl_32x32x4.yaml
+# Model section. `target` is a dotted path naming the class and `params`
+# holds its settings (presumably passed as constructor arguments — confirm
+# against the config loader). This variant uses a 4-channel latent.
+model:
+  base_learning_rate: 4.5e-6
+  target: ldm.models.autoencoder.AutoencoderKL
+  params:
+    monitor: val/rec_loss
+    embed_dim: 4
+
+    lossconfig:
+      target: ldm.modules.losses.LPIPSWithDiscriminator
+      params:
+        disc_start: 50001
+        kl_weight: 0.000001
+        disc_weight: 0.5
+
+    ddconfig:
+      double_z: true
+      z_channels: 4
+      resolution: 256
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      # num_down = len(ch_mult) - 1
+      ch_mult: [1, 2, 4, 4]
+      num_res_blocks: 2
+      attn_resolutions: []
+      dropout: 0.0
+
+# Data section: one dataset class for training and one for validation,
+# both given size 256 and the `pil_nearest` degradation.
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 12
+    wrap: true
+
+    train:
+      target: ldm.data.imagenet.ImageNetSRTrain
+      params:
+        size: 256
+        degradation: pil_nearest
+
+    validation:
+      target: ldm.data.imagenet.ImageNetSRValidation
+      params:
+        size: 256
+        degradation: pil_nearest
+
+# Training-loop settings: the image-logging callback and trainer flags.
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 1000
+        max_images: 8
+        increase_log_steps: true
+
+  trainer:
+    benchmark: true
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/autoencoder/autoencoder_kl_64x64x3.yaml
+++ b/stable-diffusion-private/configs/autoencoder/autoencoder_kl_64x64x3.yaml
+# Model section. `target` is a dotted path naming the class and `params`
+# holds its settings (presumably passed as constructor arguments — confirm
+# against the config loader). This variant uses a 3-channel latent.
+model:
+  base_learning_rate: 4.5e-6
+  target: ldm.models.autoencoder.AutoencoderKL
+  params:
+    monitor: val/rec_loss
+    embed_dim: 3
+
+    lossconfig:
+      target: ldm.modules.losses.LPIPSWithDiscriminator
+      params:
+        disc_start: 50001
+        kl_weight: 0.000001
+        disc_weight: 0.5
+
+    ddconfig:
+      double_z: true
+      z_channels: 3
+      resolution: 256
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      # num_down = len(ch_mult) - 1
+      ch_mult: [1, 2, 4]
+      num_res_blocks: 2
+      attn_resolutions: []
+      dropout: 0.0
+
+
+# Data section: one dataset class for training and one for validation,
+# both given size 256 and the `pil_nearest` degradation.
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 12
+    wrap: true
+
+    train:
+      target: ldm.data.imagenet.ImageNetSRTrain
+      params:
+        size: 256
+        degradation: pil_nearest
+
+    validation:
+      target: ldm.data.imagenet.ImageNetSRValidation
+      params:
+        size: 256
+        degradation: pil_nearest
+
+# Training-loop settings: the image-logging callback and trainer flags.
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 1000
+        max_images: 8
+        increase_log_steps: true
+
+  trainer:
+    benchmark: true
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/autoencoder/autoencoder_kl_8x8x64.yaml
+++ b/stable-diffusion-private/configs/autoencoder/autoencoder_kl_8x8x64.yaml
+# Model section. `target` is a dotted path naming the class and `params`
+# holds its settings (presumably passed as constructor arguments — confirm
+# against the config loader). This variant uses a 64-channel latent.
+model:
+  base_learning_rate: 4.5e-6
+  target: ldm.models.autoencoder.AutoencoderKL
+  params:
+    monitor: val/rec_loss
+    embed_dim: 64
+
+    lossconfig:
+      target: ldm.modules.losses.LPIPSWithDiscriminator
+      params:
+        disc_start: 50001
+        kl_weight: 0.000001
+        disc_weight: 0.5
+
+    ddconfig:
+      double_z: true
+      z_channels: 64
+      resolution: 256
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      # num_down = len(ch_mult) - 1
+      ch_mult: [1, 1, 2, 2, 4, 4]
+      num_res_blocks: 2
+      attn_resolutions: [16, 8]
+      dropout: 0.0
+
+# Data section: one dataset class for training and one for validation,
+# both given size 256 and the `pil_nearest` degradation.
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 12
+    wrap: true
+
+    train:
+      target: ldm.data.imagenet.ImageNetSRTrain
+      params:
+        size: 256
+        degradation: pil_nearest
+
+    validation:
+      target: ldm.data.imagenet.ImageNetSRValidation
+      params:
+        size: 256
+        degradation: pil_nearest
+
+# Training-loop settings: the image-logging callback and trainer flags.
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 1000
+        max_images: 8
+        increase_log_steps: true
+
+  trainer:
+    benchmark: true
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/based/animefull.yaml
+++ b/stable-diffusion-private/configs/based/animefull.yaml
+# Fine-tuning run configuration; the run name and save path carry a
+# "-continue" suffix and model_path points at an existing checkpoint file.
+# NOTE(review): every path below is absolute and machine-specific
+# (/mnt/storageserver/...); adjust before running on another host.
+data_path: /mnt/storageserver/workspace/kuru/sdfinetune/dataset/fulldanbooru
+index_path: null
+model_path: /mnt/storageserver/workspace/kuru/sdfinetune/checkpoints/animefull-64bs-0.1ucg-penultimate-clip-5epoch-50-50prompt/animefull-64bs-0.1ucg-penultimate-clip-5epoch-50-50prompt-68000.pt
+config_path: /mnt/storageserver/workspace/kuru/sdfinetune/checkpoints/config.yaml
+save_path: /mnt/storageserver/workspace/kuru/sdfinetune/checkpoints/animefull-64bs-0.1ucg-penultimate-clip-5epoch-50-50prompt-continue
+do_save: true
+run_name: animefull-64bs-0.1ucg-penultimate-clip-5epoch-50-50prompt-continue
+# Exponent floats are written with a decimal point ("1.0e-5", not "1e-5"):
+# YAML 1.1 loaders such as PyYAML resolve the dot-less form as a string,
+# not a float.
+lr: 1.0e-5
+end_lr: 5.0e-6
+warmup_steps: 100
+anneal_steps: 370000
+bs: 8
+gas: 1
+seed: 69
+save_every: 2000
+amp: false
+loss_scale: false
+cast_to: float16
+sample_every: 500
+beta1: 0.95
+beta2: 0.999
+eps: 1.0e-8
+weight_decay: 0.0
+use_ema: true
+ucg: 0.1
+min_tags: 50
+max_tags: 50
+mode: basedformer
+epoch: 5
\ No newline at end of file
--- a/stable-diffusion-private/configs/based/animenoe.yaml
+++ b/stable-diffusion-private/configs/based/animenoe.yaml
+# Fine-tuning run configuration; the run name and save path carry a
+# "-continue" suffix and model_path points at an existing checkpoint file.
+# NOTE(review): every path below is absolute and machine-specific
+# (/mnt/storageserver/...); adjust before running on another host.
+data_path: /mnt/storageserver/workspace/kuru/sdfinetune/dataset/fulldanbooru
+index_path: /mnt/storageserver/workspace/kuru/sdfinetune/gsq.index
+model_path: /mnt/storageserver/workspace/kuru/sdfinetune/checkpoints/animeno-e-64bs-0.1ucg-penultimate-clip-6epoch-1-22prompt/animeno-e-64bs-0.1ucg-penultimate-clip-6epoch-1-22prompt-8000.pt
+config_path: /mnt/storageserver/workspace/kuru/sdfinetune/checkpoints/config.yaml
+save_path: /mnt/storageserver/workspace/kuru/sdfinetune/checkpoints/animeno-e-64bs-0.1ucg-penultimate-clip-6epoch-1-22prompt-continue
+do_save: true
+run_name: animeno-e-64bs-0.1ucg-penultimate-clip-6epoch-1-22prompt-continue
+# Exponent floats are written with a decimal point ("1.0e-5", not "1e-5"):
+# YAML 1.1 loaders such as PyYAML resolve the dot-less form as a string,
+# not a float.
+lr: 1.0e-5
+end_lr: 5.0e-6
+warmup_steps: 100
+anneal_steps: 414000
+bs: 8
+gas: 1
+seed: 69
+save_every: 2000
+amp: false
+loss_scale: false
+cast_to: float16
+sample_every: 500
+beta1: 0.95
+beta2: 0.999
+eps: 1.0e-8
+weight_decay: 0.0
+use_ema: true
+ucg: 0.1
+min_tags: 1
+max_tags: 22
+mode: basedformer
+# NOTE(review): the run name says "6epoch" but this is 5 — confirm which
+# one is intended.
+epoch: 5
\ No newline at end of file
--- a/stable-diffusion-private/configs/latent-diffusion/celebahq-ldm-vq-4.yaml
+++ b/stable-diffusion-private/configs/latent-diffusion/celebahq-ldm-vq-4.yaml
+# Latent diffusion model section: a UNet denoiser over the latents of a
+# VQ first stage, with no conditioning stage
+# (cond_stage_config is __is_unconditional__).
+model:
+  base_learning_rate: 2.0e-06
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.0015
+    linear_end: 0.0195
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    image_size: 64
+    channels: 3
+    monitor: val/loss_simple_ema
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 64
+        in_channels: 3
+        out_channels: 3
+        model_channels: 224
+        # Note: these are not actually resolutions but downsampling
+        # factors, i.e. they correspond to attention at spatial
+        # resolutions 8, 16, 32, since the spatial resolution of the
+        # latents is 64 for f4.
+        attention_resolutions: [8, 4, 2]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 3, 4]
+        num_head_channels: 32
+
+    first_stage_config:
+      target: ldm.models.autoencoder.VQModelInterface
+      params:
+        embed_dim: 3
+        n_embed: 8192
+        ckpt_path: models/first_stage_models/vq-f4/model.ckpt
+        ddconfig:
+          double_z: false
+          z_channels: 3
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config: __is_unconditional__
+# Data section: training and validation dataset classes (dotted paths in
+# `target`), each given size 256; batch_size 48 and num_workers 5.
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 48
+    num_workers: 5
+    wrap: false
+    train:
+      target: taming.data.faceshq.CelebAHQTrain
+      params:
+        size: 256
+    validation:
+      target: taming.data.faceshq.CelebAHQValidation
+      params:
+        size: 256
+
+
+# Training-loop settings: the image-logging callback and trainer flags.
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: false
+
+  trainer:
+    benchmark: true
\ No newline at end of file
--- a/stable-diffusion-private/configs/latent-diffusion/cin-ldm-vq-f8.yaml
+++ b/stable-diffusion-private/configs/latent-diffusion/cin-ldm-vq-f8.yaml
+# Latent diffusion model section: a UNet denoiser over the latents of a
+# VQ first stage, conditioned through cross-attention on a class-label
+# embedding (cond_stage_key / ClassEmbedder key are both class_label).
+model:
+  base_learning_rate: 1.0e-06
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.0015
+    linear_end: 0.0195
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: class_label
+    image_size: 32
+    channels: 4
+    cond_stage_trainable: true
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 256
+        # Note: these are not actually resolutions but downsampling
+        # factors, i.e. they correspond to attention at spatial
+        # resolutions 8, 16, 32, since the spatial resolution of the
+        # latents is 32 for f8.
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4]
+        num_head_channels: 32
+        use_spatial_transformer: true
+        transformer_depth: 1
+        context_dim: 512
+
+    first_stage_config:
+      target: ldm.models.autoencoder.VQModelInterface
+      params:
+        embed_dim: 4
+        n_embed: 16384
+        # Was configs/first_stage_models/vq-f8/model.yaml, i.e. a YAML
+        # config rather than a checkpoint. Now follows the
+        # models/first_stage_models/<name>/model.ckpt layout used by the
+        # other configs. NOTE(review): confirm the checkpoint location.
+        ckpt_path: models/first_stage_models/vq-f8/model.ckpt
+        ddconfig:
+          double_z: false
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 2, 4]
+          num_res_blocks: 2
+          attn_resolutions: [32]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.ClassEmbedder
+      params:
+        embed_dim: 512
+        key: class_label
+# Data section: training and validation dataset classes (dotted paths in
+# `target`). Unlike the sibling configs, `size` is nested under a `config`
+# mapping inside `params` here.
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 64
+    num_workers: 12
+    wrap: false
+    train:
+      target: ldm.data.imagenet.ImageNetTrain
+      params:
+        config:
+          size: 256
+    validation:
+      target: ldm.data.imagenet.ImageNetValidation
+      params:
+        config:
+          size: 256
+
+
+# Training-loop settings: the image-logging callback and trainer flags.
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: false
+
+  trainer:
+    benchmark: true
\ No newline at end of file
--- a/stable-diffusion-private/configs/latent-diffusion/cin256-v2.yaml
+++ b/stable-diffusion-private/configs/latent-diffusion/cin256-v2.yaml
+# Latent diffusion model section: a UNet denoiser over the latents of a
+# VQ first stage, conditioned through cross-attention on a class-label
+# embedding. No first-stage ckpt_path is set in this file.
+model:
+  base_learning_rate: 0.0001
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.0015
+    linear_end: 0.0195
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: class_label
+    image_size: 64
+    channels: 3
+    cond_stage_trainable: true
+    conditioning_key: crossattn
+    monitor: val/loss
+    use_ema: false
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 64
+        in_channels: 3
+        out_channels: 3
+        model_channels: 192
+        attention_resolutions: [8, 4, 2]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 3, 5]
+        num_heads: 1
+        use_spatial_transformer: true
+        transformer_depth: 1
+        context_dim: 512
+
+    first_stage_config:
+      target: ldm.models.autoencoder.VQModelInterface
+      params:
+        embed_dim: 3
+        n_embed: 8192
+        ddconfig:
+          double_z: false
+          z_channels: 3
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.ClassEmbedder
+      params:
+        n_classes: 1001
+        embed_dim: 512
+        key: class_label
--- a/stable-diffusion-private/configs/latent-diffusion/ffhq-ldm-vq-4.yaml
+++ b/stable-diffusion-private/configs/latent-diffusion/ffhq-ldm-vq-4.yaml
+# Latent diffusion model section: a UNet denoiser over the latents of a
+# VQ first stage, with no conditioning stage
+# (cond_stage_config is __is_unconditional__).
+model:
+  base_learning_rate: 2.0e-06
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.0015
+    linear_end: 0.0195
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    image_size: 64
+    channels: 3
+    monitor: val/loss_simple_ema
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 64
+        in_channels: 3
+        out_channels: 3
+        model_channels: 224
+        # Note: these are not actually resolutions but downsampling
+        # factors, i.e. they correspond to attention at spatial
+        # resolutions 8, 16, 32, since the spatial resolution of the
+        # latents is 64 for f4.
+        attention_resolutions: [8, 4, 2]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 3, 4]
+        num_head_channels: 32
+
+    first_stage_config:
+      target: ldm.models.autoencoder.VQModelInterface
+      params:
+        embed_dim: 3
+        n_embed: 8192
+        # Was configs/first_stage_models/vq-f4/model.yaml, i.e. a YAML
+        # config rather than a checkpoint. Now follows the
+        # models/first_stage_models/<name>/model.ckpt layout used by the
+        # other configs. NOTE(review): confirm the checkpoint location.
+        ckpt_path: models/first_stage_models/vq-f4/model.ckpt
+        ddconfig:
+          double_z: false
+          z_channels: 3
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config: __is_unconditional__
+# Data section: training and validation dataset classes (dotted paths in
+# `target`), each given size 256; batch_size 42 and num_workers 5.
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 42
+    num_workers: 5
+    wrap: false
+    train:
+      target: taming.data.faceshq.FFHQTrain
+      params:
+        size: 256
+    validation:
+      target: taming.data.faceshq.FFHQValidation
+      params:
+        size: 256
+
+
+# Training-loop settings: the image-logging callback and trainer flags.
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: false
+
+  trainer:
+    benchmark: true
\ No newline at end of file
--- a/stable-diffusion-private/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml
+++ b/stable-diffusion-private/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml
+# Latent diffusion model section: a UNet denoiser over the latents of a
+# VQ first stage, with no conditioning stage
+# (cond_stage_config is __is_unconditional__).
+model:
+  base_learning_rate: 2.0e-06
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.0015
+    linear_end: 0.0195
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    image_size: 64
+    channels: 3
+    monitor: val/loss_simple_ema
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 64
+        in_channels: 3
+        out_channels: 3
+        model_channels: 224
+        # Note: these are not actually resolutions but downsampling
+        # factors, i.e. they correspond to attention at spatial
+        # resolutions 8, 16, 32, since the spatial resolution of the
+        # latents is 64 for f4.
+        attention_resolutions: [8, 4, 2]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 3, 4]
+        num_head_channels: 32
+
+    first_stage_config:
+      target: ldm.models.autoencoder.VQModelInterface
+      params:
+        # Was configs/first_stage_models/vq-f4/model.yaml, i.e. a YAML
+        # config rather than a checkpoint. Now follows the
+        # models/first_stage_models/<name>/model.ckpt layout used by the
+        # other configs. NOTE(review): confirm the checkpoint location.
+        ckpt_path: models/first_stage_models/vq-f4/model.ckpt
+        embed_dim: 3
+        n_embed: 8192
+        ddconfig:
+          double_z: false
+          z_channels: 3
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config: __is_unconditional__
+# Data section: training and validation dataset classes (dotted paths in
+# `target`), each given size 256; batch_size 48 and num_workers 5.
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 48
+    num_workers: 5
+    wrap: false
+    train:
+      target: ldm.data.lsun.LSUNBedroomsTrain
+      params:
+        size: 256
+    validation:
+      target: ldm.data.lsun.LSUNBedroomsValidation
+      params:
+        size: 256
+
+
+# Training-loop settings: the image-logging callback and trainer flags.
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: false
+
+  trainer:
+    benchmark: true
\ No newline at end of file
--- a/stable-diffusion-private/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml
+++ b/stable-diffusion-private/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml
+# Latent diffusion model section: a UNet denoiser over the latents of a
+# KL autoencoder first stage, with no conditioning stage
+# (cond_stage_config is __is_unconditional__) and a warm-up LR schedule.
+model:
+  # set to target_lr by starting main.py with '--scale_lr False'
+  base_learning_rate: 5.0e-5
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.0015
+    linear_end: 0.0155
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    loss_type: l1
+    first_stage_key: image
+    cond_stage_key: image
+    image_size: 32
+    channels: 4
+    cond_stage_trainable: false
+    concat_mode: false
+    scale_by_std: true
+    monitor: val/loss_simple_ema
+
+    # 10000 warmup steps
+    scheduler_config:
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [10000]
+        cycle_lengths: [10000000000000]
+        f_start: [1.0e-6]
+        f_max: [1.0]
+        f_min: [1.0]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 192
+        # 32, 16, 8, 4
+        attention_resolutions: [1, 2, 4, 8]
+        num_res_blocks: 2
+        # 32, 16, 8, 4, 2
+        channel_mult: [1, 2, 2, 4, 4]
+        num_heads: 8
+        use_scale_shift_norm: true
+        resblock_updown: true
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: models/first_stage_models/kl-f8/model.ckpt
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          # num_down = len(ch_mult) - 1
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config: __is_unconditional__
+
+# Data section: training and validation dataset classes (dotted paths in
+# `target`), each given size 256; batch_size 96 and num_workers 5.
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 96
+    num_workers: 5
+    wrap: false
+
+    train:
+      target: ldm.data.lsun.LSUNChurchesTrain
+      params:
+        size: 256
+
+    validation:
+      target: ldm.data.lsun.LSUNChurchesValidation
+      params:
+        size: 256
+
+# Training-loop settings: the image-logging callback and trainer flags.
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: false
+
+  trainer:
+    benchmark: true
\ No newline at end of file
--- a/stable-diffusion-private/configs/latent-diffusion/txt2img-1p4B-eval.yaml
+++ b/stable-diffusion-private/configs/latent-diffusion/txt2img-1p4B-eval.yaml
+# Latent diffusion model section: a UNet denoiser over the latents of a
+# KL autoencoder first stage, conditioned through cross-attention on the
+# output of a BERTEmbedder (cond_stage_key is `caption`). No first-stage
+# ckpt_path is set in this file.
+model:
+  base_learning_rate: 5.0e-05
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: caption
+    image_size: 32
+    channels: 4
+    cond_stage_trainable: true
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: false
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_heads: 8
+        use_spatial_transformer: true
+        transformer_depth: 1
+        # Matches n_embed of the BERTEmbedder below.
+        context_dim: 1280
+        use_checkpoint: true
+        legacy: false
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.BERTEmbedder
+      params:
+        n_embed: 1280
+        n_layer: 32
--- a/stable-diffusion-private/configs/stable-diffusion/dev.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/dev.yaml
+# Development-sized latent diffusion model section: the TODO notes below
+# mark values (model_channels, n_layer) that are deliberately small and
+# meant to be increased.
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: jpg
+    cond_stage_key: txt
+    image_size: 32
+    channels: 4
+    cond_stage_trainable: true
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    # 10000 warmup steps
+    scheduler_config:
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [10000]
+        # incredibly large number to prevent corner cases
+        cycle_lengths: [10000000000000]
+        f_start: [1.0e-6]
+        f_max: [1.0]
+        f_min: [1.0]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        # 320   # TODO increase
+        model_channels: 32
+        # is equal to fixed spatial resolution: 32 , 16 , 8
+        attention_resolutions: []
+        num_res_blocks: 2
+        channel_mult: [1]
+        # num_head_channels: 32
+        num_heads: 8
+        use_spatial_transformer: true
+        transformer_depth: 1
+        context_dim: 32
+        use_checkpoint: false
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: models/first_stage_models/kl-f8/model.ckpt
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.BERTEmbedder
+      params:
+        n_embed: 32
+        # 32 # TODO: increase
+        n_layer: 1
+
+
+data:  # LAION tar shards read through an "aws s3 cp" pipe (see tar_base)
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"
+    batch_size: 10
+    num_workers: 4
+    n_nodes: 1
+    train:
+      shards: '{000000..000010}.tar -'  # TODO: wild guess, change
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3  # presumably PIL bicubic (3) — TODO confirm
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+      shuffle: 5000
+      n_examples: 16519100  # TODO: find out
+    validation:
+      shards: '{000011..000012}.tar -'  # TODO: wild guess, change (disjoint from the train shard range)
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+      shuffle: 0
+      n_examples: 60000  # TODO: find out
+    val_num_workers: 2
+
+
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000 # 5000
+        max_images: 0  # NOTE(review): sibling configs use 4 or 8 — confirm that logging zero images is intended
+        increase_log_steps: False
+        log_first_step: True
+
+
+  trainer:
+    replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 20000  # every 20k training steps
+    num_sanity_val_steps: 0
+
--- a/stable-diffusion-private/configs/stable-diffusion/dev_mn.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/dev_mn.yaml
+model:  # scaled-down dev variant: model_channels 32 and a single BERT layer (see TODOs below)
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 32  # presumably latent resolution (256 px crops / 8) — TODO confirm
+    channels: 4  # same value as first_stage_config z_channels / embed_dim
+    cond_stage_trainable: true
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 32 # dev value; 320 is the full-size setting (TODO increase)
+        attention_resolutions: [ ]  # is equal to fixed spatial resolution: 32 , 16 , 8
+        num_res_blocks: 2
+        channel_mult: [ 1, ]
+        #num_head_channels: 32
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 32  # same value as cond_stage_config.params.n_embed
+        use_checkpoint: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"  # relative path to the kl-f8 checkpoint
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.BERTEmbedder
+      params:
+        n_embed: 32  # same value as unet_config.params.context_dim
+        n_layer: 1 # dev value; 32 noted as the intended size (TODO: increase)
+
+
+data:  # LAION tar shards read through an "aws s3 cp" pipe (see tar_base)
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"
+    batch_size: 4
+    num_workers: 4
+    n_nodes: 4
+    train:
+      shards: '{000000..231339}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3  # presumably PIL bicubic (3) — TODO confirm
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{231346..231349}.tar -'  # 4 shards, outside the train shard range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 500 # 5000
+        max_images: 8
+        increase_log_steps: False
+        log_first_step: False
+
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 1000  # every 1k training steps
+    num_sanity_val_steps: 0
+
+
--- a/stable-diffusion-private/configs/stable-diffusion/dev_mn_dummy.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/dev_mn_dummy.yaml
+model:  # scaled-down dev variant: model_channels 32 and a single BERT layer (see TODOs below)
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 32  # presumably latent resolution (256 px inputs / 8) — TODO confirm
+    channels: 4  # same value as first_stage_config z_channels / embed_dim
+    cond_stage_trainable: true
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 32 # dev value; 320 is the full-size setting (TODO increase)
+        attention_resolutions: [ ]  # is equal to fixed spatial resolution: 32 , 16 , 8
+        num_res_blocks: 2
+        channel_mult: [ 1, ]
+        #num_head_channels: 32
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 32  # same value as cond_stage_config.params.n_embed
+        use_checkpoint: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"  # relative path to the kl-f8 checkpoint
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.BERTEmbedder
+      params:
+        n_embed: 32  # same value as unet_config.params.context_dim
+        n_layer: 1 # dev value; 32 noted as the intended size (TODO: increase)
+
+
+data:  # uses ldm.data.dummy.DummyData for both splits instead of the S3 shard loader
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 4
+    num_workers: 4
+    wrap: false
+    train:
+      target: ldm.data.dummy.DummyData
+      params:
+        length: 20000
+        size: [256, 256, 3]  # presumably height, width, channels — TODO confirm
+    validation:
+      target: ldm.data.dummy.DummyData
+      params:
+        length: 10000
+        size: [256, 256, 3]
+
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 500 # 5000
+        max_images: 8
+        increase_log_steps: False
+        log_first_step: False
+
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 1000  # every 1k training steps
+    num_sanity_val_steps: 0
--- a/stable-diffusion-private/configs/stable-diffusion/inpainting/v1-finetune-for-inpainting-laion-aesthetic-larger-masks.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/inpainting/v1-finetune-for-inpainting-laion-aesthetic-larger-masks.yaml
+model:  # inpainting finetune: LatentInpaintDiffusion with a 9-channel UNet input, started from ckpt_path
+  base_learning_rate: 7.5e-05
+  target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64  # presumably latent resolution (512 px crops / 8) — TODO confirm
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: hybrid   # important
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    ckpt_path: "/fsx/stable-diffusion/stable-diffusion/checkpoints/v1pp/v1pp-flatlined-hr.ckpt"  # NOTE(review): absolute, machine-specific path
+
+    scheduler_config: # LR warmup; 2500 steps here (see NOTE on warm_up_steps)
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 2500 ] # NOTE for resuming. use 10000 if starting from scratch
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 9  # 4 data + 4 downscaled image + 1 mask
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width — TODO confirm
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data:  # 512 px crops with an AddMask postprocess on both splits
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "__improvedaesthetic__"  # NOTE(review): looks like a placeholder/alias, not a URL — confirm how the loader resolves it
+    batch_size: 2
+    num_workers: 4
+    multinode: True
+    min_size: 512
+    max_pwatermark: 0.8
+    train:
+      shards: '{00000..17279}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3  # presumably PIL bicubic (3) — TODO confirm
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 512
+      postprocess:
+        target: ldm.data.laion.AddMask
+        params:
+          mode: "512train-large"
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{17280..17535}.tar -'  # outside the train shard range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 512
+      postprocess:
+        target: ldm.data.laion.AddMask
+        params:
+          mode: "512train-large"  # same mask mode as the train split
+
+
+lightning:
+  find_unused_parameters: False
+
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 2000
+
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        disabled: False
+        batch_frequency: 1000
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+          ddim_steps: 100  # TODO: check this value for inpainting
+          ddim_eta: 1.0   # TODO: check this value for inpainting
+
+  trainer:
+    benchmark: True
+    val_check_interval: 5000000  # really sorry — very large interval, presumably so validation never triggers in practice
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/stable-diffusion/inpainting/v1-finetune-for-inpainting-laion-iaesthe.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/inpainting/v1-finetune-for-inpainting-laion-iaesthe.yaml
+model:  # inpainting finetune: LatentInpaintDiffusion with a 9-channel UNet input, started from ckpt_path
+  base_learning_rate: 7.5e-05
+  target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64  # presumably latent resolution (512 px crops / 8) — TODO confirm
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: hybrid   # important
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    ckpt_path: "/fsx/stable-diffusion/stable-diffusion/checkpoints2/v1pp/v1pp-flatline-pruned.ckpt"  # NOTE(review): absolute, machine-specific path
+
+    scheduler_config: # LR warmup; 2500 steps here (see NOTE on warm_up_steps)
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 2500 ] # NOTE for resuming. use 10000 if starting from scratch
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 9  # 4 data + 4 downscaled image + 1 mask
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width — TODO confirm
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data:  # 512 px crops from laion-high-resolution with an AddMask postprocess on both splits
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
+    batch_size: 4
+    num_workers: 4
+    multinode: True
+    min_size: 512
+    train:
+      shards: '{00000..17279}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3  # presumably PIL bicubic (3) — TODO confirm
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 512
+      postprocess:
+        target: ldm.data.laion.AddMask  # no params given, so AddMask runs with its own defaults
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{17280..17535}.tar -'  # outside the train shard range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 512
+      postprocess:
+        target: ldm.data.laion.AddMask
+
+
+lightning:
+  find_unused_parameters: False
+
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 2000
+
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        disabled: False
+        batch_frequency: 1000
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+          ddim_steps: 100  # TODO: check this value for inpainting
+          ddim_eta: 1.0   # TODO: check this value for inpainting
+
+  trainer:
+    benchmark: True
+    val_check_interval: 5000000  # really sorry — very large interval, presumably so validation never triggers in practice
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512-inference.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512-inference.yaml
+model:  # model definition only; this file carries no data or lightning sections
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64  # presumably latent resolution (512 px / 8) — TODO confirm
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width — TODO confirm
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml
+model:  # 512 px training variant (image_size 64) with a frozen CLIP text encoder
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64  # presumably latent resolution (512 px crops / 8) — TODO confirm
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # LR warmup; only 1 step here (see NOTE on warm_up_steps)
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width — TODO confirm
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data:  # 512 px crops from laion-high-resolution tar shards, read through an "aws s3 cp" pipe
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
+    batch_size: 4
+    num_workers: 4
+    multinode: True
+    train:
+      shards: '{00000..17279}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3  # presumably PIL bicubic (3) — TODO confirm
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 512
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{17280..17535}.tar -'  # outside the train shard range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 512
+
+
+lightning:
+  find_unused_parameters: False
+
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 5000
+
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000  # same cadence as modelcheckpoint every_n_train_steps
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+
+  trainer:
+    benchmark: True
+    val_check_interval: 5000000  # really sorry — very large interval, presumably so validation never triggers in practice
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder.yaml
+model:  # 256 px training variant (image_size 32) with a frozen CLIP text encoder
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 32  # presumably latent resolution (256 px crops / 8) — TODO confirm
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width — TODO confirm
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"  # relative path to the kl-f8 checkpoint
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data:  # LAION tar shards read through an "aws s3 cp" pipe (see tar_base)
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"
+    batch_size: 50
+    num_workers: 4
+    multinode: True
+    train:
+      shards: '{000000..231317}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3  # presumably PIL bicubic (3) — TODO confirm
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{231318..231349}.tar -'  # starts right after the last train shard
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 5000000  # really sorry — very large interval, presumably so validation never triggers in practice
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-1p4B-multinode-t5-encoder.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-1p4B-multinode-t5-encoder.yaml
+model:  # 256 px training variant (image_size 32) conditioned on a frozen T5 v1.1 XL text encoder
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 32  # presumably latent resolution (256 px crops / 8) — TODO confirm
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 2048  # must equal the text-encoder width: google/t5-v1_1-xl has d_model 2048 (768 is the CLIP value)
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"  # relative path to the kl-f8 checkpoint
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenT5Embedder
+      params:
+        version: "google/t5-v1_1-xl"  # presumably emits unprojected hidden states, so unet context_dim must follow this model's width — TODO confirm
+
+
+data:  # LAION tar shards read through an "aws s3 cp" pipe (see tar_base)
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"
+    batch_size: 12
+    num_workers: 4
+    train:
+      shards: '{000000..231317}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3  # presumably PIL bicubic (3) — TODO confirm
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{231318..231349}.tar -'  # starts right after the last train shard
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: False
+        log_first_step: False
+
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 50000  # every 50k training steps
+    num_sanity_val_steps: 0
+
+
+
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-1p4B-multinode.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-1p4B-multinode.yaml
+model:  # full-size variant with a trainable 32-layer BERTEmbedder (n_embed 1280)
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 32  # presumably latent resolution (256 px crops / 8) — TODO confirm
+    channels: 4
+    cond_stage_trainable: true
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 1280  # same value as cond_stage_config.params.n_embed
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"  # relative path to the kl-f8 checkpoint
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.BERTEmbedder
+      params:
+        n_embed: 1280  # same value as unet_config.params.context_dim
+        n_layer: 32
+
+
+data:  # LAION-2B tar shards read through an "aws s3 cp" pipe; 256 px square crops
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:  # NOTE(review): no 'multinode' key here although the file name says multinode; data-module default applies - confirm
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"  # presumably prepended to each shards pattern - confirm
+    batch_size: 12
+    num_workers: 4
+    train:
+      shards: '{000000..231317}.tar -'  # 231318 train shards; the trailing "-" presumably sends aws s3 cp output to stdout
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256  # int size: the shorter image edge is resized to 256
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{231318..231349}.tar -'  # 32 shards, disjoint from the train range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+
+lightning:  # Lightning settings: image-logging callback and trainer flags
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: False
+        log_first_step: False
+
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 50000  # integer value: run validation every 50000 training batches
+    num_sanity_val_steps: 0  # 0 skips the sanity validation pass before training
+
+
+
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-clip-encoder-dev.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-clip-encoder-dev.yaml
+model:  # latent diffusion: UNet denoiser + kl-f8 autoencoder first stage + frozen CLIP text encoder
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"  # same key as image_key in the data section
+    cond_stage_key: "txt"
+    image_size: 32  # = 256 px crop / 8; presumably the latent side length for the kl-f8 first stage
+    channels: 4  # same value as first-stage z_channels / embed_dim
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]  # f_min == f_max; presumably the LR multiplier stays constant after warm-up - confirm
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width - confirm against the encoder
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"  # relative path; depends on the working directory
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity  # Identity module: no autoencoder loss is configured here
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder  # no params given: the embedder's defaults apply
+
+
+data:  # LAION-2B tar shards read through an "aws s3 cp" pipe; 256 px square crops
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"  # presumably prepended to each shards pattern - confirm
+    batch_size: 56
+    num_workers: 4
+    multinode: True
+    train:
+      shards: '{000000..231317}.tar -'  # 231318 train shards; the trailing "-" presumably sends aws s3 cp output to stdout
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256  # int size: the shorter image edge is resized to 256
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{231318..231349}.tar -'  # 32 shards, disjoint from the train range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+
+lightning:  # Lightning settings: image-logging callback and trainer flags
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: False
+        log_first_step: False
+
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 50000  # integer value: run validation every 50000 training batches
+    num_sanity_val_steps: 0  # 0 skips the sanity validation pass before training
+    accumulate_grad_batches: 2  # gradients are accumulated over 2 batches per optimizer step
+
+
+
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-ldm-frozen-dev.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-ldm-frozen-dev.yaml
+model:  # latent diffusion: UNet denoiser + kl-f8 autoencoder first stage + BERT text encoder (not trainable)
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"  # same key as image_key in the data section
+    cond_stage_key: "txt"
+    image_size: 32  # = 256 px crop / 8; presumably the latent side length for the kl-f8 first stage
+    channels: 4  # same value as first-stage z_channels / embed_dim
+    cond_stage_trainable: false  # the only model difference from txt2img-ldm-unfrozen-dev.yaml
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]  # f_min == f_max; presumably the LR multiplier stays constant after warm-up - confirm
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 1280  # same value as n_embed in cond_stage_config
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"  # relative path; depends on the working directory
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity  # Identity module: no autoencoder loss is configured here
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.BERTEmbedder
+      params:
+        n_embed: 1280  # same value as context_dim in unet_config
+        n_layer: 32
+
+
+data:  # LAION-2B tar shards read through an "aws s3 cp" pipe; 256 px square crops
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"  # presumably prepended to each shards pattern - confirm
+    batch_size: 52
+    num_workers: 4
+    multinode: False
+    train:
+      shards: '{000000..231317}.tar -'  # 231318 train shards; the trailing "-" presumably sends aws s3 cp output to stdout
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256  # int size: the shorter image edge is resized to 256
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{231318..231349}.tar -'  # 32 shards, disjoint from the train range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+
+lightning:  # Lightning settings: image-logging callback and trainer flags
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: False
+        log_first_step: False
+
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 50000  # integer value: run validation every 50000 training batches
+    num_sanity_val_steps: 0  # 0 skips the sanity validation pass before training
+
+
+
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-ldm-unfrozen-dev.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-ldm-unfrozen-dev.yaml
+model:  # latent diffusion: UNet denoiser + kl-f8 autoencoder first stage + trainable BERT text encoder
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"  # same key as image_key in the data section
+    cond_stage_key: "txt"
+    image_size: 32  # = 256 px crop / 8; presumably the latent side length for the kl-f8 first stage
+    channels: 4  # same value as first-stage z_channels / embed_dim
+    cond_stage_trainable: true  # the only model difference from txt2img-ldm-frozen-dev.yaml
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]  # f_min == f_max; presumably the LR multiplier stays constant after warm-up - confirm
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 1280  # same value as n_embed in cond_stage_config
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"  # relative path; depends on the working directory
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity  # Identity module: no autoencoder loss is configured here
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.BERTEmbedder
+      params:
+        n_embed: 1280  # same value as context_dim in unet_config
+        n_layer: 32
+
+
+data:  # LAION-2B tar shards read through an "aws s3 cp" pipe; 256 px square crops
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"  # presumably prepended to each shards pattern - confirm
+    batch_size: 12
+    num_workers: 4
+    multinode: False
+    train:
+      shards: '{000000..231317}.tar -'  # 231318 train shards; the trailing "-" presumably sends aws s3 cp output to stdout
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256  # int size: the shorter image edge is resized to 256
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{231318..231349}.tar -'  # 32 shards, disjoint from the train range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+
+lightning:  # Lightning settings: image-logging callback and trainer flags
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: False
+        log_first_step: False
+
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 50000  # integer value: run validation every 50000 training batches
+    num_sanity_val_steps: 0  # 0 skips the sanity validation pass before training
+
+
+
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-ldm-vae-f8.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-ldm-vae-f8.yaml
+model:  # small dev-size latent diffusion: reduced UNet + kl-f8 autoencoder first stage + 3-layer BERT text encoder
+  base_learning_rate: 1.0e-04  # TODO: run with scale_lr False
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"  # same key as image_key in the data section
+    cond_stage_key: "txt"
+    image_size: 32
+    channels: 4  # same value as first-stage z_channels / embed_dim
+    cond_stage_trainable: true
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]  # f_min == f_max; presumably the LR multiplier stays constant after warm-up - confirm
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 128 # 320   # TODO increase
+        attention_resolutions: [ 4, 2, 1 ]  # is equal to fixed spatial resolution: 32 , 16 , 8
+        num_res_blocks: 2
+        channel_mult: [ 1,2,4,4 ]
+        #num_head_channels: 32
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 1280  # same value as n_embed in cond_stage_config
+        use_checkpoint: True
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        # Repo-relative path, as in the sibling kl-f8 configs (was a developer-specific /home/... path).
+        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity  # Identity module: no autoencoder loss is configured here
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.BERTEmbedder
+      params:
+        n_embed: 1280  # same value as context_dim in unet_config
+        n_layer: 3 #32 # TODO: increase
+
+
+data:  # small dev subset of LAION-2B tar shards read through an "aws s3 cp" pipe; 512 px square crops
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"  # presumably prepended to each shards pattern - confirm
+    batch_size: 60
+    num_workers: 4
+    n_nodes: 2  # TODO: runs with two gpus
+    train:
+      shards: '{000000..000010}.tar -'  # TODO: wild guess, change
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512  # NOTE(review): 512 / 8 = 64, but model image_size is 32 - confirm this is intended
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 512
+
+      shuffle: 5000
+      n_examples: 16519100  # TODO: find out
+    validation:
+      shards: '{000011..000012}.tar -'  # TODO: wild guess, change
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 512
+
+      shuffle: 0
+      n_examples: 60000  # TODO: find out
+    val_num_workers: 2
+
+
+
+lightning:  # Lightning settings: image-logging callback and trainer flags
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000 # 5000
+        max_images: 8
+        increase_log_steps: False
+        log_first_step: True  # differs from the sibling configs, which set this to False
+
+
+  trainer:
+    replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 20000  # every 20k training steps
+    num_sanity_val_steps: 0  # 0 skips the sanity validation pass before training
\ No newline at end of file
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-1024-laion-hr.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-1024-laion-hr.yaml
+model:  # latent diffusion at 1024 px: UNet denoiser + f16 KL autoencoder (16 latent channels) + frozen CLIP text encoder
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.001
+    linear_end: 0.015
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"  # same key as image_key in the data section
+    cond_stage_key: "txt"
+    image_size: 64  # = 1024 px crop / 16 (first stage has 4 downsamplings, see ch_mult below)
+    channels: 16  # same value as first-stage z_channels / embed_dim
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.22765929   # magic number
+  
+    #ckpt_path: "/home/mchorse/stable-diffusion-ckpts/768f16-2022-06-23-pruned.ckpt"
+
+    # LR warm-up scheduler is disabled in this config (kept commented out below)
+    #scheduler_config: # 10000 warmup steps
+    #  target: ldm.lr_scheduler.LambdaLinearScheduler
+    #  params:
+    #    warm_up_steps: [ 10000 ]
+    #    cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+    #    f_start: [ 1.e-6 ]
+    #    f_max: [ 1. ]
+    #    f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 64    # not really needed
+        in_channels: 16
+        out_channels: 16
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width - confirm against the encoder
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:  # NOTE(review): no ckpt_path here; first-stage weights presumably come from a full-model checkpoint - confirm
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 16
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [ 16 ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity  # Identity module: no autoencoder loss is configured here
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder  # no params given: the embedder's defaults apply
+
+
+data:  # laion-high-resolution tar shards read through an "aws s3 cp" pipe; 1024 px square crops
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"  # presumably prepended to each shards pattern - confirm
+    batch_size: 3
+    num_workers: 4
+    multinode: True
+    train:
+      shards: '{00000..17279}.tar -'  # 17280 train shards; the trailing "-" presumably sends aws s3 cp output to stdout
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 1024  # int size: the shorter image edge is resized to 1024
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 1024
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{17280..17535}.tar -'  # 256 shards, disjoint from the train range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 1024
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 1024
+
+
+lightning:  # Lightning settings: checkpointing, image-logging callback and trainer flags
+  find_unused_parameters: False
+
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 2000  # same cadence as image_logger batch_frequency below
+
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 2000
+        max_images: 2
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 2  # same value as max_images above
+          unconditional_guidance_scale: 5.0
+          unconditional_guidance_label: [""]  # presumably the empty prompt used for the unconditional branch - confirm
+
+  trainer:
+    benchmark: True
+    val_check_interval: 5000000  # very large interval; presumably meant to effectively skip validation - confirm
+    num_sanity_val_steps: 0  # 0 skips the sanity validation pass before training
+    accumulate_grad_batches: 4  # gradients are accumulated over 4 batches per optimizer step
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-256-pretraining.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-256-pretraining.yaml
+model:  # latent diffusion at 256 px: UNet denoiser + kl-f16 autoencoder (16 latent channels) + frozen CLIP text encoder
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.001
+    linear_end: 0.015
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"  # same key as image_key in the data section
+    cond_stage_key: "txt"
+    image_size: 16  # = 256 px crop / 16 (first stage has 4 downsamplings, see ch_mult below)
+    channels: 16  # same value as first-stage z_channels / embed_dim
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.22765929   # magic number
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]  # f_min == f_max; presumably the LR multiplier stays constant after warm-up - confirm
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 16    # not really needed
+        in_channels: 16
+        out_channels: 16
+        model_channels: 320    # TODO: scale model here
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width - confirm against the encoder
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 16
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f16/model.ckpt"  # relative path; depends on the working directory
+        ddconfig:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [ 16 ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity  # Identity module: no autoencoder loss is configured here
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder  # no params given: the embedder's defaults apply
+
+
+data:  # LAION-2B tar shards read through an "aws s3 cp" pipe; 256 px square crops
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"  # presumably prepended to each shards pattern - confirm
+    batch_size: 55
+    num_workers: 4
+    multinode: True
+    min_size: 256   # TODO: experiment. Note: for 2B, images are stored at max 384 resolution
+    train:
+      shards: '{000000..231317}.tar -'  # 231318 train shards; the trailing "-" presumably sends aws s3 cp output to stdout
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256  # int size: the shorter image edge is resized to 256
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{231318..231349}.tar -'  # 32 shards, disjoint from the train range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+
+lightning:  # Lightning settings: image-logging callback and trainer flags
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4  # same value as max_images above
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]  # presumably the empty prompt used for the unconditional branch - confirm
+
+  trainer:
+    benchmark: True
+    val_check_interval: 5000000  # very large interval; presumably meant to effectively skip validation (original note: "really sorry")
+    num_sanity_val_steps: 0  # 0 skips the sanity validation pass before training
+    accumulate_grad_batches: 2  # gradients are accumulated over 2 batches per optimizer step
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr-inference.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr-inference.yaml
+model:  # model-only config (this file has no data/lightning sections): UNet + f16 KL autoencoder + frozen CLIP text encoder
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.001
+    linear_end: 0.015
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 48  # = 768 / 16 (first stage has 4 downsamplings, see ch_mult below)
+    channels: 16  # same value as first-stage z_channels / embed_dim
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.22765929   # magic number
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]  # f_min == f_max; presumably the LR multiplier stays constant after warm-up - confirm
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 48
+        in_channels: 16
+        out_channels: 16
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width - confirm against the encoder
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:  # NOTE(review): no ckpt_path here; first-stage weights presumably come from a full-model checkpoint - confirm
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 16
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [ 16 ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity  # Identity module: no autoencoder loss is configured here
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder  # no params given: the embedder's defaults apply
\ No newline at end of file
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml
+model:  # latent diffusion at 768 px: UNet denoiser + f16 KL autoencoder (16 latent channels) + frozen CLIP text encoder
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.001
+    linear_end: 0.015
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"  # same key as image_key in the data section
+    cond_stage_key: "txt"
+    image_size: 48  # = 768 px crop / 16 (first stage has 4 downsamplings, see ch_mult below)
+    channels: 16  # same value as first-stage z_channels / embed_dim
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.22765929   # magic number
+  
+    ckpt_path: "/home/mchorse/stable-diffusion-ckpts/768f16-2022-06-23-pruned.ckpt"  # NOTE(review): machine-specific absolute path; adjust per environment
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]  # f_min == f_max; presumably the LR multiplier stays constant after warm-up - confirm
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 48    # not really needed
+        in_channels: 16
+        out_channels: 16
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width - confirm against the encoder
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:  # NOTE(review): no ckpt_path here; first-stage weights presumably come from the model ckpt_path above - confirm
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 16
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [ 16 ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity  # Identity module: no autoencoder loss is configured here
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder  # no params given: the embedder's defaults apply
+
+
+data:  # laion-high-resolution tar shards read through an "aws s3 cp" pipe; 768 px square crops
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"  # presumably prepended to each shards pattern - confirm
+    batch_size: 6
+    num_workers: 4
+    multinode: True
+    train:
+      shards: '{00000..17279}.tar -'  # 17280 train shards; the trailing "-" presumably sends aws s3 cp output to stdout
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 768  # int size: the shorter image edge is resized to 768
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 768
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{17280..17535}.tar -'  # 256 shards, disjoint from the train range
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 768
+          interpolation: 3  # 3 = bicubic (PIL resampling constant)
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 768
+
+
+lightning:  # Lightning settings: checkpointing, image-logging callback and trainer flags
+  find_unused_parameters: False
+
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 5000  # same cadence as image_logger batch_frequency below
+
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4  # same value as max_images above
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]  # presumably the empty prompt used for the unconditional branch - confirm
+
+  trainer:
+    benchmark: True
+    val_check_interval: 5000000  # very large interval; presumably meant to effectively skip validation - confirm
+    num_sanity_val_steps: 0  # 0 skips the sanity validation pass before training
+    accumulate_grad_batches: 2  # gradients are accumulated over 2 batches per optimizer step
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768.yaml
+model:  # latent diffusion at 768 px: UNet denoiser + f16 KL autoencoder (16 latent channels) + frozen CLIP text encoder
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.001
+    linear_end: 0.015
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"  # same key as image_key in the data section
+    cond_stage_key: "txt"
+    image_size: 48  # = 768 px crop / 16 (first stage has 4 downsamplings, see ch_mult below)
+    channels: 16  # same value as first-stage z_channels / embed_dim
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.22765929   # magic number
+  
+    ckpt_path: "/home/mchorse/stable-diffusion-ckpts/256f16-2022-06-15-216k-pruned.ckpt"  # NOTE(review): machine-specific absolute path; adjust per environment
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]  # f_min == f_max; presumably the LR multiplier stays constant after warm-up - confirm
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 48    # not really needed
+        in_channels: 16
+        out_channels: 16
+        model_channels: 320    # TODO: scale model here
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768  # presumably the FrozenCLIPEmbedder output width - confirm against the encoder
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:  # NOTE(review): no ckpt_path here; first-stage weights presumably come from the model ckpt_path above - confirm
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 16
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [ 16 ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity  # Identity module: no autoencoder loss is configured here
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder  # no params given: the embedder's defaults apply
+
+
+data:
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"
+    batch_size: 6
+    num_workers: 4
+    multinode: True
+    min_size: 384   # TODO: experiment. Note: for 2B, images are stored at max 384 resolution
+    train:
+      shards: '{000000..231317}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 768
+          interpolation: 3
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 768
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{231318..231349}.tar -'
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 768
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 768
+
+
+lightning:
+  find_unused_parameters: False
+
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+
+  trainer:
+    benchmark: True
+    val_check_interval: 5000000
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-t5-encoder-dev.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-t5-encoder-dev.yaml
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 32
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 2048
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenT5Embedder
+      params:
+        version: "google/t5-v1_1-xl"
+
+
+data:
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"
+    batch_size: 40
+    num_workers: 4
+    multinode: False
+    train:
+      shards: '{000000..231317}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{231318..231349}.tar -'
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5000
+        max_images: 8
+        increase_log_steps: False
+        log_first_step: False
+
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 50000
+    num_sanity_val_steps: 0
+
+
+
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-upscale-clip-encoder-f16-1024-dev.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-upscale-clip-encoder-f16-1024-dev.yaml
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentUpscaleDiffusion
+  params:
+    low_scale_key: "LR_image" # TODO: adapt
+    linear_start: 0.001
+    linear_end: 0.015
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "image"
+    #first_stage_key: "jpg"  # TODO: use this later
+    cond_stage_key: "caption"
+    #cond_stage_key: "txt" # TODO: use this later
+    image_size: 64
+    channels: 16
+    cond_stage_trainable: false
+    conditioning_key: "hybrid-adm"
+    monitor: val/loss_simple_ema
+    scale_factor: 0.22765929   # magic number
+
+    low_scale_config:
+      target: ldm.modules.encoders.modules.LowScaleEncoder
+      params:
+        scale_factor: 0.18215
+        linear_start: 0.00085
+        linear_end: 0.0120
+        timesteps: 1000
+        max_noise_level: 100
+        output_size: 64
+        model_config:
+          target: ldm.models.autoencoder.AutoencoderKL
+          params:
+            embed_dim: 4
+            monitor: val/rec_loss
+            ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"
+            ddconfig:
+              double_z: true
+              z_channels: 4
+              resolution: 256
+              in_channels: 3
+              out_ch: 3
+              ch: 128
+              ch_mult:
+                - 1
+                - 2
+                - 4
+                - 4
+              num_res_blocks: 2
+              attn_resolutions: [ ]
+              dropout: 0.0
+            lossconfig:
+              target: torch.nn.Identity
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        num_classes: 1000  # timesteps for noise conditioning
+        image_size: 64    # not really needed
+        in_channels: 20
+        out_channels: 16
+        model_channels: 32 # TODO: more
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 16
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f16/model.ckpt"
+        ddconfig:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [ 16 ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+#data:
+#  target: ldm.data.laion.WebDataModuleFromConfig
+#  params:
+#    tar_base: "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/"
+#    batch_size: 4
+#    num_workers: 4
+#    multinode: True
+#    min_size: 256   # TODO: experiment. Note: for 2B, images are stored at max 384 resolution
+#    train:
+#      shards: '{000000..231317}.tar -'
+#      shuffle: 10000
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 1024
+#          interpolation: 3
+#      - target: torchvision.transforms.RandomCrop
+#        params:
+#          size: 1024
+#
+#    # NOTE use enough shards to avoid empty validation loops in workers
+#    validation:
+#      shards: '{231318..231349}.tar -'
+#      shuffle: 0
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 1024
+#          interpolation: 3
+#      - target: torchvision.transforms.CenterCrop
+#        params:
+#          size: 1024
+
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 8
+    num_workers: 7
+    wrap: false
+    train:
+      target: ldm.data.imagenet.ImageNetSRTrain
+      params:
+        size: 1024
+        downscale_f: 4
+        degradation: "cv_nearest"
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 10
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          sample: False
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          #unconditional_guidance_scale: 3.0
+          #unconditional_guidance_label: [""]
+
+  trainer:
+    benchmark: True
+    # val_check_interval: 5000000  # really sorry # TODO: bring back in
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 1
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-upscale-clip-encoder-f16-1024.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-upscale-clip-encoder-f16-1024.yaml
+model:
+  base_learning_rate: 5.0e-05
+  target: ldm.models.diffusion.ddpm.LatentUpscaleDiffusion
+  params:
+    low_scale_key: "lr"
+    linear_start: 0.001
+    linear_end: 0.015
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 16
+    cond_stage_trainable: false
+    conditioning_key: "hybrid-adm"
+    monitor: val/loss_simple_ema
+    scale_factor: 0.22765929   # magic number
+
+    low_scale_config:
+      target: ldm.modules.encoders.modules.LowScaleEncoder
+      params:
+        scale_factor: 0.18215
+        linear_start: 0.00085
+        linear_end: 0.0120
+        timesteps: 1000
+        max_noise_level: 100
+        output_size: 64
+        model_config:
+          target: ldm.models.autoencoder.AutoencoderKL
+          params:
+            embed_dim: 4
+            monitor: val/rec_loss
+            ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"
+            ddconfig:
+              double_z: true
+              z_channels: 4
+              resolution: 256
+              in_channels: 3
+              out_ch: 3
+              ch: 128
+              ch_mult:
+                - 1
+                - 2
+                - 4
+                - 4
+              num_res_blocks: 2
+              attn_resolutions: [ ]
+              dropout: 0.0
+            lossconfig:
+              target: torch.nn.Identity
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        num_classes: 1000  # timesteps for noise conditioning
+        image_size: 64    # not really needed
+        in_channels: 20
+        out_channels: 16
+        model_channels: 96
+        attention_resolutions: [ 8, 4, 2 ] # -> at 32, 16, 8
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 8, 8 ]
+        # -> res, ds: (64, 1), (32, 2), (16, 4), (8, 8), (4, 16)
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 16
+        monitor: val/rec_loss
+        ckpt_path: "models/first_stage_models/kl-f16/model.ckpt"
+        ddconfig:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [ 16 ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data:
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
+    batch_size: 10
+    num_workers: 4
+    train:
+      shards: '{00000..17279}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 1024
+          interpolation: 3
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 1024
+      # Post-processing step applied to training samples.
+      # NOTE(review): presumably AddLR produces the low-resolution entry read
+      # via low_scale_key ("lr") -- confirm against ldm.data.laion.AddLR.
+      postprocess:
+        target: ldm.data.laion.AddLR
+        params:
+          factor: 4
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{17280..17535}.tar -'
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 1024
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 1024
+      # Post-processing step applied to validation samples.
+      # NOTE(review): presumably AddLR produces the low-resolution entry read
+      # via low_scale_key ("lr") -- confirm against ldm.data.laion.AddLR.
+      postprocess:
+        target: ldm.data.laion.AddLR
+        params:
+          factor: 4
+
+lightning:
+  find_unused_parameters: False
+
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 1000
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+
+  trainer:
+    benchmark: True
+    val_check_interval: 5000000  # really sorry
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 4
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-256-dev.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-256-dev.yaml
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 32
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 384
+        attention_resolutions: [ 8, 4, 2, 1 ]
+        num_res_blocks: [ 2, 2, 2, 2 ]
+        channel_mult: [ 1, 2, 4, 4 ]
+        disable_self_attentions: [ False, False, False, False ]  # converts the self-attention to a cross-attention layer if true
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data:  # TODO
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 8
+    num_workers: 4
+    wrap: false
+    train:
+      target: ldm.data.dummy.DummyData
+      params:
+        length: 20000
+        size: [256, 256, 3]
+    validation:
+      target: ldm.data.dummy.DummyData
+      params:
+        length: 10000
+        size: [256, 256, 3]
+
+#data:
+#  target: ldm.data.laion.WebDataModuleFromConfig
+#  params:
+#    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
+#    batch_size: 4
+#    num_workers: 4
+#    multinode: True
+#    train:
+#      shards: '{00000..17279}.tar -'
+#      shuffle: 10000
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 512
+#          interpolation: 3
+#      - target: torchvision.transforms.RandomCrop
+#        params:
+#          size: 512
+#
+#    # NOTE use enough shards to avoid empty validation loops in workers
+#    validation:
+#      shards: '{17280..17535}.tar -'
+#      shuffle: 0
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 512
+#          interpolation: 3
+#      - target: torchvision.transforms.CenterCrop
+#        params:
+#          size: 512
+
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5  # TODO
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 200  # TODO: 5000000 # really sorry
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-256.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-256.yaml
+model:
+  base_learning_rate: 8.e-05
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 32
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 416
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: [ 2, 2, 2, 2 ]
+        channel_mult: [ 1, 2, 4, 4 ]
+        disable_self_attentions: [ False, False, False, False ]  # converts the self-attention to a cross-attention layer if true
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: "/fsx/stable-diffusion/stable-diffusion/models/first_stage_models/kl-f8/model.ckpt"
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data:
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "__improvedaesthetic__"
+    batch_size: 8
+    num_workers: 4
+    multinode: True
+    train:
+      shards: '{00000..17279}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 256
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{17280..17535}.tar -'
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 256
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 256
+
+
+lightning:
+  find_unused_parameters: false
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 5000
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        disabled: True
+        batch_frequency: 2500  
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 5000000 # really sorry
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 1
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-512-dev.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-512-dev.yaml
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 64 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 384
+        attention_resolutions: [ 8, 4, 2, 1 ]
+        num_res_blocks: [ 2, 2, 2, 2 ]
+        channel_mult: [ 1, 2, 4, 4 ]
+        disable_self_attentions: [ False, False, False, False ]  # converts the self-attention to a cross-attention layer if true
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data:  # TODO
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 1
+    num_workers: 4
+    wrap: false
+    train:
+      target: ldm.data.dummy.DummyData
+      params:
+        length: 20000
+        size: [512, 512, 3]
+    validation:
+      target: ldm.data.dummy.DummyData
+      params:
+        length: 10000
+        size: [512, 512, 3]
+
+#data:
+#  target: ldm.data.laion.WebDataModuleFromConfig
+#  params:
+#    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
+#    batch_size: 4
+#    num_workers: 4
+#    multinode: True
+#    train:
+#      shards: '{00000..17279}.tar -'
+#      shuffle: 10000
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 512
+#          interpolation: 3
+#      - target: torchvision.transforms.RandomCrop
+#        params:
+#          size: 512
+#
+#    # NOTE use enough shards to avoid empty validation loops in workers
+#    validation:
+#      shards: '{17280..17535}.tar -'
+#      shuffle: 0
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 512
+#          interpolation: 3
+#      - target: torchvision.transforms.CenterCrop
+#        params:
+#          size: 512
+
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 5  # TODO
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]
+
+  trainer:
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 1000 # TODO: 5000000 # really sorry
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-512.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-512.yaml
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    # Latent side length: 512 px crops / 8x first-stage downsampling
+    # (ch_mult has 4 levels) = 64. Was 32, the value used by the 256 px
+    # configs. NOTE(review): confirm this only affects the shape of
+    # sampled/logged images before relying on it.
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 416
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: [ 2, 2, 2, 2 ]
+        channel_mult: [ 1, 2, 4, 4 ]
+        disable_self_attentions: [ False, False, False, False ]  # converts the self-attention to a cross-attention layer if true
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+
+data:
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "__improvedaesthetic__"
+    batch_size: 1
+    num_workers: 4
+    multinode: True
+    train:
+      shards: '{00000..17279}.tar -'
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 512
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{17280..17535}.tar -'
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 512
+
+
+lightning:  # checkpointing, callbacks and trainer settings
+  find_unused_parameters: false
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 5000
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 2500  # NOTE(review): presumably the image-logging interval in batches — confirm in main.ImageLogger
+        max_images: 2
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 2  # same value as max_images above
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]  # single empty-string label
+
+  trainer:  # NOTE(review): presumably forwarded to the PyTorch Lightning Trainer — confirm in main.py
+    #replace_sampler_ddp: False
+    benchmark: True
+    val_check_interval: 5000000 # really sorry — NOTE(review): presumably set this high so validation effectively never runs; confirm
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/stable-diffusion/upscaling/upscale-v1-with-f16.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/upscaling/upscale-v1-with-f16.yaml
+model:  # latent upscaling diffusion model (see target below)
+  base_learning_rate: 5.0e-05
+  target: ldm.models.diffusion.ddpm.LatentUpscaleDiffusion
+  params:
+    low_scale_key: "lr"  # NOTE(review): presumably the batch key produced by the AddLR postprocess in the data section — confirm
+    linear_start: 0.001
+    linear_end: 0.015
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"  # same key as image_key in the data section
+    cond_stage_key: "txt"
+    image_size: 32  # NOTE(review): presumably 512 px crops at 16x downsampling — confirm
+    channels: 16  # equals z_channels / embed_dim in first_stage_config below
+    cond_stage_trainable: false
+    conditioning_key: "hybrid-adm"
+    monitor: val/loss_simple_ema
+    scale_factor: 0.22765929   # magic number
+
+    low_scale_config:  # low-scale encoder; its model_config is an AutoencoderKL loaded from the kl-f8 checkpoint
+      target: ldm.modules.encoders.modules.LowScaleEncoder
+      params:
+        scale_factor: 0.18215
+        linear_start: 0.00085
+        linear_end: 0.0120
+        timesteps: 1000
+        max_noise_level: 250  # cf. num_classes: 251 in unet_config below
+        output_size: null
+        model_config:
+          target: ldm.models.autoencoder.AutoencoderKL
+          params:
+            embed_dim: 4
+            monitor: val/rec_loss
+            ckpt_path: "/fsx/stable-diffusion/stable-diffusion/models/first_stage_models/kl-f8/model.ckpt"  # absolute path; NOTE(review): environment-specific
+            ddconfig:
+              double_z: true
+              z_channels: 4
+              resolution: 256
+              in_channels: 3
+              out_ch: 3
+              ch: 128
+              ch_mult:
+                - 1
+                - 2
+                - 4
+                - 4
+              num_res_blocks: 2
+              attn_resolutions: [ ]
+              dropout: 0.0
+            lossconfig:
+              target: torch.nn.Identity
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        num_classes: 251  # timesteps for noise conditioning
+        image_size: 64    # not really needed
+        in_channels: 20  # NOTE(review): presumably 16 latent channels + 4 low-scale latent channels — confirm
+        out_channels: 16  # same value as channels above
+        model_channels: 128
+        attention_resolutions: [ 8, 4, 2 ] # -> at 32, 16, 8
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 6, 8 ]
+        # -> res, ds: (64, 1), (32, 2), (16, 4), (8, 8), (4, 16)
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:  # first-stage autoencoder loaded from the kl-f16 checkpoint (see ckpt_path)
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 16
+        monitor: val/rec_loss
+        ckpt_path: "/fsx/stable-diffusion/stable-diffusion/models/first_stage_models/kl-f16/model.ckpt"  # absolute path; NOTE(review): environment-specific
+        ddconfig:
+          double_z: True
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [ 1,1,2,2,4 ]  # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [ 16 ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:  # conditioning-stage encoder; only a target is given, no params
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+#data:   # TODO: finetune here later
+#  target: ldm.data.laion.WebDataModuleFromConfig
+#  params:
+#    tar_base: "pipe:aws s3 cp s3://s-datasets/laion-high-resolution/"
+#    batch_size: 10
+#    num_workers: 4
+#    train:
+#      shards: '{00000..17279}.tar -'
+#      shuffle: 10000
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 1024
+#          interpolation: 3
+#      - target: torchvision.transforms.RandomCrop
+#        params:
+#          size: 1024
+#      postprocess:
+#              target: ldm.data.laion.AddLR
+#              params:
+#                factor: 2
+#
+#    # NOTE use enough shards to avoid empty validation loops in workers
+#    validation:
+#      shards: '{17280..17535}.tar -'
+#      shuffle: 0
+#      image_key: jpg
+#      image_transforms:
+#      - target: torchvision.transforms.Resize
+#        params:
+#          size: 1024
+#          interpolation: 3
+#      - target: torchvision.transforms.CenterCrop
+#        params:
+#          size: 1024
+#      postprocess:
+#              target: ldm.data.laion.AddLR
+#              params:
+#                factor: 2
+
+data:  # active data config (the commented-out section above is an alternative kept for later)
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: "__improvedaesthetic__"  # NOTE(review): looks like a placeholder resolved elsewhere — confirm
+    batch_size: 28
+    num_workers: 4
+    multinode: True
+    min_size: 512  # same value as the Resize/crop size used below
+    train:
+      shards: '{00000..17279}.tar -'  # shard indices 00000 through 17279
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3  # NOTE(review): presumably PIL's BICUBIC resampling code (3) — confirm against torchvision
+      - target: torchvision.transforms.RandomCrop  # random 512 crop for training
+        params:
+          size: 512
+      postprocess:
+        target: ldm.data.laion.AddLR  # NOTE(review): presumably adds the low-res input read via the model's low_scale_key — confirm
+        params:
+          factor: 2
+
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards: '{17280..17535}.tar -'  # 256 shards, starting right after the train range ends
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3  # same interpolation code as the train split
+      - target: torchvision.transforms.CenterCrop  # center crop (not random) for validation
+        params:
+          size: 512
+      postprocess:
+        target: ldm.data.laion.AddLR  # same postprocess and factor as the train split
+        params:
+          factor: 2
+
+
+lightning:  # callbacks and trainer settings
+  find_unused_parameters: False
+
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 1000  # NOTE(review): presumably the image-logging interval in batches — confirm in main.ImageLogger
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4  # same value as max_images above
+          unconditional_guidance_scale: 3.0
+          unconditional_guidance_label: [""]  # single empty-string label
+
+  trainer:  # NOTE(review): presumably forwarded to the PyTorch Lightning Trainer — confirm in main.py
+    benchmark: True
+    val_check_interval: 5000000  # really sorry — NOTE(review): presumably set this high so validation effectively never runs; confirm
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 2
--- a/stable-diffusion-private/configs/stable-diffusion/v1_improvedaesthetics.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/v1_improvedaesthetics.yaml
--- a/stable-diffusion-private/configs/stable-diffusion/v1_laionhr.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/v1_laionhr.yaml
--- a/stable-diffusion-private/configs/stable-diffusion/v2_laionhr1024.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/v2_laionhr1024.yaml
--- a/stable-diffusion-private/configs/stable-diffusion/v2_laionhr1024_2.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/v2_laionhr1024_2.yaml
--- a/stable-diffusion-private/configs/stable-diffusion/v3_pretraining.yaml
+++ b/stable-diffusion-private/configs/stable-diffusion/v3_pretraining.yaml
--- a/stable-diffusion-private/data/DejaVuSans.ttf
+++ b/stable-diffusion-private/data/DejaVuSans.ttf
--- a/stable-diffusion-private/data/captions_val2014.json
+++ b/stable-diffusion-private/data/captions_val2014.json
--- a/stable-diffusion-private/data/example_conditioning/superresolution/sample_0.jpg
+++ b/stable-diffusion-private/data/example_conditioning/superresolution/sample_0.jpg
--- a/stable-diffusion-private/data/example_conditioning/text_conditional/sample_0.txt
+++ b/stable-diffusion-private/data/example_conditioning/text_conditional/sample_0.txt
+A basket of cerries
--- a/stable-diffusion-private/data/imagenet_clsidx_to_label.txt
+++ b/stable-diffusion-private/data/imagenet_clsidx_to_label.txt
--- a/stable-diffusion-private/data/imagenet_train_hr_indices.p
+++ b/stable-diffusion-private/data/imagenet_train_hr_indices.p
--- a/stable-diffusion-private/data/imagenet_val_hr_indices.p
+++ b/stable-diffusion-private/data/imagenet_val_hr_indices.p
--- a/stable-diffusion-private/data/index_synset.yaml
+++ b/stable-diffusion-private/data/index_synset.yaml
--- a/stable-diffusion-private/data/inpainting_examples/6458524847_2f4c361183_k.png
+++ b/stable-diffusion-private/data/inpainting_examples/6458524847_2f4c361183_k.png
--- a/stable-diffusion-private/data/inpainting_examples/6458524847_2f4c361183_k_mask.png
+++ b/stable-diffusion-private/data/inpainting_examples/6458524847_2f4c361183_k_mask.png
--- a/stable-diffusion-private/data/inpainting_examples/8399166846_f6fb4e4b8e_k.png
+++ b/stable-diffusion-private/data/inpainting_examples/8399166846_f6fb4e4b8e_k.png
--- a/stable-diffusion-private/data/inpainting_examples/8399166846_f6fb4e4b8e_k_mask.png
+++ b/stable-diffusion-private/data/inpainting_examples/8399166846_f6fb4e4b8e_k_mask.png
--- a/stable-diffusion-private/data/inpainting_examples/alex-iby-G_Pk4D9rMLs.png
+++ b/stable-diffusion-private/data/inpainting_examples/alex-iby-G_Pk4D9rMLs.png
--- a/stable-diffusion-private/data/inpainting_examples/alex-iby-G_Pk4D9rMLs_mask.png
+++ b/stable-diffusion-private/data/inpainting_examples/alex-iby-G_Pk4D9rMLs_mask.png
--- a/stable-diffusion-private/data/inpainting_examples/bench2.png
+++ b/stable-diffusion-private/data/inpainting_examples/bench2.png
--- a/stable-diffusion-private/data/inpainting_examples/bench2_mask.png
+++ b/stable-diffusion-private/data/inpainting_examples/bench2_mask.png
--- a/stable-diffusion-private/data/inpainting_examples/bertrand-gabioud-CpuFzIsHYJ0.png
+++ b/stable-diffusion-private/data/inpainting_examples/bertrand-gabioud-CpuFzIsHYJ0.png
--- a/stable-diffusion-private/data/inpainting_examples/bertrand-gabioud-CpuFzIsHYJ0_mask.png
+++ b/stable-diffusion-private/data/inpainting_examples/bertrand-gabioud-CpuFzIsHYJ0_mask.png
--- a/stable-diffusion-private/data/inpainting_examples/billow926-12-Wc-Zgx6Y.png
+++ b/stable-diffusion-private/data/inpainting_examples/billow926-12-Wc-Zgx6Y.png
--- a/stable-diffusion-private/data/inpainting_examples/billow926-12-Wc-Zgx6Y_mask.png
+++ b/stable-diffusion-private/data/inpainting_examples/billow926-12-Wc-Zgx6Y_mask.png
--- a/stable-diffusion-private/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png
+++ b/stable-diffusion-private/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png
--- a/stable-diffusion-private/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png
+++ b/stable-diffusion-private/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png
--- a/stable-diffusion-private/data/inpainting_examples/photo-1583445095369-9c651e7e5d34.png
+++ b/stable-diffusion-private/data/inpainting_examples/photo-1583445095369-9c651e7e5d34.png
--- a/stable-diffusion-private/data/inpainting_examples/photo-1583445095369-9c651e7e5d34_mask.png
+++ b/stable-diffusion-private/data/inpainting_examples/photo-1583445095369-9c651e7e5d34_mask.png
--- a/stable-diffusion-private/environment.yaml
+++ b/stable-diffusion-private/environment.yaml
--- a/stable-diffusion-private/finetune.py
+++ b/stable-diffusion-private/finetune.py
--- a/stable-diffusion-private/inference.py
+++ b/stable-diffusion-private/inference.py
--- a/stable-diffusion-private/ldm/data/__init__.py
+++ b/stable-diffusion-private/ldm/data/__init__.py
--- a/stable-diffusion-private/ldm/data/base.py
+++ b/stable-diffusion-private/ldm/data/base.py
--- a/stable-diffusion-private/ldm/data/coco.py
+++ b/stable-diffusion-private/ldm/data/coco.py
--- a/stable-diffusion-private/ldm/data/dummy.py
+++ b/stable-diffusion-private/ldm/data/dummy.py
--- a/stable-diffusion-private/ldm/data/imagenet.py
+++ b/stable-diffusion-private/ldm/data/imagenet.py
--- a/stable-diffusion-private/ldm/data/inpainting/__init__.py
+++ b/stable-diffusion-private/ldm/data/inpainting/__init__.py
--- a/stable-diffusion-private/ldm/data/inpainting/synthetic_mask.py
+++ b/stable-diffusion-private/ldm/data/inpainting/synthetic_mask.py
--- a/stable-diffusion-private/ldm/data/laion.py
+++ b/stable-diffusion-private/ldm/data/laion.py
--- a/stable-diffusion-private/ldm/data/lsun.py
+++ b/stable-diffusion-private/ldm/data/lsun.py
--- a/stable-diffusion-private/ldm/lr_scheduler.py
+++ b/stable-diffusion-private/ldm/lr_scheduler.py
--- a/stable-diffusion-private/ldm/models/autoencoder.py
+++ b/stable-diffusion-private/ldm/models/autoencoder.py
--- a/stable-diffusion-private/ldm/models/diffusion/__init__.py
+++ b/stable-diffusion-private/ldm/models/diffusion/__init__.py
--- a/stable-diffusion-private/ldm/models/diffusion/classifier.py
+++ b/stable-diffusion-private/ldm/models/diffusion/classifier.py
--- a/stable-diffusion-private/ldm/models/diffusion/ddim.py
+++ b/stable-diffusion-private/ldm/models/diffusion/ddim.py
--- a/stable-diffusion-private/ldm/models/diffusion/ddpm.py
+++ b/stable-diffusion-private/ldm/models/diffusion/ddpm.py
--- a/stable-diffusion-private/ldm/models/diffusion/plms.py
+++ b/stable-diffusion-private/ldm/models/diffusion/plms.py
--- a/stable-diffusion-private/ldm/models/diffusion/sampling_util.py
+++ b/stable-diffusion-private/ldm/models/diffusion/sampling_util.py
--- a/stable-diffusion-private/ldm/modules/attention.py
+++ b/stable-diffusion-private/ldm/modules/attention.py
--- a/stable-diffusion-private/ldm/modules/diffusionmodules/__init__.py
+++ b/stable-diffusion-private/ldm/modules/diffusionmodules/__init__.py
--- a/stable-diffusion-private/ldm/modules/diffusionmodules/model.py
+++ b/stable-diffusion-private/ldm/modules/diffusionmodules/model.py
--- a/stable-diffusion-private/ldm/modules/diffusionmodules/openaimodel.py
+++ b/stable-diffusion-private/ldm/modules/diffusionmodules/openaimodel.py
--- a/stable-diffusion-private/ldm/modules/diffusionmodules/util.py
+++ b/stable-diffusion-private/ldm/modules/diffusionmodules/util.py
--- a/stable-diffusion-private/ldm/modules/distributions/__init__.py
+++ b/stable-diffusion-private/ldm/modules/distributions/__init__.py
--- a/stable-diffusion-private/ldm/modules/distributions/distributions.py
+++ b/stable-diffusion-private/ldm/modules/distributions/distributions.py
--- a/stable-diffusion-private/ldm/modules/ema.py
+++ b/stable-diffusion-private/ldm/modules/ema.py
--- a/stable-diffusion-private/ldm/modules/encoders/__init__.py
+++ b/stable-diffusion-private/ldm/modules/encoders/__init__.py
--- a/stable-diffusion-private/ldm/modules/encoders/modules.py
+++ b/stable-diffusion-private/ldm/modules/encoders/modules.py
--- a/stable-diffusion-private/ldm/modules/evaluate/adm_evaluator.py
+++ b/stable-diffusion-private/ldm/modules/evaluate/adm_evaluator.py
--- a/stable-diffusion-private/ldm/modules/evaluate/evaluate_perceptualsim.py
+++ b/stable-diffusion-private/ldm/modules/evaluate/evaluate_perceptualsim.py
--- a/stable-diffusion-private/ldm/modules/evaluate/frechet_video_distance.py
+++ b/stable-diffusion-private/ldm/modules/evaluate/frechet_video_distance.py
--- a/stable-diffusion-private/ldm/modules/evaluate/ssim.py
+++ b/stable-diffusion-private/ldm/modules/evaluate/ssim.py
--- a/stable-diffusion-private/ldm/modules/evaluate/torch_frechet_video_distance.py
+++ b/stable-diffusion-private/ldm/modules/evaluate/torch_frechet_video_distance.py
--- a/stable-diffusion-private/ldm/modules/image_degradation/__init__.py
+++ b/stable-diffusion-private/ldm/modules/image_degradation/__init__.py
--- a/stable-diffusion-private/ldm/modules/image_degradation/bsrgan.py
+++ b/stable-diffusion-private/ldm/modules/image_degradation/bsrgan.py
--- a/stable-diffusion-private/ldm/modules/image_degradation/bsrgan_light.py
+++ b/stable-diffusion-private/ldm/modules/image_degradation/bsrgan_light.py
--- a/stable-diffusion-private/ldm/modules/image_degradation/utils/test.png
+++ b/stable-diffusion-private/ldm/modules/image_degradation/utils/test.png
--- a/stable-diffusion-private/ldm/modules/image_degradation/utils_image.py
+++ b/stable-diffusion-private/ldm/modules/image_degradation/utils_image.py
--- a/stable-diffusion-private/ldm/modules/losses/__init__.py
+++ b/stable-diffusion-private/ldm/modules/losses/__init__.py
--- a/stable-diffusion-private/ldm/modules/losses/contperceptual.py
+++ b/stable-diffusion-private/ldm/modules/losses/contperceptual.py
--- a/stable-diffusion-private/ldm/modules/losses/vqperceptual.py
+++ b/stable-diffusion-private/ldm/modules/losses/vqperceptual.py
--- a/stable-diffusion-private/ldm/modules/x_transformer.py
+++ b/stable-diffusion-private/ldm/modules/x_transformer.py
--- a/stable-diffusion-private/ldm/util.py
+++ b/stable-diffusion-private/ldm/util.py
--- a/stable-diffusion-private/main.py
+++ b/stable-diffusion-private/main.py
--- a/stable-diffusion-private/models/first_stage_models/kl-f16/config.yaml
+++ b/stable-diffusion-private/models/first_stage_models/kl-f16/config.yaml
--- a/stable-diffusion-private/models/first_stage_models/kl-f32/config.yaml
+++ b/stable-diffusion-private/models/first_stage_models/kl-f32/config.yaml
--- a/stable-diffusion-private/models/first_stage_models/kl-f4/config.yaml
+++ b/stable-diffusion-private/models/first_stage_models/kl-f4/config.yaml
--- a/stable-diffusion-private/models/first_stage_models/kl-f8/config.yaml
+++ b/stable-diffusion-private/models/first_stage_models/kl-f8/config.yaml
--- a/stable-diffusion-private/models/first_stage_models/vq-f16/config.yaml
+++ b/stable-diffusion-private/models/first_stage_models/vq-f16/config.yaml
--- a/stable-diffusion-private/models/first_stage_models/vq-f4-noattn/config.yaml
+++ b/stable-diffusion-private/models/first_stage_models/vq-f4-noattn/config.yaml
--- a/stable-diffusion-private/models/first_stage_models/vq-f4/config.yaml
+++ b/stable-diffusion-private/models/first_stage_models/vq-f4/config.yaml
--- a/stable-diffusion-private/models/first_stage_models/vq-f8-n256/config.yaml
+++ b/stable-diffusion-private/models/first_stage_models/vq-f8-n256/config.yaml
--- a/stable-diffusion-private/models/first_stage_models/vq-f8/config.yaml
+++ b/stable-diffusion-private/models/first_stage_models/vq-f8/config.yaml
--- a/stable-diffusion-private/models/ldm/bsr_sr/config.yaml
+++ b/stable-diffusion-private/models/ldm/bsr_sr/config.yaml
--- a/stable-diffusion-private/models/ldm/celeba256/config.yaml
+++ b/stable-diffusion-private/models/ldm/celeba256/config.yaml
--- a/stable-diffusion-private/models/ldm/cin256/config.yaml
+++ b/stable-diffusion-private/models/ldm/cin256/config.yaml
--- a/stable-diffusion-private/models/ldm/ffhq256/config.yaml
+++ b/stable-diffusion-private/models/ldm/ffhq256/config.yaml
--- a/stable-diffusion-private/models/ldm/inpainting_big/config.yaml
+++ b/stable-diffusion-private/models/ldm/inpainting_big/config.yaml
--- a/stable-diffusion-private/models/ldm/layout2img-openimages256/config.yaml
+++ b/stable-diffusion-private/models/ldm/layout2img-openimages256/config.yaml
--- a/stable-diffusion-private/models/ldm/lsun_beds256/config.yaml
+++ b/stable-diffusion-private/models/ldm/lsun_beds256/config.yaml
--- a/stable-diffusion-private/models/ldm/lsun_churches256/config.yaml
+++ b/stable-diffusion-private/models/ldm/lsun_churches256/config.yaml
--- a/stable-diffusion-private/models/ldm/semantic_synthesis256/config.yaml
+++ b/stable-diffusion-private/models/ldm/semantic_synthesis256/config.yaml
--- a/stable-diffusion-private/models/ldm/semantic_synthesis512/config.yaml
+++ b/stable-diffusion-private/models/ldm/semantic_synthesis512/config.yaml
--- a/stable-diffusion-private/models/ldm/text2img256/config.yaml
+++ b/stable-diffusion-private/models/ldm/text2img256/config.yaml
--- a/stable-diffusion-private/module_train.py
+++ b/stable-diffusion-private/module_train.py
--- a/stable-diffusion-private/notebook_helpers.py
+++ b/stable-diffusion-private/notebook_helpers.py
--- a/stable-diffusion-private/requirements.txt
+++ b/stable-diffusion-private/requirements.txt
--- a/stable-diffusion-private/scripts/autoencoder-eval.py
+++ b/stable-diffusion-private/scripts/autoencoder-eval.py
--- a/stable-diffusion-private/scripts/checker.py
+++ b/stable-diffusion-private/scripts/checker.py
--- a/stable-diffusion-private/scripts/cmd_on_new_ckpt.py
+++ b/stable-diffusion-private/scripts/cmd_on_new_ckpt.py
--- a/stable-diffusion-private/scripts/demo/inpainting.py
+++ b/stable-diffusion-private/scripts/demo/inpainting.py
--- a/stable-diffusion-private/scripts/download_first_stages.sh
+++ b/stable-diffusion-private/scripts/download_first_stages.sh
--- a/stable-diffusion-private/scripts/download_models.sh
+++ b/stable-diffusion-private/scripts/download_models.sh
--- a/stable-diffusion-private/scripts/img2img.py
+++ b/stable-diffusion-private/scripts/img2img.py
--- a/stable-diffusion-private/scripts/inpaint.py
+++ b/stable-diffusion-private/scripts/inpaint.py
--- a/stable-diffusion-private/scripts/latent_imagenet_diffusion.ipynb
+++ b/stable-diffusion-private/scripts/latent_imagenet_diffusion.ipynb
--- a/stable-diffusion-private/scripts/logging_template.py
+++ b/stable-diffusion-private/scripts/logging_template.py
--- a/stable-diffusion-private/scripts/mnist-distributed.py
+++ b/stable-diffusion-private/scripts/mnist-distributed.py
--- a/stable-diffusion-private/scripts/printckpt.py
+++ b/stable-diffusion-private/scripts/printckpt.py
--- a/stable-diffusion-private/scripts/prompts/aesthetic-prompts-plain.txt
+++ b/stable-diffusion-private/scripts/prompts/aesthetic-prompts-plain.txt
--- a/stable-diffusion-private/scripts/prompts/aesthetic-prompts-surrealism.txt
+++ b/stable-diffusion-private/scripts/prompts/aesthetic-prompts-surrealism.txt
--- a/stable-diffusion-private/scripts/prompts/prompts-with-wings.txt
+++ b/stable-diffusion-private/scripts/prompts/prompts-with-wings.txt
--- a/stable-diffusion-private/scripts/prompts/six-prompts
+++ b/stable-diffusion-private/scripts/prompts/six-prompts
--- a/stable-diffusion-private/scripts/prompts/weird-dalle-prompts.txt
+++ b/stable-diffusion-private/scripts/prompts/weird-dalle-prompts.txt
--- a/stable-diffusion-private/scripts/prompts/wings1.txt
+++ b/stable-diffusion-private/scripts/prompts/wings1.txt
--- a/stable-diffusion-private/scripts/prompts/wings2.txt
+++ b/stable-diffusion-private/scripts/prompts/wings2.txt
--- a/stable-diffusion-private/scripts/prompts/wings3.txt
+++ b/stable-diffusion-private/scripts/prompts/wings3.txt
--- a/stable-diffusion-private/scripts/prompts/wings4.txt
+++ b/stable-diffusion-private/scripts/prompts/wings4.txt
--- a/stable-diffusion-private/scripts/prune-ckpt.py
+++ b/stable-diffusion-private/scripts/prune-ckpt.py
--- a/stable-diffusion-private/scripts/sample_diffusion.py
+++ b/stable-diffusion-private/scripts/sample_diffusion.py
--- a/stable-diffusion-private/scripts/slurm/README.md
+++ b/stable-diffusion-private/scripts/slurm/README.md
--- a/stable-diffusion-private/scripts/slurm/resume_512/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/resume_512/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/resume_512/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/resume_512/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/resume_512_improvedaesthetic/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/resume_512_improvedaesthetic/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/resume_512_improvedaesthetic/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/resume_768_hr/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/resume_768_hr/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/resume_768_hr/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/resume_768_hr/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v1-upscaling-f16-pretraining-512-aesthetics/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v1-upscaling-f16-pretraining-512-aesthetics/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v1-upscaling-f16-pretraining-512-aesthetics/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v1-upscaling-f16-pretraining-512-aesthetics/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v1_iahr_torch111/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_iahr_torch111/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v1_iahr_torch111/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_iahr_torch111/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v1_iahr_torch111_ucg/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_iahr_torch111_ucg/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v1_iahr_torch111_ucg/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_iahr_torch111_ucg/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v1_improvedaesthetics/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_improvedaesthetics/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v1_improvedaesthetics/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_improvedaesthetics/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v1_improvedaesthetics_torch111/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_improvedaesthetics_torch111/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v1_improvedaesthetics_torch111/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_improvedaesthetics_torch111/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v1_inpainting_aesthetics-larger-masks/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_inpainting_aesthetics-larger-masks/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v1_inpainting_aesthetics-larger-masks/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_inpainting_aesthetics-larger-masks/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v1_inpainting_improvedaesthetics_torch111/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_inpainting_improvedaesthetics_torch111/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v1_inpainting_improvedaesthetics_torch111/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_inpainting_improvedaesthetics_torch111/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v1_laionhr_torch111/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_laionhr_torch111/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v1_laionhr_torch111/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v1_laionhr_torch111/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v2_laionhr1024/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v2_laionhr1024/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v2_laionhr1024/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v2_laionhr1024/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v2_laionhr1024_2/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v2_laionhr1024_2/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v2_laionhr1024_2/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v2_laionhr1024_2/sbatch.sh
--- a/stable-diffusion-private/scripts/slurm/v3_pretraining/launcher.sh
+++ b/stable-diffusion-private/scripts/slurm/v3_pretraining/launcher.sh
--- a/stable-diffusion-private/scripts/slurm/v3_pretraining/sbatch.sh
+++ b/stable-diffusion-private/scripts/slurm/v3_pretraining/sbatch.sh
--- a/stable-diffusion-private/scripts/test_gpu.py
+++ b/stable-diffusion-private/scripts/test_gpu.py
--- a/stable-diffusion-private/scripts/test_gpu.sh
+++ b/stable-diffusion-private/scripts/test_gpu.sh
--- a/stable-diffusion-private/scripts/txt2img.py
+++ b/stable-diffusion-private/scripts/txt2img.py
--- a/stable-diffusion-private/scripts/vqgan_codebook_visualizer.py
+++ b/stable-diffusion-private/scripts/vqgan_codebook_visualizer.py
--- a/stable-diffusion-private/setup.py
+++ b/stable-diffusion-private/setup.py
--- a/stable-diffusion-private/trainer.py
+++ b/stable-diffusion-private/trainer.py
--- a/stable-diffusion-private/trainer_original.py
+++ b/stable-diffusion-private/trainer_original.py