Commit c19d0443 authored by AUTOMATIC1111

Merge branch 'release_candidate'

parents feee37d7 8b3d98c5
@@ -91,7 +91,7 @@ body:
id: logs
attributes:
label: Console logs
-description: Please provide **full** cmd/terminal logs from the moment you started UI to the end of it, after the bug occured. If it's very long, provide a link to pastebin or similar service.
+description: Please provide **full** cmd/terminal logs from the moment you started UI to the end of it, after the bug occurred. If it's very long, provide a link to pastebin or similar service.
render: Shell
validations:
required: true
...
@@ -2,6 +2,7 @@ __pycache__
*.ckpt
*.safetensors
*.pth
.DS_Store
/ESRGAN/*
/SwinIR/*
/repositories
@@ -39,3 +40,5 @@ notification.mp3
/.coverage*
/test/test_outputs
/cache
trace.json
/sysinfo-????-??-??-??-??.json
This diff is collapsed.
@@ -78,7 +78,7 @@ A web interface for Stable Diffusion, implemented using Gradio library.
- Clip skip
- Hypernetworks
- Loras (same as Hypernetworks but more pretty)
- A separate UI where you can choose, with preview, which embeddings, hypernetworks or Loras to add to your prompt
- Can select to load a different VAE from settings screen
- Estimated completion time in progress bar
- API
@@ -122,16 +122,38 @@ Alternatively, use online services (like Google Colab):
# Debian-based:
sudo apt install wget git python3 python3-venv libgl1 libglib2.0-0
# Red Hat-based:
sudo dnf install wget git python3 gperftools-libs libglvnd-glx
# openSUSE-based:
sudo zypper install wget git python3 libtcmalloc4 libglvnd
# Arch-based:
sudo pacman -S wget git python3
```
If your system is very new, you need to install python3.11 or python3.10:
```bash
# Ubuntu 24.04
sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update
sudo apt install python3.11
# Manjaro/Arch
sudo pacman -S yay
yay -S python311 # do not confuse with python3.11 package
# Only for 3.11
# Then set up env variable in launch script
export python_cmd="python3.11"
# or in webui-user.sh
python_cmd="python3.11"
```
2. Navigate to the directory you would like the webui to be installed and execute the following command:
```bash
wget -q https://raw.githubusercontent.com/AUTOMATIC1111/stable-diffusion-webui/master/webui.sh
```
Or just clone the repo wherever you want:
```bash
git clone https://github.com/AUTOMATIC1111/stable-diffusion-webui
```
3. Run `webui.sh`.
4. Check `webui-user.sh` for options.
### Installation on Apple Silicon
@@ -150,7 +172,7 @@ For the purposes of getting Google and other search engines to crawl the wiki, h
## Credits
Licenses for borrowed code can be found in `Settings -> Licenses` screen, and also in `html/licenses.html` file.
-- Stable Diffusion - https://github.com/Stability-AI/stablediffusion, https://github.com/CompVis/taming-transformers
+- Stable Diffusion - https://github.com/Stability-AI/stablediffusion, https://github.com/CompVis/taming-transformers, https://github.com/mcmonkey4eva/sd3-ref
- k-diffusion - https://github.com/crowsonkb/k-diffusion.git
- Spandrel - https://github.com/chaiNNer-org/spandrel implementing
- GFPGAN - https://github.com/TencentARC/GFPGAN.git
...
@@ -40,7 +40,7 @@ model:
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
-use_checkpoint: True
+use_checkpoint: False
legacy: False
first_stage_config:
...
@@ -41,7 +41,7 @@ model:
use_linear_in_transformer: True
transformer_depth: 1
context_dim: 1024
-use_checkpoint: True
+use_checkpoint: False
legacy: False
first_stage_config:
...
@@ -45,7 +45,7 @@ model:
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
-use_checkpoint: True
+use_checkpoint: False
legacy: False
first_stage_config:
...
model:
target: modules.models.sd3.sd3_model.SD3Inferencer
params:
shift: 3
state_dict: null
@@ -21,7 +21,7 @@ model:
params:
adm_in_channels: 2816
num_classes: sequential
-use_checkpoint: True
+use_checkpoint: False
in_channels: 9
out_channels: 4
model_channels: 320
...
@@ -40,7 +40,7 @@ model:
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
-use_checkpoint: True
+use_checkpoint: False
legacy: False
first_stage_config:
...
@@ -40,7 +40,7 @@ model:
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
-use_checkpoint: True
+use_checkpoint: False
legacy: False
first_stage_config:
...
@@ -572,7 +572,7 @@ class LatentDiffusionV1(DDPMV1):
:param h: height
:param w: width
:return: normalized distance to image border,
-wtith min distance = 0 at border and max dist = 0.5 at image center
+with min distance = 0 at border and max dist = 0.5 at image center
"""
lower_right_corner = torch.tensor([h - 1, w - 1]).view(1, 1, 2)
arr = self.meshgrid(h, w) / lower_right_corner
...
@@ -9,6 +9,8 @@ class ExtraNetworkLora(extra_networks.ExtraNetwork):
self.errors = {}
"""mapping of network names to the number of errors the network had during operation"""
remove_symbols = str.maketrans('', '', ":,")
def activate(self, p, params_list):
additional = shared.opts.sd_lora
@@ -43,22 +45,15 @@ class ExtraNetworkLora(extra_networks.ExtraNetwork):
networks.load_networks(names, te_multipliers, unet_multipliers, dyn_dims)
if shared.opts.lora_add_hashes_to_infotext:
-network_hashes = []
-for item in networks.loaded_networks:
-shorthash = item.network_on_disk.shorthash
-if not shorthash:
-continue
-alias = item.mentioned_name
-if not alias:
-continue
-alias = alias.replace(":", "").replace(",", "")
-network_hashes.append(f"{alias}: {shorthash}")
-if network_hashes:
-p.extra_generation_params["Lora hashes"] = ", ".join(network_hashes)
+if not getattr(p, "is_hr_pass", False) or not hasattr(p, "lora_hashes"):
+p.lora_hashes = {}
+for item in networks.loaded_networks:
+if item.network_on_disk.shorthash and item.mentioned_name:
+p.lora_hashes[item.mentioned_name.translate(self.remove_symbols)] = item.network_on_disk.shorthash
+if p.lora_hashes:
+p.extra_generation_params["Lora hashes"] = ', '.join(f'{k}: {v}' for k, v in p.lora_hashes.items())
def deactivate(self, p):
if self.errors:
...
@@ -7,6 +7,7 @@ import torch.nn as nn
import torch.nn.functional as F
from modules import sd_models, cache, errors, hashes, shared
import modules.models.sd3.mmdit
NetworkWeights = namedtuple('NetworkWeights', ['network_key', 'sd_key', 'w', 'sd_module'])
@@ -114,7 +115,10 @@ class NetworkModule:
self.sd_key = weights.sd_key
self.sd_module = weights.sd_module
-if hasattr(self.sd_module, 'weight'):
+if isinstance(self.sd_module, modules.models.sd3.mmdit.QkvLinear):
s = self.sd_module.weight.shape
self.shape = (s[0] // 3, s[1])
elif hasattr(self.sd_module, 'weight'):
self.shape = self.sd_module.weight.shape
elif isinstance(self.sd_module, nn.MultiheadAttention):
# For now, only self-attn use Pytorch's MHA
@@ -204,10 +208,12 @@ class NetworkModule:
if ex_bias is not None:
ex_bias = ex_bias * self.multiplier()
updown = updown * self.calc_scale()
if self.dora_scale is not None:
updown = self.apply_weight_decompose(updown, orig_weight)
-return updown * self.calc_scale() * self.multiplier(), ex_bias
+return updown * self.multiplier(), ex_bias
def calc_updown(self, target):
raise NotImplementedError()
...
import torch
import lyco_helpers
import modules.models.sd3.mmdit
import network
from modules import devices
@@ -10,6 +11,13 @@ class ModuleTypeLora(network.ModuleType):
if all(x in weights.w for x in ["lora_up.weight", "lora_down.weight"]):
return NetworkModuleLora(net, weights)
if all(x in weights.w for x in ["lora_A.weight", "lora_B.weight"]):
w = weights.w.copy()
weights.w.clear()
weights.w.update({"lora_up.weight": w["lora_B.weight"], "lora_down.weight": w["lora_A.weight"]})
return NetworkModuleLora(net, weights)
return None
@@ -29,7 +37,7 @@ class NetworkModuleLora(network.NetworkModule):
if weight is None and none_ok:
return None
-is_linear = type(self.sd_module) in [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear, torch.nn.MultiheadAttention]
+is_linear = type(self.sd_module) in [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear, torch.nn.MultiheadAttention, modules.models.sd3.mmdit.QkvLinear]
is_conv = type(self.sd_module) in [torch.nn.Conv2d]
if is_linear:
...
This diff is collapsed.
@@ -36,6 +36,7 @@ shared.options_templates.update(shared.options_section(('extra_networks', "Extra
"sd_lora": shared.OptionInfo("None", "Add network to prompt", gr.Dropdown, lambda: {"choices": ["None", *networks.available_networks]}, refresh=networks.list_available_networks),
"lora_preferred_name": shared.OptionInfo("Alias from file", "When adding to prompt, refer to Lora by", gr.Radio, {"choices": ["Alias from file", "Filename"]}),
"lora_add_hashes_to_infotext": shared.OptionInfo(True, "Add Lora hashes to infotext"),
"lora_bundled_ti_to_infotext": shared.OptionInfo(True, "Add Lora name as TI hashes for bundled Textual Inversion").info('"Add Textual Inversion hashes to infotext" needs to be enabled'),
"lora_show_all": shared.OptionInfo(False, "Always show all networks on the Lora page").info("otherwise, those detected as for incompatible version of Stable Diffusion will be hidden"), "lora_show_all": shared.OptionInfo(False, "Always show all networks on the Lora page").info("otherwise, those detected as for incompatible version of Stable Diffusion will be hidden"),
"lora_hide_unknown_for_versions": shared.OptionInfo([], "Hide networks of unknown versions for model versions", gr.CheckboxGroup, {"choices": ["SD1", "SD2", "SDXL"]}), "lora_hide_unknown_for_versions": shared.OptionInfo([], "Hide networks of unknown versions for model versions", gr.CheckboxGroup, {"choices": ["SD1", "SD2", "SDXL"]}),
"lora_in_memory_limit": shared.OptionInfo(0, "Number of Lora networks to keep cached in memory", gr.Number, {"precision": 0}), "lora_in_memory_limit": shared.OptionInfo(0, "Number of Lora networks to keep cached in memory", gr.Number, {"precision": 0}),
......
@@ -21,10 +21,12 @@ re_comma = re.compile(r" *, *")
def build_tags(metadata):
tags = {}
-for _, tags_dict in metadata.get("ss_tag_frequency", {}).items():
-for tag, tag_count in tags_dict.items():
-tag = tag.strip()
-tags[tag] = tags.get(tag, 0) + int(tag_count)
+ss_tag_frequency = metadata.get("ss_tag_frequency", {})
+if ss_tag_frequency is not None and hasattr(ss_tag_frequency, 'items'):
+for _, tags_dict in ss_tag_frequency.items():
+for tag, tag_count in tags_dict.items():
+tag = tag.strip()
+tags[tag] = tags.get(tag, 0) + int(tag_count)
if tags and is_non_comma_tagset(tags):
new_tags = {}
...
@@ -60,7 +60,7 @@ class ExtraNetworksPageLora(ui_extra_networks.ExtraNetworksPage):
else:
sd_version = lora_on_disk.sd_version
-if shared.opts.lora_show_all or not enable_filter:
+if shared.opts.lora_show_all or not enable_filter or not shared.sd_model:
pass
elif sd_version == network.SdVersion.Unknown:
model_version = network.SdVersion.SDXL if shared.sd_model.is_sdxl else network.SdVersion.SD2 if shared.sd_model.is_sd2 else network.SdVersion.SD1
...
import hypertile
from modules import scripts, script_callbacks, shared
from scripts.hypertile_xyz import add_axis_options
class ScriptHypertile(scripts.Script):
@@ -93,7 +92,6 @@ def on_ui_settings():
"hypertile_max_depth_unet": shared.OptionInfo(3, "Hypertile U-Net max depth", gr.Slider, {"minimum": 0, "maximum": 3, "step": 1}, infotext="Hypertile U-Net max depth").info("larger = more neural network layers affected; minor effect on performance"),
"hypertile_max_tile_unet": shared.OptionInfo(256, "Hypertile U-Net max tile size", gr.Slider, {"minimum": 0, "maximum": 512, "step": 16}, infotext="Hypertile U-Net max tile size").info("larger = worse performance"),
"hypertile_swap_size_unet": shared.OptionInfo(3, "Hypertile U-Net swap size", gr.Slider, {"minimum": 0, "maximum": 64, "step": 1}, infotext="Hypertile U-Net swap size"),
"hypertile_enable_vae": shared.OptionInfo(False, "Enable Hypertile VAE", infotext="Hypertile VAE").info("minimal change in the generated picture"),
"hypertile_max_depth_vae": shared.OptionInfo(3, "Hypertile VAE max depth", gr.Slider, {"minimum": 0, "maximum": 3, "step": 1}, infotext="Hypertile VAE max depth"),
"hypertile_max_tile_vae": shared.OptionInfo(128, "Hypertile VAE max tile size", gr.Slider, {"minimum": 0, "maximum": 512, "step": 16}, infotext="Hypertile VAE max tile size"),
@@ -105,5 +103,20 @@ def on_ui_settings():
shared.opts.add_option(name, opt)
def add_axis_options():
xyz_grid = [x for x in scripts.scripts_data if x.script_class.__module__ == "xyz_grid.py"][0].module
xyz_grid.axis_options.extend([
xyz_grid.AxisOption("[Hypertile] Unet First pass Enabled", str, xyz_grid.apply_override('hypertile_enable_unet', boolean=True), choices=xyz_grid.boolean_choice(reverse=True)),
xyz_grid.AxisOption("[Hypertile] Unet Second pass Enabled", str, xyz_grid.apply_override('hypertile_enable_unet_secondpass', boolean=True), choices=xyz_grid.boolean_choice(reverse=True)),
xyz_grid.AxisOption("[Hypertile] Unet Max Depth", int, xyz_grid.apply_override("hypertile_max_depth_unet"), confirm=xyz_grid.confirm_range(0, 3, '[Hypertile] Unet Max Depth'), choices=lambda: [str(x) for x in range(4)]),
xyz_grid.AxisOption("[Hypertile] Unet Max Tile Size", int, xyz_grid.apply_override("hypertile_max_tile_unet"), confirm=xyz_grid.confirm_range(0, 512, '[Hypertile] Unet Max Tile Size')),
xyz_grid.AxisOption("[Hypertile] Unet Swap Size", int, xyz_grid.apply_override("hypertile_swap_size_unet"), confirm=xyz_grid.confirm_range(0, 64, '[Hypertile] Unet Swap Size')),
xyz_grid.AxisOption("[Hypertile] VAE Enabled", str, xyz_grid.apply_override('hypertile_enable_vae', boolean=True), choices=xyz_grid.boolean_choice(reverse=True)),
xyz_grid.AxisOption("[Hypertile] VAE Max Depth", int, xyz_grid.apply_override("hypertile_max_depth_vae"), confirm=xyz_grid.confirm_range(0, 3, '[Hypertile] VAE Max Depth'), choices=lambda: [str(x) for x in range(4)]),
xyz_grid.AxisOption("[Hypertile] VAE Max Tile Size", int, xyz_grid.apply_override("hypertile_max_tile_vae"), confirm=xyz_grid.confirm_range(0, 512, '[Hypertile] VAE Max Tile Size')),
xyz_grid.AxisOption("[Hypertile] VAE Swap Size", int, xyz_grid.apply_override("hypertile_swap_size_vae"), confirm=xyz_grid.confirm_range(0, 64, '[Hypertile] VAE Swap Size')),
])
script_callbacks.on_ui_settings(on_ui_settings)
script_callbacks.on_before_ui(add_axis_options)
from modules import scripts
from modules.shared import opts
xyz_grid = [x for x in scripts.scripts_data if x.script_class.__module__ == "xyz_grid.py"][0].module
def int_applier(value_name:str, min_range:int = -1, max_range:int = -1):
"""
Returns a function that applies the given value to the given value_name in opts.data.
"""
def validate(value_name:str, value:str):
value = int(value)
# validate value
if not min_range == -1:
assert value >= min_range, f"Value {value} for {value_name} must be greater than or equal to {min_range}"
if not max_range == -1:
assert value <= max_range, f"Value {value} for {value_name} must be less than or equal to {max_range}"
def apply_int(p, x, xs):
validate(value_name, x)
opts.data[value_name] = int(x)
return apply_int
def bool_applier(value_name:str):
"""
Returns a function that applies the given value to the given value_name in opts.data.
"""
def validate(value_name:str, value:str):
assert value.lower() in ["true", "false"], f"Value {value} for {value_name} must be either true or false"
def apply_bool(p, x, xs):
validate(value_name, x)
value_boolean = x.lower() == "true"
opts.data[value_name] = value_boolean
return apply_bool
def add_axis_options():
extra_axis_options = [
xyz_grid.AxisOption("[Hypertile] Unet First pass Enabled", str, bool_applier("hypertile_enable_unet"), choices=xyz_grid.boolean_choice(reverse=True)),
xyz_grid.AxisOption("[Hypertile] Unet Second pass Enabled", str, bool_applier("hypertile_enable_unet_secondpass"), choices=xyz_grid.boolean_choice(reverse=True)),
xyz_grid.AxisOption("[Hypertile] Unet Max Depth", int, int_applier("hypertile_max_depth_unet", 0, 3), choices=lambda: [str(x) for x in range(4)]),
xyz_grid.AxisOption("[Hypertile] Unet Max Tile Size", int, int_applier("hypertile_max_tile_unet", 0, 512)),
xyz_grid.AxisOption("[Hypertile] Unet Swap Size", int, int_applier("hypertile_swap_size_unet", 0, 64)),
xyz_grid.AxisOption("[Hypertile] VAE Enabled", str, bool_applier("hypertile_enable_vae"), choices=xyz_grid.boolean_choice(reverse=True)),
xyz_grid.AxisOption("[Hypertile] VAE Max Depth", int, int_applier("hypertile_max_depth_vae", 0, 3), choices=lambda: [str(x) for x in range(4)]),
xyz_grid.AxisOption("[Hypertile] VAE Max Tile Size", int, int_applier("hypertile_max_tile_vae", 0, 512)),
xyz_grid.AxisOption("[Hypertile] VAE Swap Size", int, int_applier("hypertile_swap_size_vae", 0, 64)),
]
set_a = {opt.label for opt in xyz_grid.axis_options}
set_b = {opt.label for opt in extra_axis_options}
if set_a.intersection(set_b):
return
xyz_grid.axis_options.extend(extra_axis_options)
@@ -3,6 +3,7 @@ import gradio as gr
import math
from modules.ui_components import InputAccordion
import modules.scripts as scripts
from modules.torch_utils import float64
class SoftInpaintingSettings:
@@ -79,13 +80,11 @@ def latent_blend(settings, a, b, t):
# Calculate the magnitude of the interpolated vectors. (We will remove this magnitude.)
# 64-bit operations are used here to allow large exponents.
-current_magnitude = torch.norm(image_interp, p=2, dim=1, keepdim=True).to(torch.float64).add_(0.00001)
+current_magnitude = torch.norm(image_interp, p=2, dim=1, keepdim=True).to(float64(image_interp)).add_(0.00001)
# Interpolate the powered magnitudes, then un-power them (bring them back to a power of 1).
-a_magnitude = torch.norm(a, p=2, dim=1, keepdim=True).to(torch.float64).pow_(
-settings.inpaint_detail_preservation) * one_minus_t3
-b_magnitude = torch.norm(b, p=2, dim=1, keepdim=True).to(torch.float64).pow_(
-settings.inpaint_detail_preservation) * t3
+a_magnitude = torch.norm(a, p=2, dim=1, keepdim=True).to(float64(a)).pow_(settings.inpaint_detail_preservation) * one_minus_t3
+b_magnitude = torch.norm(b, p=2, dim=1, keepdim=True).to(float64(b)).pow_(settings.inpaint_detail_preservation) * t3
desired_magnitude = a_magnitude
desired_magnitude.add_(b_magnitude).pow_(1 / settings.inpaint_detail_preservation)
del a_magnitude, b_magnitude, t3, one_minus_t3
...
@@ -8,9 +8,6 @@ var contextMenuInit = function() {
};
function showContextMenu(event, element, menuEntries) {
let posx = event.clientX + document.body.scrollLeft + document.documentElement.scrollLeft;
let posy = event.clientY + document.body.scrollTop + document.documentElement.scrollTop;
let oldMenu = gradioApp().querySelector('#context-menu');
if (oldMenu) {
oldMenu.remove();
@@ -23,10 +20,8 @@ var contextMenuInit = function() {
contextMenu.style.background = baseStyle.background;
contextMenu.style.color = baseStyle.color;
contextMenu.style.fontFamily = baseStyle.fontFamily;
-contextMenu.style.top = posy + 'px';
-contextMenu.style.left = posx + 'px';
+contextMenu.style.top = event.pageY + 'px';
+contextMenu.style.left = event.pageX + 'px';
const contextMenuList = document.createElement('ul');
contextMenuList.className = 'context-menu-items';
@@ -43,21 +38,6 @@ var contextMenuInit = function() {
});
gradioApp().appendChild(contextMenu);
let menuWidth = contextMenu.offsetWidth + 4;
let menuHeight = contextMenu.offsetHeight + 4;
let windowWidth = window.innerWidth;
let windowHeight = window.innerHeight;
if ((windowWidth - posx) < menuWidth) {
contextMenu.style.left = windowWidth - menuWidth + "px";
}
if ((windowHeight - posy) < menuHeight) {
contextMenu.style.top = windowHeight - menuHeight + "px";
}
}
function appendContextMenuOption(targetElementSelector, entryName, entryFunction) {
@@ -107,16 +87,23 @@ var contextMenuInit = function() {
oldMenu.remove();
}
});
-gradioApp().addEventListener("contextmenu", function(e) {
-let oldMenu = gradioApp().querySelector('#context-menu');
-if (oldMenu) {
-oldMenu.remove();
-}
-menuSpecs.forEach(function(v, k) {
-if (e.composedPath()[0].matches(k)) {
-showContextMenu(e, e.composedPath()[0], v);
-e.preventDefault();
-}
-});
-});
+['contextmenu', 'touchstart'].forEach((eventType) => {
+gradioApp().addEventListener(eventType, function(e) {
+let ev = e;
+if (eventType.startsWith('touch')) {
+if (e.touches.length !== 2) return;
+ev = e.touches[0];
+}
+let oldMenu = gradioApp().querySelector('#context-menu');
+if (oldMenu) {
+oldMenu.remove();
+}
+menuSpecs.forEach(function(v, k) {
+if (e.composedPath()[0].matches(k)) {
+showContextMenu(ev, e.composedPath()[0], v);
+e.preventDefault();
+}
+});
+});
+});
eventListenerApplied = true;
...
@@ -56,6 +56,15 @@ function eventHasFiles(e) {
return false;
}
function isURL(url) {
try {
const _ = new URL(url);
return true;
} catch {
return false;
}
}
function dragDropTargetIsPrompt(target) {
if (target?.placeholder && target?.placeholder.indexOf("Prompt") >= 0) return true;
if (target?.parentNode?.parentNode?.className?.indexOf("prompt") > 0) return true;
@@ -77,7 +86,7 @@ window.document.addEventListener('dragover', e => {
window.document.addEventListener('drop', async e => {
const target = e.composedPath()[0];
const url = e.dataTransfer.getData('text/uri-list') || e.dataTransfer.getData('text/plain');
-if (!eventHasFiles(e) && !url) return;
+if (!eventHasFiles(e) && !isURL(url)) return;
if (dragDropTargetIsPrompt(target)) {
e.stopPropagation();
...
@@ -6,6 +6,8 @@ function closeModal() {
function showModal(event) {
const source = event.target || event.srcElement;
const modalImage = gradioApp().getElementById("modalImage");
const modalToggleLivePreviewBtn = gradioApp().getElementById("modal_toggle_live_preview");
modalToggleLivePreviewBtn.innerHTML = opts.js_live_preview_in_modal_lightbox ? "&#x1F5C7;" : "&#x1F5C6;";
const lb = gradioApp().getElementById("lightboxModal");
modalImage.src = source.src;
if (modalImage.style.display === 'none') {
@@ -51,14 +53,7 @@ function modalImageSwitch(offset) {
var galleryButtons = all_gallery_buttons();
if (galleryButtons.length > 1) {
-var currentButton = selected_gallery_button();
-var result = -1;
-galleryButtons.forEach(function(v, i) {
-if (v == currentButton) {
-result = i;
-}
-});
+var result = selected_gallery_index();
if (result != -1) {
var nextButton = galleryButtons[negmod((result + offset), galleryButtons.length)];
@@ -159,6 +154,13 @@ function modalZoomToggle(event) {
event.stopPropagation();
}
function modalLivePreviewToggle(event) {
const modalToggleLivePreview = gradioApp().getElementById("modal_toggle_live_preview");
opts.js_live_preview_in_modal_lightbox = !opts.js_live_preview_in_modal_lightbox;
modalToggleLivePreview.innerHTML = opts.js_live_preview_in_modal_lightbox ? "&#x1F5C7;" : "&#x1F5C6;";
event.stopPropagation();
}
function modalTileImageToggle(event) {
const modalImage = gradioApp().getElementById("modalImage");
const modal = gradioApp().getElementById("lightboxModal");
@@ -216,6 +218,14 @@ document.addEventListener("DOMContentLoaded", function() {
modalSave.title = "Save Image(s)";
modalControls.appendChild(modalSave);
const modalToggleLivePreview = document.createElement('span');
modalToggleLivePreview.className = 'modalToggleLivePreview cursor';
modalToggleLivePreview.id = "modal_toggle_live_preview";
modalToggleLivePreview.innerHTML = "&#x1F5C6;";
modalToggleLivePreview.onclick = modalLivePreviewToggle;
modalToggleLivePreview.title = "Toggle live preview";
modalControls.appendChild(modalToggleLivePreview);
const modalClose = document.createElement('span');
modalClose.className = 'modalClose cursor';
modalClose.innerHTML = '&times;';
...
@@ -76,6 +76,26 @@ function requestProgress(id_task, progressbarContainer, gallery, atEnd, onProgre
var dateStart = new Date();
var wasEverActive = false;
var parentProgressbar = progressbarContainer.parentNode;
var wakeLock = null;
var requestWakeLock = async function() {
if (!opts.prevent_screen_sleep_during_generation || wakeLock) return;
try {
wakeLock = await navigator.wakeLock.request('screen');
} catch (err) {
console.error('Wake Lock is not supported.');
}
};
var releaseWakeLock = async function() {
if (!opts.prevent_screen_sleep_during_generation || !wakeLock) return;
try {
await wakeLock.release();
wakeLock = null;
} catch (err) {
console.error('Wake Lock release failed', err);
}
};
var divProgress = document.createElement('div');
divProgress.className = 'progressDiv';
@@ -89,6 +109,7 @@ function requestProgress(id_task, progressbarContainer, gallery, atEnd, onProgre
var livePreview = null;
var removeProgressBar = function() {
releaseWakeLock();
if (!divProgress) return;
setTitle("");
@@ -100,6 +121,7 @@ function requestProgress(id_task, progressbarContainer, gallery, atEnd, onProgre
};
var funProgress = function(id_task) {
requestWakeLock();
request("./internal/progress", {id_task: id_task, live_preview: false}, function(res) { request("./internal/progress", {id_task: id_task, live_preview: false}, function(res) {
if (res.completed) { if (res.completed) {
removeProgressBar(); removeProgressBar();
......
@@ -26,6 +26,14 @@ function selected_gallery_index() {
return all_gallery_buttons().findIndex(elem => elem.classList.contains('selected'));
}
function gallery_container_buttons(gallery_container) {
return gradioApp().querySelectorAll(`#${gallery_container} .thumbnail-item.thumbnail-small`);
}
function selected_gallery_index_id(gallery_container) {
return Array.from(gallery_container_buttons(gallery_container)).findIndex(elem => elem.classList.contains('selected'));
}
function extract_image_from_gallery(gallery) {
if (gallery.length == 0) {
return [null];
@@ -299,6 +307,7 @@ onAfterUiUpdate(function() {
var jsdata = textarea.value;
opts = JSON.parse(jsdata);
executeCallbacks(optionsAvailableCallbacks); /*global optionsAvailableCallbacks*/
executeCallbacks(optionsChangedCallbacks); /*global optionsChangedCallbacks*/
Object.defineProperty(textarea, 'value', {
@@ -337,8 +346,8 @@ onOptionsChanged(function() {
let txt2img_textarea, img2img_textarea = undefined;
function restart_reload() {
document.body.style.backgroundColor = "var(--background-fill-primary)";
document.body.innerHTML = '<h1 style="font-family:monospace;margin-top:20%;color:lightgray;text-align:center;">Reloading...</h1>';
var requestPing = function() {
requestGet("./internal/ping", {}, function(data) {
location.reload();
...
@@ -43,7 +43,7 @@ def script_name_to_index(name, scripts):
def validate_sampler_name(name):
config = sd_samplers.all_samplers_map.get(name, None)
if config is None:
-raise HTTPException(status_code=404, detail="Sampler not found")
+raise HTTPException(status_code=400, detail="Sampler not found")
return name
@@ -113,7 +113,7 @@ def encode_pil_to_base64(image):
image.save(output_bytes, format="PNG", pnginfo=(metadata if use_metadata else None), quality=opts.jpeg_quality)
elif opts.samples_format.lower() in ("jpg", "jpeg", "webp"):
-if image.mode == "RGBA":
+if image.mode in ("RGBA", "P"):
image = image.convert("RGB")
parameters = image.info.get('parameters', None)
exif_bytes = piexif.dump({
@@ -372,7 +372,7 @@ class Api:
return {}
possible_fields = infotext_utils.paste_fields[tabname]["fields"]
-set_fields = request.model_dump(exclude_unset=True) if hasattr(request, "request") else request.dict(exclude_unset=True) # pydantic v1/v2 have differenrt names for this
+set_fields = request.model_dump(exclude_unset=True) if hasattr(request, "request") else request.dict(exclude_unset=True) # pydantic v1/v2 have different names for this
params = infotext_utils.parse_generation_parameters(request.infotext)
def get_field_value(field, params):
@@ -438,15 +438,19 @@ class Api:
self.apply_infotext(txt2imgreq, "txt2img", script_runner=script_runner, mentioned_script_args=infotext_script_args)
selectable_scripts, selectable_script_idx = self.get_selectable_script(txt2imgreq.script_name, script_runner)
sampler, scheduler = sd_samplers.get_sampler_and_scheduler(txt2imgreq.sampler_name or txt2imgreq.sampler_index, txt2imgreq.scheduler)
populate = txt2imgreq.copy(update={ # Override __init__ params
-"sampler_name": validate_sampler_name(txt2imgreq.sampler_name or txt2imgreq.sampler_index),
+"sampler_name": validate_sampler_name(sampler),
"do_not_save_samples": not txt2imgreq.save_images,
"do_not_save_grid": not txt2imgreq.save_images,
})
if populate.sampler_name:
populate.sampler_index = None # prevent a warning later on
if not populate.scheduler and scheduler != "Automatic":
populate.scheduler = scheduler
args = vars(populate)
args.pop('script_name', None)
args.pop('script_args', None) # will refeed them to the pipeline directly after initializing them
@@ -502,9 +506,10 @@ class Api:
self.apply_infotext(img2imgreq, "img2img", script_runner=script_runner, mentioned_script_args=infotext_script_args)
selectable_scripts, selectable_script_idx = self.get_selectable_script(img2imgreq.script_name, script_runner)
sampler, scheduler = sd_samplers.get_sampler_and_scheduler(img2imgreq.sampler_name or img2imgreq.sampler_index, img2imgreq.scheduler)
populate = img2imgreq.copy(update={ # Override __init__ params
-"sampler_name": validate_sampler_name(img2imgreq.sampler_name or img2imgreq.sampler_index),
+"sampler_name": validate_sampler_name(sampler),
"do_not_save_samples": not img2imgreq.save_images,
"do_not_save_grid": not img2imgreq.save_images,
"mask": mask,
@@ -512,6 +517,9 @@ class Api:
if populate.sampler_name:
populate.sampler_index = None # prevent a warning later on
if not populate.scheduler and scheduler != "Automatic":
populate.scheduler = scheduler
args = vars(populate)
args.pop('include_init_images', None) # this is meant to be done by "exclude": True in model, but it's for a reason that I cannot determine.
args.pop('script_name', None)
...
import os.path
from functools import wraps
import html
import time
-from modules import shared, progress, errors, devices, fifo_lock
+from modules import shared, progress, errors, devices, fifo_lock, profiling
queue_lock = fifo_lock.FIFOLock()
@@ -46,6 +47,22 @@ def wrap_gradio_gpu_call(func, extra_outputs=None):
def wrap_gradio_call(func, extra_outputs=None, add_stats=False):
@wraps(func)
def f(*args, **kwargs):
try:
res = func(*args, **kwargs)
finally:
shared.state.skipped = False
shared.state.interrupted = False
shared.state.stopping_generation = False
shared.state.job_count = 0
shared.state.job = ""
return res
return wrap_gradio_call_no_job(f, extra_outputs, add_stats)
def wrap_gradio_call_no_job(func, extra_outputs=None, add_stats=False):
@wraps(func)
def f(*args, extra_outputs_array=extra_outputs, **kwargs):
run_memmon = shared.opts.memmon_poll_rate > 0 and not shared.mem_mon.disabled and add_stats
@@ -65,9 +82,6 @@ def wrap_gradio_call(func, extra_outputs=None, add_stats=False):
arg_str += f" (Argument list truncated at {max_debug_str_len}/{len(arg_str)} characters)"
errors.report(f"{message}\n{arg_str}", exc_info=True)
shared.state.job = ""
shared.state.job_count = 0
if extra_outputs_array is None:
extra_outputs_array = [None, '']
@@ -76,11 +90,6 @@ def wrap_gradio_call(func, extra_outputs=None, add_stats=False):
devices.torch_gc()
shared.state.skipped = False
shared.state.interrupted = False
shared.state.stopping_generation = False
shared.state.job_count = 0
if not add_stats:
return tuple(res)
@@ -111,9 +120,15 @@ def wrap_gradio_call(func, extra_outputs=None, add_stats=False):
else:
vram_html = ''
if shared.opts.profiling_enable and os.path.exists(shared.opts.profiling_filename):
profiling_html = f"<p class='profile'> [ <a href='{profiling.webpath()}' download>Profile</a> ] </p>"
else:
profiling_html = ''
# last item is always HTML
-res[-1] += f"<div class='performance'><p class='time'>Time taken: <wbr><span class='measurement'>{elapsed_text}</span></p>{vram_html}</div>"
+res[-1] += f"<div class='performance'><p class='time'>Time taken: <wbr><span class='measurement'>{elapsed_text}</span></p>{vram_html}{profiling_html}</div>"
return tuple(res)
return f
@@ -20,6 +20,7 @@ parser.add_argument("--dump-sysinfo", action='store_true', help="launch.py argum
parser.add_argument("--loglevel", type=str, help="log level; one of: CRITICAL, ERROR, WARNING, INFO, DEBUG", default=None)
parser.add_argument("--do-not-download-clip", action='store_true', help="do not download CLIP model even if it's not included in the checkpoint")
parser.add_argument("--data-dir", type=normalized_filepath, default=os.path.dirname(os.path.dirname(os.path.realpath(__file__))), help="base path where all user data is stored")
parser.add_argument("--models-dir", type=normalized_filepath, default=None, help="base path where models are stored; overrides --data-dir")
parser.add_argument("--config", type=normalized_filepath, default=sd_default_config, help="path to config which constructs model",) parser.add_argument("--config", type=normalized_filepath, default=sd_default_config, help="path to config which constructs model",)
parser.add_argument("--ckpt", type=normalized_filepath, default=sd_model_file, help="path to checkpoint of stable diffusion model; if specified, this checkpoint will be added to the list of checkpoints and loaded",) parser.add_argument("--ckpt", type=normalized_filepath, default=sd_model_file, help="path to checkpoint of stable diffusion model; if specified, this checkpoint will be added to the list of checkpoints and loaded",)
parser.add_argument("--ckpt-dir", type=normalized_filepath, default=None, help="Path to directory with stable diffusion checkpoints") parser.add_argument("--ckpt-dir", type=normalized_filepath, default=None, help="Path to directory with stable diffusion checkpoints")
...@@ -29,7 +30,7 @@ parser.add_argument("--gfpgan-model", type=normalized_filepath, help="GFPGAN mod ...@@ -29,7 +30,7 @@ parser.add_argument("--gfpgan-model", type=normalized_filepath, help="GFPGAN mod
parser.add_argument("--no-half", action='store_true', help="do not switch the model to 16-bit floats") parser.add_argument("--no-half", action='store_true', help="do not switch the model to 16-bit floats")
parser.add_argument("--no-half-vae", action='store_true', help="do not switch the VAE model to 16-bit floats") parser.add_argument("--no-half-vae", action='store_true', help="do not switch the VAE model to 16-bit floats")
parser.add_argument("--no-progressbar-hiding", action='store_true', help="do not hide progressbar in gradio UI (we hide it because it slows down ML if you have hardware acceleration in browser)") parser.add_argument("--no-progressbar-hiding", action='store_true', help="do not hide progressbar in gradio UI (we hide it because it slows down ML if you have hardware acceleration in browser)")
parser.add_argument("--max-batch-count", type=int, default=16, help="maximum batch count value for the UI") parser.add_argument("--max-batch-count", type=int, default=16, help="does not do anything")
parser.add_argument("--embeddings-dir", type=normalized_filepath, default=os.path.join(data_path, 'embeddings'), help="embeddings directory for textual inversion (default: embeddings)") parser.add_argument("--embeddings-dir", type=normalized_filepath, default=os.path.join(data_path, 'embeddings'), help="embeddings directory for textual inversion (default: embeddings)")
parser.add_argument("--textual-inversion-templates-dir", type=normalized_filepath, default=os.path.join(script_path, 'textual_inversion_templates'), help="directory with textual inversion templates") parser.add_argument("--textual-inversion-templates-dir", type=normalized_filepath, default=os.path.join(script_path, 'textual_inversion_templates'), help="directory with textual inversion templates")
parser.add_argument("--hypernetwork-dir", type=normalized_filepath, default=os.path.join(models_path, 'hypernetworks'), help="hypernetwork directory") parser.add_argument("--hypernetwork-dir", type=normalized_filepath, default=os.path.join(models_path, 'hypernetworks'), help="hypernetwork directory")
...@@ -41,7 +42,7 @@ parser.add_argument("--lowvram", action='store_true', help="enable stable diffus ...@@ -41,7 +42,7 @@ parser.add_argument("--lowvram", action='store_true', help="enable stable diffus
parser.add_argument("--lowram", action='store_true', help="load stable diffusion checkpoint weights to VRAM instead of RAM") parser.add_argument("--lowram", action='store_true', help="load stable diffusion checkpoint weights to VRAM instead of RAM")
parser.add_argument("--always-batch-cond-uncond", action='store_true', help="does not do anything") parser.add_argument("--always-batch-cond-uncond", action='store_true', help="does not do anything")
parser.add_argument("--unload-gfpgan", action='store_true', help="does not do anything.") parser.add_argument("--unload-gfpgan", action='store_true', help="does not do anything.")
parser.add_argument("--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="autocast") parser.add_argument("--precision", type=str, help="evaluate at this precision", choices=["full", "half", "autocast"], default="autocast")
parser.add_argument("--upcast-sampling", action='store_true', help="upcast sampling. No effect with --no-half. Usually produces similar results to --no-half with better performance while using less memory.") parser.add_argument("--upcast-sampling", action='store_true', help="upcast sampling. No effect with --no-half. Usually produces similar results to --no-half with better performance while using less memory.")
parser.add_argument("--share", action='store_true', help="use share=True for gradio and make the UI accessible through their site") parser.add_argument("--share", action='store_true', help="use share=True for gradio and make the UI accessible through their site")
parser.add_argument("--ngrok", type=str, help="ngrok authtoken, alternative to gradio --share", default=None) parser.add_argument("--ngrok", type=str, help="ngrok authtoken, alternative to gradio --share", default=None)
......
@@ -57,7 +57,7 @@ class DeepDanbooru:
a = np.expand_dims(np.array(pic, dtype=np.float32), 0) / 255
with torch.no_grad(), devices.autocast():
-x = torch.from_numpy(a).to(devices.device)
+x = torch.from_numpy(a).to(devices.device, devices.dtype)
y = self.model(x)[0].detach().cpu().numpy()
probability_dict = {}
...
@@ -114,6 +114,9 @@ errors.run(enable_tf32, "Enabling TF32")
cpu: torch.device = torch.device("cpu")
fp8: bool = False
# Force fp16 for all models in inference. No casting during inference.
# This flag is controlled by "--precision half" command line arg.
force_fp16: bool = False
device: torch.device = None
device_interrogate: torch.device = None
device_gfpgan: torch.device = None
@@ -127,6 +130,8 @@ unet_needs_upcast = False
def cond_cast_unet(input):
if force_fp16:
return input.to(torch.float16)
return input.to(dtype_unet) if unet_needs_upcast else input
@@ -206,6 +211,11 @@ def autocast(disable=False):
if disable:
return contextlib.nullcontext()
if force_fp16:
# No casting during inference if force_fp16 is enabled.
# All tensor dtype conversion happens before inference.
return contextlib.nullcontext()
if fp8 and device==cpu:
return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)
@@ -233,22 +243,22 @@ def test_for_nans(x, where):
if shared.cmd_opts.disable_nan_check:
return
-if not torch.all(torch.isnan(x)).item():
+if not torch.isnan(x[(0, ) * len(x.shape)]):
return
if where == "unet":
-message = "A tensor with all NaNs was produced in Unet."
+message = "A tensor with NaNs was produced in Unet."
if not shared.cmd_opts.no_half:
message += " This could be either because there's not enough precision to represent the picture, or because your video card does not support half type. Try setting the \"Upcast cross attention layer to float32\" option in Settings > Stable Diffusion or using the --no-half commandline argument to fix this."
elif where == "vae":
-message = "A tensor with all NaNs was produced in VAE."
+message = "A tensor with NaNs was produced in VAE."
if not shared.cmd_opts.no_half and not shared.cmd_opts.no_half_vae:
message += " This could be because there's not enough precision to represent the picture. Try adding --no-half-vae commandline argument to fix this."
else:
-message = "A tensor with all NaNs was produced."
+message = "A tensor with NaNs was produced."
message += " Use --disable-nan-check commandline argument to disable this check."
@@ -258,7 +268,7 @@ def test_for_nans(x, where):
@lru_cache
def first_time_calculation():
"""
-just do any calculation with pytorch layers - the first time this is done it allocaltes about 700MB of memory and
+just do any calculation with pytorch layers - the first time this is done it allocates about 700MB of memory and
spends about 2.7 seconds doing that, at least with NVidia.
"""
@@ -269,3 +279,17 @@ def first_time_calculation():
x = torch.zeros((1, 1, 3, 3)).to(device, dtype)
conv2d = torch.nn.Conv2d(1, 1, (3, 3)).to(device, dtype)
conv2d(x)
def force_model_fp16():
"""
ldm and sgm have modules.diffusionmodules.util.GroupNorm32.forward, which
forces conversion of the input to float32. If force_fp16 is enabled, we need to
prevent this casting.
"""
assert force_fp16
import sgm.modules.diffusionmodules.util as sgm_util
import ldm.modules.diffusionmodules.util as ldm_util
sgm_util.GroupNorm32 = torch.nn.GroupNorm
ldm_util.GroupNorm32 = torch.nn.GroupNorm
print("ldm/sgm GroupNorm32 replaced with normal torch.nn.GroupNorm due to `--precision half`.")
...@@ -191,8 +191,9 @@ class Extension: ...@@ -191,8 +191,9 @@ class Extension:
def check_updates(self): def check_updates(self):
repo = Repo(self.path) repo = Repo(self.path)
branch_name = f'{repo.remote().name}/{self.branch}'
for fetch in repo.remote().fetch(dry_run=True): for fetch in repo.remote().fetch(dry_run=True):
if self.branch and fetch.name != f'{repo.remote().name}/{self.branch}': if self.branch and fetch.name != branch_name:
continue continue
if fetch.flags != fetch.HEAD_UPTODATE: if fetch.flags != fetch.HEAD_UPTODATE:
self.can_update = True self.can_update = True
...@@ -200,7 +201,7 @@ class Extension: ...@@ -200,7 +201,7 @@ class Extension:
return return
try: try:
origin = repo.rev_parse('origin') origin = repo.rev_parse(branch_name)
if repo.head.commit != origin: if repo.head.commit != origin:
self.can_update = True self.can_update = True
self.status = "behind HEAD" self.status = "behind HEAD"
...@@ -213,8 +214,10 @@ class Extension: ...@@ -213,8 +214,10 @@ class Extension:
self.can_update = False self.can_update = False
self.status = "latest" self.status = "latest"
def fetch_and_reset_hard(self, commit='origin'): def fetch_and_reset_hard(self, commit=None):
repo = Repo(self.path) repo = Repo(self.path)
if commit is None:
commit = f'{repo.remote().name}/{self.branch}'
# Fix: `error: Your local changes to the following files would be overwritten by merge`, # Fix: `error: Your local changes to the following files would be overwritten by merge`,
# because WSL2 Docker set 755 file permissions instead of 644, this results to the error. # because WSL2 Docker set 755 file permissions instead of 644, this results to the error.
repo.git.fetch(all=True) repo.git.fetch(all=True)
......
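The update check above now compares the local HEAD against the remote-tracking ref for the extension's own branch (for example `origin/master`) rather than the bare `origin` ref. A minimal GitPython sketch of the same comparison, assuming a clone at `path` with a configured remote (illustrative only, not the webui `Extension` class):

```python
from git import Repo  # GitPython

def is_behind_remote(path: str, branch: str) -> bool:
    repo = Repo(path)
    # Qualify the branch with the remote name, e.g. "origin/master".
    branch_name = f"{repo.remote().name}/{branch}"
    repo.remote().fetch(dry_run=True)
    origin = repo.rev_parse(branch_name)
    return repo.head.commit != origin

# e.g. is_behind_remote("extensions/some-extension", "main")  # hypothetical path/branch
```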
...@@ -36,13 +36,11 @@ class FaceRestorerGFPGAN(face_restoration_utils.CommonFaceRestoration): ...@@ -36,13 +36,11 @@ class FaceRestorerGFPGAN(face_restoration_utils.CommonFaceRestoration):
ext_filter=['.pth'], ext_filter=['.pth'],
): ):
if 'GFPGAN' in os.path.basename(model_path): if 'GFPGAN' in os.path.basename(model_path):
model = modelloader.load_spandrel_model( return modelloader.load_spandrel_model(
model_path, model_path,
device=self.get_device(), device=self.get_device(),
expected_architecture='GFPGAN', expected_architecture='GFPGAN',
).model ).model
model.different_w = True # see https://github.com/chaiNNer-org/spandrel/pull/81
return model
raise ValueError("No GFPGAN model found") raise ValueError("No GFPGAN model found")
def restore(self, np_image): def restore(self, np_image):
......
...@@ -54,11 +54,14 @@ def image_grid(imgs, batch_size=1, rows=None): ...@@ -54,11 +54,14 @@ def image_grid(imgs, batch_size=1, rows=None):
params = script_callbacks.ImageGridLoopParams(imgs, cols, rows) params = script_callbacks.ImageGridLoopParams(imgs, cols, rows)
script_callbacks.image_grid_callback(params) script_callbacks.image_grid_callback(params)
w, h = imgs[0].size w, h = map(max, zip(*(img.size for img in imgs)))
grid = Image.new('RGB', size=(params.cols * w, params.rows * h), color='black') grid_background_color = ImageColor.getcolor(opts.grid_background_color, 'RGB')
grid = Image.new('RGB', size=(params.cols * w, params.rows * h), color=grid_background_color)
for i, img in enumerate(params.imgs): for i, img in enumerate(params.imgs):
grid.paste(img, box=(i % params.cols * w, i // params.cols * h)) img_w, img_h = img.size
w_offset, h_offset = 0 if img_w == w else (w - img_w) // 2, 0 if img_h == h else (h - img_h) // 2
grid.paste(img, box=(i % params.cols * w + w_offset, i // params.cols * h + h_offset))
return grid return grid
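With mixed image sizes, the grid cell above becomes the element-wise maximum of all widths and heights, and each image is centred in its cell via the `(w - img_w) // 2` style offsets. The same pasting math in a small standalone PIL sketch (all names local to the example):

```python
from PIL import Image

imgs = [Image.new("RGB", (64, 64), "red"), Image.new("RGB", (32, 48), "blue")]
cols, rows = 2, 1

# The cell size is the maximum width and height across all images.
w, h = map(max, zip(*(img.size for img in imgs)))
grid = Image.new("RGB", (cols * w, rows * h), "black")

for i, img in enumerate(imgs):
    img_w, img_h = img.size
    # Centre smaller images inside their cell.
    w_offset = (w - img_w) // 2
    h_offset = (h - img_h) // 2
    grid.paste(img, (i % cols * w + w_offset, i // cols * h + h_offset))

grid.save("grid.png")
```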
...@@ -377,6 +380,7 @@ def get_sampler_scheduler(p, sampler): ...@@ -377,6 +380,7 @@ def get_sampler_scheduler(p, sampler):
class FilenameGenerator: class FilenameGenerator:
replacements = { replacements = {
'basename': lambda self: self.basename or 'img',
'seed': lambda self: self.seed if self.seed is not None else '', 'seed': lambda self: self.seed if self.seed is not None else '',
'seed_first': lambda self: self.seed if self.p.batch_size == 1 else self.p.all_seeds[0], 'seed_first': lambda self: self.seed if self.p.batch_size == 1 else self.p.all_seeds[0],
'seed_last': lambda self: NOTHING_AND_SKIP_PREVIOUS_TEXT if self.p.batch_size == 1 else self.p.all_seeds[-1], 'seed_last': lambda self: NOTHING_AND_SKIP_PREVIOUS_TEXT if self.p.batch_size == 1 else self.p.all_seeds[-1],
...@@ -413,12 +417,13 @@ class FilenameGenerator: ...@@ -413,12 +417,13 @@ class FilenameGenerator:
} }
default_time_format = '%Y%m%d%H%M%S' default_time_format = '%Y%m%d%H%M%S'
def __init__(self, p, seed, prompt, image, zip=False): def __init__(self, p, seed, prompt, image, zip=False, basename=""):
self.p = p self.p = p
self.seed = seed self.seed = seed
self.prompt = prompt self.prompt = prompt
self.image = image self.image = image
self.zip = zip self.zip = zip
self.basename = basename
def get_vae_filename(self): def get_vae_filename(self):
"""Get the name of the VAE file.""" """Get the name of the VAE file."""
...@@ -606,9 +611,10 @@ def save_image_with_geninfo(image, geninfo, filename, extension=None, existing_p ...@@ -606,9 +611,10 @@ def save_image_with_geninfo(image, geninfo, filename, extension=None, existing_p
piexif.ExifIFD.UserComment: piexif.helper.UserComment.dump(geninfo or "", encoding="unicode") piexif.ExifIFD.UserComment: piexif.helper.UserComment.dump(geninfo or "", encoding="unicode")
}, },
}) })
else:
exif_bytes = None
image.save(filename,format=image_format, quality=opts.jpeg_quality, exif=exif_bytes)
image.save(filename,format=image_format, exif=exif_bytes)
elif extension.lower() == ".gif": elif extension.lower() == ".gif":
image.save(filename, format=image_format, comment=geninfo) image.save(filename, format=image_format, comment=geninfo)
else: else:
...@@ -648,12 +654,12 @@ def save_image(image, path, basename, seed=None, prompt=None, extension='png', i ...@@ -648,12 +654,12 @@ def save_image(image, path, basename, seed=None, prompt=None, extension='png', i
txt_fullfn (`str` or None): txt_fullfn (`str` or None):
If a text file is saved for this image, this will be its full path. Otherwise None. If a text file is saved for this image, this will be its full path. Otherwise None.
""" """
namegen = FilenameGenerator(p, seed, prompt, image) namegen = FilenameGenerator(p, seed, prompt, image, basename=basename)
# WebP and JPG formats have maximum dimension limits of 16383 and 65535 respectively. switch to PNG which has a much higher limit # WebP and JPG formats have maximum dimension limits of 16383 and 65535 respectively. switch to PNG which has a much higher limit
if (image.height > 65535 or image.width > 65535) and extension.lower() in ("jpg", "jpeg") or (image.height > 16383 or image.width > 16383) and extension.lower() == "webp": if (image.height > 65535 or image.width > 65535) and extension.lower() in ("jpg", "jpeg") or (image.height > 16383 or image.width > 16383) and extension.lower() == "webp":
print('Image dimensions too large; saving as PNG') print('Image dimensions too large; saving as PNG')
extension = ".png" extension = "png"
if save_to_dirs is None: if save_to_dirs is None:
save_to_dirs = (grid and opts.grid_save_to_dirs) or (not grid and opts.save_to_dirs and not no_prompt) save_to_dirs = (grid and opts.grid_save_to_dirs) or (not grid and opts.save_to_dirs and not no_prompt)
...@@ -789,7 +795,10 @@ def read_info_from_image(image: Image.Image) -> tuple[str | None, dict]: ...@@ -789,7 +795,10 @@ def read_info_from_image(image: Image.Image) -> tuple[str | None, dict]:
if exif_comment: if exif_comment:
geninfo = exif_comment geninfo = exif_comment
elif "comment" in items: # for gif elif "comment" in items: # for gif
geninfo = items["comment"].decode('utf8', errors="ignore") if isinstance(items["comment"], bytes):
geninfo = items["comment"].decode('utf8', errors="ignore")
else:
geninfo = items["comment"]
for field in IGNORED_INFO_KEYS: for field in IGNORED_INFO_KEYS:
items.pop(field, None) items.pop(field, None)
......
...@@ -17,11 +17,14 @@ from modules.ui import plaintext_to_html ...@@ -17,11 +17,14 @@ from modules.ui import plaintext_to_html
import modules.scripts import modules.scripts
def process_batch(p, input_dir, output_dir, inpaint_mask_dir, args, to_scale=False, scale_by=1.0, use_png_info=False, png_info_props=None, png_info_dir=None): def process_batch(p, input, output_dir, inpaint_mask_dir, args, to_scale=False, scale_by=1.0, use_png_info=False, png_info_props=None, png_info_dir=None):
output_dir = output_dir.strip() output_dir = output_dir.strip()
processing.fix_seed(p) processing.fix_seed(p)
batch_images = list(shared.walk_files(input_dir, allowed_extensions=(".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff"))) if isinstance(input, str):
batch_images = list(shared.walk_files(input, allowed_extensions=(".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff")))
else:
batch_images = [os.path.abspath(x.name) for x in input]
is_inpaint_batch = False is_inpaint_batch = False
if inpaint_mask_dir: if inpaint_mask_dir:
...@@ -146,7 +149,7 @@ def process_batch(p, input_dir, output_dir, inpaint_mask_dir, args, to_scale=Fal ...@@ -146,7 +149,7 @@ def process_batch(p, input_dir, output_dir, inpaint_mask_dir, args, to_scale=Fal
return batch_results return batch_results
def img2img(id_task: str, request: gr.Request, mode: int, prompt: str, negative_prompt: str, prompt_styles, init_img, sketch, init_img_with_mask, inpaint_color_sketch, inpaint_color_sketch_orig, init_img_inpaint, init_mask_inpaint, mask_blur: int, mask_alpha: float, inpainting_fill: int, n_iter: int, batch_size: int, cfg_scale: float, image_cfg_scale: float, denoising_strength: float, selected_scale_tab: int, height: int, width: int, scale_by: float, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, img2img_batch_inpaint_mask_dir: str, override_settings_texts, img2img_batch_use_png_info: bool, img2img_batch_png_info_props: list, img2img_batch_png_info_dir: str, *args): def img2img(id_task: str, request: gr.Request, mode: int, prompt: str, negative_prompt: str, prompt_styles, init_img, sketch, init_img_with_mask, inpaint_color_sketch, inpaint_color_sketch_orig, init_img_inpaint, init_mask_inpaint, mask_blur: int, mask_alpha: float, inpainting_fill: int, n_iter: int, batch_size: int, cfg_scale: float, image_cfg_scale: float, denoising_strength: float, selected_scale_tab: int, height: int, width: int, scale_by: float, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, img2img_batch_inpaint_mask_dir: str, override_settings_texts, img2img_batch_use_png_info: bool, img2img_batch_png_info_props: list, img2img_batch_png_info_dir: str, img2img_batch_source_type: str, img2img_batch_upload: list, *args):
override_settings = create_override_settings_dict(override_settings_texts) override_settings = create_override_settings_dict(override_settings_texts)
is_batch = mode == 5 is_batch = mode == 5
...@@ -221,8 +224,15 @@ def img2img(id_task: str, request: gr.Request, mode: int, prompt: str, negative_ ...@@ -221,8 +224,15 @@ def img2img(id_task: str, request: gr.Request, mode: int, prompt: str, negative_
with closing(p): with closing(p):
if is_batch: if is_batch:
assert not shared.cmd_opts.hide_ui_dir_config, "Launched with --hide-ui-dir-config, batch img2img disabled" if img2img_batch_source_type == "upload":
processed = process_batch(p, img2img_batch_input_dir, img2img_batch_output_dir, img2img_batch_inpaint_mask_dir, args, to_scale=selected_scale_tab == 1, scale_by=scale_by, use_png_info=img2img_batch_use_png_info, png_info_props=img2img_batch_png_info_props, png_info_dir=img2img_batch_png_info_dir) assert isinstance(img2img_batch_upload, list) and img2img_batch_upload
output_dir = ""
inpaint_mask_dir = ""
png_info_dir = img2img_batch_png_info_dir if not shared.cmd_opts.hide_ui_dir_config else ""
processed = process_batch(p, img2img_batch_upload, output_dir, inpaint_mask_dir, args, to_scale=selected_scale_tab == 1, scale_by=scale_by, use_png_info=img2img_batch_use_png_info, png_info_props=img2img_batch_png_info_props, png_info_dir=png_info_dir)
else: # "from dir"
assert not shared.cmd_opts.hide_ui_dir_config, "Launched with --hide-ui-dir-config, batch img2img disabled"
processed = process_batch(p, img2img_batch_input_dir, img2img_batch_output_dir, img2img_batch_inpaint_mask_dir, args, to_scale=selected_scale_tab == 1, scale_by=scale_by, use_png_info=img2img_batch_use_png_info, png_info_props=img2img_batch_png_info_props, png_info_dir=img2img_batch_png_info_dir)
if processed is None: if processed is None:
processed = Processed(p, [], p.seed, "") processed = Processed(p, [], p.seed, "")
......
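The batch entry point above now accepts either a directory path ("from dir") or a list of uploaded file objects, and both are normalised to a list of image paths before processing. A simplified sketch of that dispatch; `os.walk` stands in for the real `shared.walk_files`, and `.name` follows the Gradio upload objects used above:

```python
import os

ALLOWED_EXTENSIONS = (".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff")

def collect_batch_images(source) -> list[str]:
    if isinstance(source, str):
        # "From dir": walk the folder for supported image extensions.
        return sorted(
            os.path.join(root, name)
            for root, _, files in os.walk(source)
            for name in files
            if name.lower().endswith(ALLOWED_EXTENSIONS)
        )
    # "Upload": Gradio file objects expose their temporary path via .name.
    return [os.path.abspath(f.name) for f in source]
```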
...@@ -146,18 +146,19 @@ def connect_paste_params_buttons(): ...@@ -146,18 +146,19 @@ def connect_paste_params_buttons():
destination_height_component = next(iter([field for field, name in fields if name == "Size-2"] if fields else []), None) destination_height_component = next(iter([field for field, name in fields if name == "Size-2"] if fields else []), None)
if binding.source_image_component and destination_image_component: if binding.source_image_component and destination_image_component:
need_send_dementions = destination_width_component and binding.tabname != 'inpaint'
if isinstance(binding.source_image_component, gr.Gallery): if isinstance(binding.source_image_component, gr.Gallery):
func = send_image_and_dimensions if destination_width_component else image_from_url_text func = send_image_and_dimensions if need_send_dementions else image_from_url_text
jsfunc = "extract_image_from_gallery" jsfunc = "extract_image_from_gallery"
else: else:
func = send_image_and_dimensions if destination_width_component else lambda x: x func = send_image_and_dimensions if need_send_dementions else lambda x: x
jsfunc = None jsfunc = None
binding.paste_button.click( binding.paste_button.click(
fn=func, fn=func,
_js=jsfunc, _js=jsfunc,
inputs=[binding.source_image_component], inputs=[binding.source_image_component],
outputs=[destination_image_component, destination_width_component, destination_height_component] if destination_width_component else [destination_image_component], outputs=[destination_image_component, destination_width_component, destination_height_component] if need_send_dementions else [destination_image_component],
show_progress=False, show_progress=False,
) )
......
...@@ -9,6 +9,7 @@ import importlib.util ...@@ -9,6 +9,7 @@ import importlib.util
import importlib.metadata import importlib.metadata
import platform import platform
import json import json
import shlex
from functools import lru_cache from functools import lru_cache
from modules import cmd_args, errors from modules import cmd_args, errors
...@@ -76,7 +77,7 @@ def git_tag(): ...@@ -76,7 +77,7 @@ def git_tag():
except Exception: except Exception:
try: try:
changelog_md = os.path.join(os.path.dirname(os.path.dirname(__file__)), "CHANGELOG.md") changelog_md = os.path.join(script_path, "CHANGELOG.md")
with open(changelog_md, "r", encoding="utf-8") as file: with open(changelog_md, "r", encoding="utf-8") as file:
line = next((line.strip() for line in file if line.strip()), "<none>") line = next((line.strip() for line in file if line.strip()), "<none>")
line = line.replace("## ", "") line = line.replace("## ", "")
...@@ -231,7 +232,7 @@ def run_extension_installer(extension_dir): ...@@ -231,7 +232,7 @@ def run_extension_installer(extension_dir):
try: try:
env = os.environ.copy() env = os.environ.copy()
env['PYTHONPATH'] = f"{os.path.abspath('.')}{os.pathsep}{env.get('PYTHONPATH', '')}" env['PYTHONPATH'] = f"{script_path}{os.pathsep}{env.get('PYTHONPATH', '')}"
stdout = run(f'"{python}" "{path_installer}"', errdesc=f"Error running install.py for extension {extension_dir}", custom_env=env).strip() stdout = run(f'"{python}" "{path_installer}"', errdesc=f"Error running install.py for extension {extension_dir}", custom_env=env).strip()
if stdout: if stdout:
...@@ -445,7 +446,6 @@ def prepare_environment(): ...@@ -445,7 +446,6 @@ def prepare_environment():
exit(0) exit(0)
def configure_for_tests(): def configure_for_tests():
if "--api" not in sys.argv: if "--api" not in sys.argv:
sys.argv.append("--api") sys.argv.append("--api")
...@@ -461,7 +461,7 @@ def configure_for_tests(): ...@@ -461,7 +461,7 @@ def configure_for_tests():
def start(): def start():
print(f"Launching {'API server' if '--nowebui' in sys.argv else 'Web UI'} with arguments: {' '.join(sys.argv[1:])}") print(f"Launching {'API server' if '--nowebui' in sys.argv else 'Web UI'} with arguments: {shlex.join(sys.argv[1:])}")
import webui import webui
if '--nowebui' in sys.argv: if '--nowebui' in sys.argv:
webui.api_only() webui.api_only()
......
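Switching the startup log above from `' '.join` to `shlex.join` means arguments containing spaces or shell metacharacters are quoted, so the printed command line can be pasted back into a shell verbatim. For example:

```python
import shlex

argv = ["--ckpt", "models/My Model v1.safetensors", "--api"]

print(" ".join(argv))    # --ckpt models/My Model v1.safetensors --api   (ambiguous)
print(shlex.join(argv))  # --ckpt 'models/My Model v1.safetensors' --api (copy-pasteable)
```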
from collections import namedtuple
import torch import torch
from modules import devices, shared from modules import devices, shared
module_in_gpu = None module_in_gpu = None
cpu = torch.device("cpu") cpu = torch.device("cpu")
ModuleWithParent = namedtuple('ModuleWithParent', ['module', 'parent'], defaults=['None'])
def send_everything_to_cpu(): def send_everything_to_cpu():
global module_in_gpu global module_in_gpu
...@@ -75,13 +78,14 @@ def setup_for_low_vram(sd_model, use_medvram): ...@@ -75,13 +78,14 @@ def setup_for_low_vram(sd_model, use_medvram):
(sd_model, 'depth_model'), (sd_model, 'depth_model'),
(sd_model, 'embedder'), (sd_model, 'embedder'),
(sd_model, 'model'), (sd_model, 'model'),
(sd_model, 'embedder'),
] ]
is_sdxl = hasattr(sd_model, 'conditioner') is_sdxl = hasattr(sd_model, 'conditioner')
is_sd2 = not is_sdxl and hasattr(sd_model.cond_stage_model, 'model') is_sd2 = not is_sdxl and hasattr(sd_model.cond_stage_model, 'model')
if is_sdxl: if hasattr(sd_model, 'medvram_fields'):
to_remain_in_cpu = sd_model.medvram_fields()
elif is_sdxl:
to_remain_in_cpu.append((sd_model, 'conditioner')) to_remain_in_cpu.append((sd_model, 'conditioner'))
elif is_sd2: elif is_sd2:
to_remain_in_cpu.append((sd_model.cond_stage_model, 'model')) to_remain_in_cpu.append((sd_model.cond_stage_model, 'model'))
...@@ -103,7 +107,21 @@ def setup_for_low_vram(sd_model, use_medvram): ...@@ -103,7 +107,21 @@ def setup_for_low_vram(sd_model, use_medvram):
setattr(obj, field, module) setattr(obj, field, module)
# register hooks for those the first three models # register hooks for those the first three models
if is_sdxl: if hasattr(sd_model, "cond_stage_model") and hasattr(sd_model.cond_stage_model, "medvram_modules"):
for module in sd_model.cond_stage_model.medvram_modules():
if isinstance(module, ModuleWithParent):
parent = module.parent
module = module.module
else:
parent = None
if module:
module.register_forward_pre_hook(send_me_to_gpu)
if parent:
parents[module] = parent
elif is_sdxl:
sd_model.conditioner.register_forward_pre_hook(send_me_to_gpu) sd_model.conditioner.register_forward_pre_hook(send_me_to_gpu)
elif is_sd2: elif is_sd2:
sd_model.cond_stage_model.model.register_forward_pre_hook(send_me_to_gpu) sd_model.cond_stage_model.model.register_forward_pre_hook(send_me_to_gpu)
...@@ -117,9 +135,9 @@ def setup_for_low_vram(sd_model, use_medvram): ...@@ -117,9 +135,9 @@ def setup_for_low_vram(sd_model, use_medvram):
sd_model.first_stage_model.register_forward_pre_hook(send_me_to_gpu) sd_model.first_stage_model.register_forward_pre_hook(send_me_to_gpu)
sd_model.first_stage_model.encode = first_stage_model_encode_wrap sd_model.first_stage_model.encode = first_stage_model_encode_wrap
sd_model.first_stage_model.decode = first_stage_model_decode_wrap sd_model.first_stage_model.decode = first_stage_model_decode_wrap
if sd_model.depth_model: if getattr(sd_model, 'depth_model', None) is not None:
sd_model.depth_model.register_forward_pre_hook(send_me_to_gpu) sd_model.depth_model.register_forward_pre_hook(send_me_to_gpu)
if sd_model.embedder: if getattr(sd_model, 'embedder', None) is not None:
sd_model.embedder.register_forward_pre_hook(send_me_to_gpu) sd_model.embedder.register_forward_pre_hook(send_me_to_gpu)
if use_medvram: if use_medvram:
......
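The medvram/lowvram machinery above works by registering a `forward_pre_hook` on each large submodule: the hook moves that module to the GPU right before its forward pass and pushes the previously active module back to the CPU. A bare-bones sketch of the idea, independent of the webui classes:

```python
import torch

gpu = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
cpu = torch.device("cpu")
module_in_gpu = None

def send_me_to_gpu(module, _inputs):
    # Runs just before module.forward(): pull this module in, push the previous one out.
    global module_in_gpu
    if module_in_gpu is module:
        return
    if module_in_gpu is not None:
        module_in_gpu.to(cpu)
    module.to(gpu)
    module_in_gpu = module

text_encoder = torch.nn.Linear(8, 8)
unet = torch.nn.Linear(8, 8)
for m in (text_encoder, unet):
    m.register_forward_pre_hook(send_me_to_gpu)

x = torch.randn(1, 8, device=gpu)
text_encoder(x)  # text encoder moved to GPU
unet(x)          # unet moved to GPU, text encoder sent back to CPU
```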
...@@ -23,6 +23,7 @@ def load_file_from_url( ...@@ -23,6 +23,7 @@ def load_file_from_url(
model_dir: str, model_dir: str,
progress: bool = True, progress: bool = True,
file_name: str | None = None, file_name: str | None = None,
hash_prefix: str | None = None,
) -> str: ) -> str:
"""Download a file from `url` into `model_dir`, using the file present if possible. """Download a file from `url` into `model_dir`, using the file present if possible.
...@@ -36,11 +37,11 @@ def load_file_from_url( ...@@ -36,11 +37,11 @@ def load_file_from_url(
if not os.path.exists(cached_file): if not os.path.exists(cached_file):
print(f'Downloading: "{url}" to {cached_file}\n') print(f'Downloading: "{url}" to {cached_file}\n')
from torch.hub import download_url_to_file from torch.hub import download_url_to_file
download_url_to_file(url, cached_file, progress=progress) download_url_to_file(url, cached_file, progress=progress, hash_prefix=hash_prefix)
return cached_file return cached_file
def load_models(model_path: str, model_url: str = None, command_path: str = None, ext_filter=None, download_name=None, ext_blacklist=None) -> list: def load_models(model_path: str, model_url: str = None, command_path: str = None, ext_filter=None, download_name=None, ext_blacklist=None, hash_prefix=None) -> list:
""" """
A one-and done loader to try finding the desired models in specified directories. A one-and done loader to try finding the desired models in specified directories.
...@@ -49,6 +50,7 @@ def load_models(model_path: str, model_url: str = None, command_path: str = None ...@@ -49,6 +50,7 @@ def load_models(model_path: str, model_url: str = None, command_path: str = None
@param model_path: The location to store/find models in. @param model_path: The location to store/find models in.
@param command_path: A command-line argument to search for models in first. @param command_path: A command-line argument to search for models in first.
@param ext_filter: An optional list of filename extensions to filter by @param ext_filter: An optional list of filename extensions to filter by
@param hash_prefix: the expected sha256 of the model_url
@return: A list of paths containing the desired model(s) @return: A list of paths containing the desired model(s)
""" """
output = [] output = []
...@@ -78,7 +80,7 @@ def load_models(model_path: str, model_url: str = None, command_path: str = None ...@@ -78,7 +80,7 @@ def load_models(model_path: str, model_url: str = None, command_path: str = None
if model_url is not None and len(output) == 0: if model_url is not None and len(output) == 0:
if download_name is not None: if download_name is not None:
output.append(load_file_from_url(model_url, model_dir=places[0], file_name=download_name)) output.append(load_file_from_url(model_url, model_dir=places[0], file_name=download_name, hash_prefix=hash_prefix))
else: else:
output.append(model_url) output.append(model_url)
...@@ -137,6 +139,27 @@ def load_upscalers(): ...@@ -137,6 +139,27 @@ def load_upscalers():
key=lambda x: x.name.lower() if not isinstance(x.scaler, (UpscalerNone, UpscalerLanczos, UpscalerNearest)) else "" key=lambda x: x.name.lower() if not isinstance(x.scaler, (UpscalerNone, UpscalerLanczos, UpscalerNearest)) else ""
) )
# None: not loaded, False: failed to load, True: loaded
_spandrel_extra_init_state = None
def _init_spandrel_extra_archs() -> None:
"""
Try to initialize `spandrel_extra_arches` (exactly once).
"""
global _spandrel_extra_init_state
if _spandrel_extra_init_state is not None:
return
try:
import spandrel
import spandrel_extra_arches
spandrel.MAIN_REGISTRY.add(*spandrel_extra_arches.EXTRA_REGISTRY)
_spandrel_extra_init_state = True
except Exception:
logger.warning("Failed to load spandrel_extra_arches", exc_info=True)
_spandrel_extra_init_state = False
def load_spandrel_model( def load_spandrel_model(
path: str | os.PathLike, path: str | os.PathLike,
...@@ -146,11 +169,16 @@ def load_spandrel_model( ...@@ -146,11 +169,16 @@ def load_spandrel_model(
dtype: str | torch.dtype | None = None, dtype: str | torch.dtype | None = None,
expected_architecture: str | None = None, expected_architecture: str | None = None,
) -> spandrel.ModelDescriptor: ) -> spandrel.ModelDescriptor:
global _spandrel_extra_init_state
import spandrel import spandrel
_init_spandrel_extra_archs()
model_descriptor = spandrel.ModelLoader(device=device).load_from_file(str(path)) model_descriptor = spandrel.ModelLoader(device=device).load_from_file(str(path))
if expected_architecture and model_descriptor.architecture != expected_architecture: arch = model_descriptor.architecture
if expected_architecture and arch.name != expected_architecture:
logger.warning( logger.warning(
f"Model {path!r} is not a {expected_architecture!r} model (got {model_descriptor.architecture!r})", f"Model {path!r} is not a {expected_architecture!r} model (got {arch.name!r})",
) )
half = False half = False
if prefer_half: if prefer_half:
...@@ -164,6 +192,6 @@ def load_spandrel_model( ...@@ -164,6 +192,6 @@ def load_spandrel_model(
model_descriptor.model.eval() model_descriptor.model.eval()
logger.debug( logger.debug(
"Loaded %s from %s (device=%s, half=%s, dtype=%s)", "Loaded %s from %s (device=%s, half=%s, dtype=%s)",
model_descriptor, path, device, half, dtype, arch, path, device, half, dtype,
) )
return model_descriptor return model_descriptor
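`hash_prefix` above is forwarded to `torch.hub.download_url_to_file`, which computes the SHA256 of the downloaded file and raises if the digest does not start with the given prefix, so truncated or tampered downloads are rejected. A hypothetical call to the helper (URL, directory and hash are placeholders, not real model locations):

```python
from modules import modelloader

path = modelloader.load_file_from_url(
    url="https://example.com/some_model.safetensors",  # placeholder URL
    model_dir="models/ESRGAN",                          # placeholder directory
    file_name="some_model.safetensors",
    hash_prefix="0123abcd",  # first hex digits of the expected sha256 (placeholder)
)
print(path)
```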
...@@ -323,7 +323,7 @@ def model_wrapper( ...@@ -323,7 +323,7 @@ def model_wrapper(
def model_fn(x, t_continuous, condition, unconditional_condition): def model_fn(x, t_continuous, condition, unconditional_condition):
""" """
The noise predicition model function that is used for DPM-Solver. The noise prediction model function that is used for DPM-Solver.
""" """
if t_continuous.reshape((-1,)).shape[0] == 1: if t_continuous.reshape((-1,)).shape[0] == 1:
t_continuous = t_continuous.expand((x.shape[0])) t_continuous = t_continuous.expand((x.shape[0]))
......
import os
import safetensors
import torch
import typing
from transformers import CLIPTokenizer, T5TokenizerFast
from modules import shared, devices, modelloader, sd_hijack_clip, prompt_parser
from modules.models.sd3.other_impls import SDClipModel, SDXLClipG, T5XXLModel, SD3Tokenizer
class SafetensorsMapping(typing.Mapping):
def __init__(self, file):
self.file = file
def __len__(self):
return len(self.file.keys())
def __iter__(self):
for key in self.file.keys():
yield key
def __getitem__(self, key):
return self.file.get_tensor(key)
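`SafetensorsMapping` exposes a `safetensors.safe_open` handle through the Mapping interface that `load_state_dict` accepts, so weights can be loaded straight from a memory-mapped safetensors file. A hedged, self-contained usage sketch (the file name is a throwaway created by the example itself):

```python
import torch
import safetensors
from safetensors.torch import save_file

# Write a tiny safetensors file so the example is self-contained.
linear = torch.nn.Linear(4, 4)
save_file({k: v.contiguous() for k, v in linear.state_dict().items()}, "linear.safetensors")

fresh = torch.nn.Linear(4, 4)
with safetensors.safe_open("linear.safetensors", framework="pt") as file:
    fresh.load_state_dict(SafetensorsMapping(file))  # the Mapping wrapper defined above
```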
CLIPL_URL = "https://huggingface.co/AUTOMATIC/stable-diffusion-3-medium-text-encoders/resolve/main/clip_l.safetensors"
CLIPL_CONFIG = {
"hidden_act": "quick_gelu",
"hidden_size": 768,
"intermediate_size": 3072,
"num_attention_heads": 12,
"num_hidden_layers": 12,
}
CLIPG_URL = "https://huggingface.co/AUTOMATIC/stable-diffusion-3-medium-text-encoders/resolve/main/clip_g.safetensors"
CLIPG_CONFIG = {
"hidden_act": "gelu",
"hidden_size": 1280,
"intermediate_size": 5120,
"num_attention_heads": 20,
"num_hidden_layers": 32,
"textual_inversion_key": "clip_g",
}
T5_URL = "https://huggingface.co/AUTOMATIC/stable-diffusion-3-medium-text-encoders/resolve/main/t5xxl_fp16.safetensors"
T5_CONFIG = {
"d_ff": 10240,
"d_model": 4096,
"num_heads": 64,
"num_layers": 24,
"vocab_size": 32128,
}
class Sd3ClipLG(sd_hijack_clip.TextConditionalModel):
def __init__(self, clip_l, clip_g):
super().__init__()
self.clip_l = clip_l
self.clip_g = clip_g
self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
empty = self.tokenizer('')["input_ids"]
self.id_start = empty[0]
self.id_end = empty[1]
self.id_pad = empty[1]
self.return_pooled = True
def tokenize(self, texts):
return self.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"]
def encode_with_transformers(self, tokens):
tokens_g = tokens.clone()
for batch_pos in range(tokens_g.shape[0]):
index = tokens_g[batch_pos].cpu().tolist().index(self.id_end)
tokens_g[batch_pos, index+1:tokens_g.shape[1]] = 0
l_out, l_pooled = self.clip_l(tokens)
g_out, g_pooled = self.clip_g(tokens_g)
lg_out = torch.cat([l_out, g_out], dim=-1)
lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1]))
vector_out = torch.cat((l_pooled, g_pooled), dim=-1)
lg_out.pooled = vector_out
return lg_out
def encode_embedding_init_text(self, init_text, nvpt):
return torch.zeros((nvpt, 768+1280), device=devices.device) # XXX
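`encode_with_transformers` above concatenates the 768-dim CLIP-L hidden states with the 1280-dim CLIP-G hidden states and zero-pads the 2048-dim result up to the 4096 channels used alongside T5, while the pooled vectors are concatenated unpadded. A dummy-tensor walk-through of those shapes (random tensors stand in for the real encoder outputs):

```python
import torch

batch, tokens = 1, 77
l_out = torch.randn(batch, tokens, 768)    # CLIP-L hidden states
g_out = torch.randn(batch, tokens, 1280)   # CLIP-G hidden states
l_pooled = torch.randn(batch, 768)
g_pooled = torch.randn(batch, 1280)

lg_out = torch.cat([l_out, g_out], dim=-1)                              # (1, 77, 2048)
lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1]))  # (1, 77, 4096)
vector_out = torch.cat((l_pooled, g_pooled), dim=-1)                    # (1, 2048)

print(lg_out.shape, vector_out.shape)
```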
class Sd3T5(torch.nn.Module):
def __init__(self, t5xxl):
super().__init__()
self.t5xxl = t5xxl
self.tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl")
empty = self.tokenizer('', padding='max_length', max_length=2)["input_ids"]
self.id_end = empty[0]
self.id_pad = empty[1]
def tokenize(self, texts):
return self.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"]
def tokenize_line(self, line, *, target_token_count=None):
if shared.opts.emphasis != "None":
parsed = prompt_parser.parse_prompt_attention(line)
else:
parsed = [[line, 1.0]]
tokenized = self.tokenize([text for text, _ in parsed])
tokens = []
multipliers = []
for text_tokens, (text, weight) in zip(tokenized, parsed):
if text == 'BREAK' and weight == -1:
continue
tokens += text_tokens
multipliers += [weight] * len(text_tokens)
tokens += [self.id_end]
multipliers += [1.0]
if target_token_count is not None:
if len(tokens) < target_token_count:
tokens += [self.id_pad] * (target_token_count - len(tokens))
multipliers += [1.0] * (target_token_count - len(tokens))
else:
tokens = tokens[0:target_token_count]
multipliers = multipliers[0:target_token_count]
return tokens, multipliers
def forward(self, texts, *, token_count):
if not self.t5xxl or not shared.opts.sd3_enable_t5:
return torch.zeros((len(texts), token_count, 4096), device=devices.device, dtype=devices.dtype)
tokens_batch = []
for text in texts:
tokens, multipliers = self.tokenize_line(text, target_token_count=token_count)
tokens_batch.append(tokens)
t5_out, t5_pooled = self.t5xxl(tokens_batch)
return t5_out
def encode_embedding_init_text(self, init_text, nvpt):
return torch.zeros((nvpt, 4096), device=devices.device) # XXX
class SD3Cond(torch.nn.Module):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.tokenizer = SD3Tokenizer()
with torch.no_grad():
self.clip_g = SDXLClipG(CLIPG_CONFIG, device="cpu", dtype=devices.dtype)
self.clip_l = SDClipModel(layer="hidden", layer_idx=-2, device="cpu", dtype=devices.dtype, layer_norm_hidden_state=False, return_projected_pooled=False, textmodel_json_config=CLIPL_CONFIG)
if shared.opts.sd3_enable_t5:
self.t5xxl = T5XXLModel(T5_CONFIG, device="cpu", dtype=devices.dtype)
else:
self.t5xxl = None
self.model_lg = Sd3ClipLG(self.clip_l, self.clip_g)
self.model_t5 = Sd3T5(self.t5xxl)
def forward(self, prompts: list[str]):
with devices.without_autocast():
lg_out, vector_out = self.model_lg(prompts)
t5_out = self.model_t5(prompts, token_count=lg_out.shape[1])
lgt_out = torch.cat([lg_out, t5_out], dim=-2)
return {
'crossattn': lgt_out,
'vector': vector_out,
}
def before_load_weights(self, state_dict):
clip_path = os.path.join(shared.models_path, "CLIP")
if 'text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight' not in state_dict:
clip_g_file = modelloader.load_file_from_url(CLIPG_URL, model_dir=clip_path, file_name="clip_g.safetensors")
with safetensors.safe_open(clip_g_file, framework="pt") as file:
self.clip_g.transformer.load_state_dict(SafetensorsMapping(file))
if 'text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight' not in state_dict:
clip_l_file = modelloader.load_file_from_url(CLIPL_URL, model_dir=clip_path, file_name="clip_l.safetensors")
with safetensors.safe_open(clip_l_file, framework="pt") as file:
self.clip_l.transformer.load_state_dict(SafetensorsMapping(file), strict=False)
if self.t5xxl and 'text_encoders.t5xxl.transformer.encoder.embed_tokens.weight' not in state_dict:
t5_file = modelloader.load_file_from_url(T5_URL, model_dir=clip_path, file_name="t5xxl_fp16.safetensors")
with safetensors.safe_open(t5_file, framework="pt") as file:
self.t5xxl.transformer.load_state_dict(SafetensorsMapping(file), strict=False)
def encode_embedding_init_text(self, init_text, nvpt):
return self.model_lg.encode_embedding_init_text(init_text, nvpt)
def tokenize(self, texts):
return self.model_lg.tokenize(texts)
def medvram_modules(self):
return [self.clip_g, self.clip_l, self.t5xxl]
def get_token_count(self, text):
_, token_count = self.model_lg.process_texts([text])
return token_count
def get_target_prompt_token_count(self, token_count):
return self.model_lg.get_target_prompt_token_count(token_count)
import contextlib
import torch
import k_diffusion
from modules.models.sd3.sd3_impls import BaseModel, SDVAE, SD3LatentFormat
from modules.models.sd3.sd3_cond import SD3Cond
from modules import shared, devices
class SD3Denoiser(k_diffusion.external.DiscreteSchedule):
def __init__(self, inner_model, sigmas):
super().__init__(sigmas, quantize=shared.opts.enable_quantization)
self.inner_model = inner_model
def forward(self, input, sigma, **kwargs):
return self.inner_model.apply_model(input, sigma, **kwargs)
class SD3Inferencer(torch.nn.Module):
def __init__(self, state_dict, shift=3, use_ema=False):
super().__init__()
self.shift = shift
with torch.no_grad():
self.model = BaseModel(shift=shift, state_dict=state_dict, prefix="model.diffusion_model.", device="cpu", dtype=devices.dtype)
self.first_stage_model = SDVAE(device="cpu", dtype=devices.dtype_vae)
self.first_stage_model.dtype = self.model.diffusion_model.dtype
self.alphas_cumprod = 1 / (self.model.model_sampling.sigmas ** 2 + 1)
self.text_encoders = SD3Cond()
self.cond_stage_key = 'txt'
self.parameterization = "eps"
self.model.conditioning_key = "crossattn"
self.latent_format = SD3LatentFormat()
self.latent_channels = 16
@property
def cond_stage_model(self):
return self.text_encoders
def before_load_weights(self, state_dict):
self.cond_stage_model.before_load_weights(state_dict)
def ema_scope(self):
return contextlib.nullcontext()
def get_learned_conditioning(self, batch: list[str]):
return self.cond_stage_model(batch)
def apply_model(self, x, t, cond):
return self.model(x, t, c_crossattn=cond['crossattn'], y=cond['vector'])
def decode_first_stage(self, latent):
latent = self.latent_format.process_out(latent)
return self.first_stage_model.decode(latent)
def encode_first_stage(self, image):
latent = self.first_stage_model.encode(image)
return self.latent_format.process_in(latent)
def get_first_stage_encoding(self, x):
return x
def create_denoiser(self):
return SD3Denoiser(self, self.model.model_sampling.sigmas)
def medvram_fields(self):
return [
(self, 'first_stage_model'),
(self, 'text_encoders'),
(self, 'model'),
]
def add_noise_to_latent(self, x, noise, amount):
return x * (1 - amount) + noise * amount
def fix_dimensions(self, width, height):
return width // 16 * 16, height // 16 * 16
def diffusers_weight_mapping(self):
for i in range(self.model.depth):
yield f"transformer.transformer_blocks.{i}.attn.to_q", f"diffusion_model_joint_blocks_{i}_x_block_attn_qkv_q_proj"
yield f"transformer.transformer_blocks.{i}.attn.to_k", f"diffusion_model_joint_blocks_{i}_x_block_attn_qkv_k_proj"
yield f"transformer.transformer_blocks.{i}.attn.to_v", f"diffusion_model_joint_blocks_{i}_x_block_attn_qkv_v_proj"
yield f"transformer.transformer_blocks.{i}.attn.to_out.0", f"diffusion_model_joint_blocks_{i}_x_block_attn_proj"
yield f"transformer.transformer_blocks.{i}.attn.add_q_proj", f"diffusion_model_joint_blocks_{i}_context_block.attn_qkv_q_proj"
yield f"transformer.transformer_blocks.{i}.attn.add_k_proj", f"diffusion_model_joint_blocks_{i}_context_block.attn_qkv_k_proj"
yield f"transformer.transformer_blocks.{i}.attn.add_v_proj", f"diffusion_model_joint_blocks_{i}_context_block.attn_qkv_v_proj"
yield f"transformer.transformer_blocks.{i}.attn.add_out_proj.0", f"diffusion_model_joint_blocks_{i}_context_block_attn_proj"
...@@ -24,11 +24,12 @@ default_sd_model_file = sd_model_file ...@@ -24,11 +24,12 @@ default_sd_model_file = sd_model_file
# Parse the --data-dir flag first so we can use it as a base for our other argument default values # Parse the --data-dir flag first so we can use it as a base for our other argument default values
parser_pre = argparse.ArgumentParser(add_help=False) parser_pre = argparse.ArgumentParser(add_help=False)
parser_pre.add_argument("--data-dir", type=str, default=os.path.dirname(modules_path), help="base path where all user data is stored", ) parser_pre.add_argument("--data-dir", type=str, default=os.path.dirname(modules_path), help="base path where all user data is stored", )
parser_pre.add_argument("--models-dir", type=str, default=None, help="base path where models are stored; overrides --data-dir", )
cmd_opts_pre = parser_pre.parse_known_args()[0] cmd_opts_pre = parser_pre.parse_known_args()[0]
data_path = cmd_opts_pre.data_dir data_path = cmd_opts_pre.data_dir
models_path = os.path.join(data_path, "models") models_path = cmd_opts_pre.models_dir if cmd_opts_pre.models_dir else os.path.join(data_path, "models")
extensions_dir = os.path.join(data_path, "extensions") extensions_dir = os.path.join(data_path, "extensions")
extensions_builtin_dir = os.path.join(script_path, "extensions-builtin") extensions_builtin_dir = os.path.join(script_path, "extensions-builtin")
config_states_dir = os.path.join(script_path, "config_states") config_states_dir = os.path.join(script_path, "config_states")
......
...@@ -51,7 +51,7 @@ def run_postprocessing(extras_mode, image, image_folder, input_dir, output_dir, ...@@ -51,7 +51,7 @@ def run_postprocessing(extras_mode, image, image_folder, input_dir, output_dir,
shared.state.textinfo = name shared.state.textinfo = name
shared.state.skipped = False shared.state.skipped = False
if shared.state.interrupted: if shared.state.interrupted or shared.state.stopping_generation:
break break
if isinstance(image_placeholder, str): if isinstance(image_placeholder, str):
...@@ -62,11 +62,13 @@ def run_postprocessing(extras_mode, image, image_folder, input_dir, output_dir, ...@@ -62,11 +62,13 @@ def run_postprocessing(extras_mode, image, image_folder, input_dir, output_dir,
else: else:
image_data = image_placeholder image_data = image_placeholder
image_data = image_data if image_data.mode in ("RGBA", "RGB") else image_data.convert("RGB")
parameters, existing_pnginfo = images.read_info_from_image(image_data) parameters, existing_pnginfo = images.read_info_from_image(image_data)
if parameters: if parameters:
existing_pnginfo["parameters"] = parameters existing_pnginfo["parameters"] = parameters
initial_pp = scripts_postprocessing.PostprocessedImage(image_data if image_data.mode in ("RGBA", "RGB") else image_data.convert("RGB")) initial_pp = scripts_postprocessing.PostprocessedImage(image_data)
scripts.scripts_postproc.run(initial_pp, args) scripts.scripts_postproc.run(initial_pp, args)
......
import torch
from modules import shared, ui_gradio_extensions
class Profiler:
def __init__(self):
if not shared.opts.profiling_enable:
self.profiler = None
return
activities = []
if "CPU" in shared.opts.profiling_activities:
activities.append(torch.profiler.ProfilerActivity.CPU)
if "CUDA" in shared.opts.profiling_activities:
activities.append(torch.profiler.ProfilerActivity.CUDA)
if not activities:
self.profiler = None
return
self.profiler = torch.profiler.profile(
activities=activities,
record_shapes=shared.opts.profiling_record_shapes,
profile_memory=shared.opts.profiling_profile_memory,
with_stack=shared.opts.profiling_with_stack
)
def __enter__(self):
if self.profiler:
self.profiler.__enter__()
return self
def __exit__(self, exc_type, exc, exc_tb):
if self.profiler:
shared.state.textinfo = "Finishing profile..."
self.profiler.__exit__(exc_type, exc, exc_tb)
self.profiler.export_chrome_trace(shared.opts.profiling_filename)
def webpath():
return ui_gradio_extensions.webpath(shared.opts.profiling_filename)
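The `Profiler` class above is a thin context manager around `torch.profiler.profile` that exports a Chrome trace when profiling is enabled in settings. The same idea in plain `torch.profiler`, without the webui options plumbing:

```python
import torch

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],
    record_shapes=True,
    profile_memory=True,
) as prof:
    x = torch.randn(256, 256)
    (x @ x).sum()

prof.export_chrome_trace("trace.json")  # open in chrome://tracing or ui.perfetto.dev
```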
...@@ -268,7 +268,7 @@ def get_multicond_learned_conditioning(model, prompts, steps, hires_steps=None, ...@@ -268,7 +268,7 @@ def get_multicond_learned_conditioning(model, prompts, steps, hires_steps=None,
class DictWithShape(dict): class DictWithShape(dict):
def __init__(self, x, shape): def __init__(self, x, shape=None):
super().__init__() super().__init__()
self.update(x) self.update(x)
......
...@@ -64,8 +64,8 @@ class RestrictedUnpickler(pickle.Unpickler): ...@@ -64,8 +64,8 @@ class RestrictedUnpickler(pickle.Unpickler):
raise Exception(f"global '{module}/{name}' is forbidden") raise Exception(f"global '{module}/{name}' is forbidden")
# Regular expression that accepts 'dirname/version', 'dirname/data.pkl', and 'dirname/data/<number>' # Regular expression that accepts 'dirname/version', 'dirname/byteorder', 'dirname/data.pkl', '.data/serialization_id', and 'dirname/data/<number>'
allowed_zip_names_re = re.compile(r"^([^/]+)/((data/\d+)|version|(data\.pkl))$") allowed_zip_names_re = re.compile(r"^([^/]+)/((data/\d+)|version|byteorder|.data/serialization_id|(data\.pkl))$")
data_pkl_re = re.compile(r"^([^/]+)/data\.pkl$") data_pkl_re = re.compile(r"^([^/]+)/data\.pkl$")
def check_zip_filenames(filename, names): def check_zip_filenames(filename, names):
......
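The widened allow-list above additionally accepts the `byteorder` and `.data/serialization_id` entries that newer torch checkpoint archives can contain, while still rejecting any other member name inside the zip. A quick standalone check of what the pattern matches:

```python
import re

allowed_zip_names_re = re.compile(r"^([^/]+)/((data/\d+)|version|byteorder|.data/serialization_id|(data\.pkl))$")

for name in [
    "archive/data.pkl",
    "archive/version",
    "archive/byteorder",
    "archive/data/0",
    "archive/.data/serialization_id",
    "archive/code/evil.py",
]:
    print(name, bool(allowed_zip_names_re.match(name)))
# Only the last entry fails to match.
```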
...@@ -187,6 +187,13 @@ class Script: ...@@ -187,6 +187,13 @@ class Script:
""" """
pass pass
def process_before_every_sampling(self, p, *args, **kwargs):
"""
Similar to process(), but called before every sampling pass.
If you use high-res fix, this will be called twice.
"""
pass
def process_batch(self, p, *args, **kwargs): def process_batch(self, p, *args, **kwargs):
""" """
Same as process(), but called for every batch. Same as process(), but called for every batch.
...@@ -826,6 +833,14 @@ class ScriptRunner: ...@@ -826,6 +833,14 @@ class ScriptRunner:
except Exception: except Exception:
errors.report(f"Error running process: {script.filename}", exc_info=True) errors.report(f"Error running process: {script.filename}", exc_info=True)
def process_before_every_sampling(self, p, **kwargs):
for script in self.ordered_scripts('process_before_every_sampling'):
try:
script_args = p.script_args[script.args_from:script.args_to]
script.process_before_every_sampling(p, *script_args, **kwargs)
except Exception:
errors.report(f"Error running process_before_every_sampling: {script.filename}", exc_info=True)
def before_process_batch(self, p, **kwargs): def before_process_batch(self, p, **kwargs):
for script in self.ordered_scripts('before_process_batch'): for script in self.ordered_scripts('before_process_batch'):
try: try:
......
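An extension opts into the new callback simply by overriding it; the runner passes the script's own slice of UI args plus any keyword arguments from the pipeline. A minimal hypothetical script using the `Script` API shown above (class name and log message are illustrative):

```python
import modules.scripts as scripts

class LogBeforeSamplingScript(scripts.Script):
    def title(self):
        return "Log before sampling"

    def show(self, is_img2img):
        return scripts.AlwaysVisible

    def process_before_every_sampling(self, p, *args, **kwargs):
        # Runs before every sampling pass; with high-res fix this fires twice.
        print(f"about to sample at {p.width}x{p.height}")
```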
...@@ -325,7 +325,10 @@ class StableDiffusionModelHijack: ...@@ -325,7 +325,10 @@ class StableDiffusionModelHijack:
if self.clip is None: if self.clip is None:
return "-", "-" return "-", "-"
_, token_count = self.clip.process_texts([text]) if hasattr(self.clip, 'get_token_count'):
token_count = self.clip.get_token_count(text)
else:
_, token_count = self.clip.process_texts([text])
return token_count, self.clip.get_target_prompt_token_count(token_count) return token_count, self.clip.get_target_prompt_token_count(token_count)
...@@ -356,13 +359,28 @@ class EmbeddingsWithFixes(torch.nn.Module): ...@@ -356,13 +359,28 @@ class EmbeddingsWithFixes(torch.nn.Module):
vec = embedding.vec[self.textual_inversion_key] if isinstance(embedding.vec, dict) else embedding.vec vec = embedding.vec[self.textual_inversion_key] if isinstance(embedding.vec, dict) else embedding.vec
emb = devices.cond_cast_unet(vec) emb = devices.cond_cast_unet(vec)
emb_len = min(tensor.shape[0] - offset - 1, emb.shape[0]) emb_len = min(tensor.shape[0] - offset - 1, emb.shape[0])
tensor = torch.cat([tensor[0:offset + 1], emb[0:emb_len], tensor[offset + 1 + emb_len:]]) tensor = torch.cat([tensor[0:offset + 1], emb[0:emb_len], tensor[offset + 1 + emb_len:]]).to(dtype=inputs_embeds.dtype)
vecs.append(tensor) vecs.append(tensor)
return torch.stack(vecs) return torch.stack(vecs)
class TextualInversionEmbeddings(torch.nn.Embedding):
def __init__(self, num_embeddings: int, embedding_dim: int, textual_inversion_key='clip_l', **kwargs):
super().__init__(num_embeddings, embedding_dim, **kwargs)
self.embeddings = model_hijack
self.textual_inversion_key = textual_inversion_key
@property
def wrapped(self):
return super().forward
def forward(self, input_ids):
return EmbeddingsWithFixes.forward(self, input_ids)
def add_circular_option_to_conv_2d(): def add_circular_option_to_conv_2d():
conv2d_constructor = torch.nn.Conv2d.__init__ conv2d_constructor = torch.nn.Conv2d.__init__
......
...@@ -27,24 +27,21 @@ chunk. Those objects are found in PromptChunk.fixes and, are placed into FrozenC ...@@ -27,24 +27,21 @@ chunk. Those objects are found in PromptChunk.fixes and, are placed into FrozenC
are applied by sd_hijack.EmbeddingsWithFixes's forward function.""" are applied by sd_hijack.EmbeddingsWithFixes's forward function."""
class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): class TextConditionalModel(torch.nn.Module):
"""A pytorch module that is a wrapper for FrozenCLIPEmbedder module. it enhances FrozenCLIPEmbedder, making it possible to def __init__(self):
have unlimited prompt length and assign weights to tokens in prompt.
"""
def __init__(self, wrapped, hijack):
super().__init__() super().__init__()
self.wrapped = wrapped self.hijack = sd_hijack.model_hijack
"""Original FrozenCLIPEmbedder module; can also be FrozenOpenCLIPEmbedder or xlmr.BertSeriesModelWithTransformation,
depending on model."""
self.hijack: sd_hijack.StableDiffusionModelHijack = hijack
self.chunk_length = 75 self.chunk_length = 75
self.is_trainable = getattr(wrapped, 'is_trainable', False) self.is_trainable = False
self.input_key = getattr(wrapped, 'input_key', 'txt') self.input_key = 'txt'
self.legacy_ucg_val = None self.return_pooled = False
self.comma_token = None
self.id_start = None
self.id_end = None
self.id_pad = None
def empty_chunk(self): def empty_chunk(self):
"""creates an empty PromptChunk and returns it""" """creates an empty PromptChunk and returns it"""
...@@ -210,10 +207,6 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): ...@@ -210,10 +207,6 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module):
is when you do prompt editing: "a picture of a [cat:dog:0.4] eating ice cream" is when you do prompt editing: "a picture of a [cat:dog:0.4] eating ice cream"
""" """
if opts.use_old_emphasis_implementation:
import modules.sd_hijack_clip_old
return modules.sd_hijack_clip_old.forward_old(self, texts)
batch_chunks, token_count = self.process_texts(texts) batch_chunks, token_count = self.process_texts(texts)
used_embeddings = {} used_embeddings = {}
...@@ -252,7 +245,7 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): ...@@ -252,7 +245,7 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module):
if any(x for x in texts if "(" in x or "[" in x) and opts.emphasis != "Original": if any(x for x in texts if "(" in x or "[" in x) and opts.emphasis != "Original":
self.hijack.extra_generation_params["Emphasis"] = opts.emphasis self.hijack.extra_generation_params["Emphasis"] = opts.emphasis
if getattr(self.wrapped, 'return_pooled', False): if self.return_pooled:
return torch.hstack(zs), zs[0].pooled return torch.hstack(zs), zs[0].pooled
else: else:
return torch.hstack(zs) return torch.hstack(zs)
...@@ -292,6 +285,34 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): ...@@ -292,6 +285,34 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module):
return z return z
class FrozenCLIPEmbedderWithCustomWordsBase(TextConditionalModel):
"""A pytorch module that is a wrapper for FrozenCLIPEmbedder module. it enhances FrozenCLIPEmbedder, making it possible to
have unlimited prompt length and assign weights to tokens in prompt.
"""
def __init__(self, wrapped, hijack):
super().__init__()
self.hijack = hijack
self.wrapped = wrapped
"""Original FrozenCLIPEmbedder module; can also be FrozenOpenCLIPEmbedder or xlmr.BertSeriesModelWithTransformation,
depending on model."""
self.is_trainable = getattr(wrapped, 'is_trainable', False)
self.input_key = getattr(wrapped, 'input_key', 'txt')
self.return_pooled = getattr(self.wrapped, 'return_pooled', False)
self.legacy_ucg_val = None # for sgm codebase
def forward(self, texts):
if opts.use_old_emphasis_implementation:
import modules.sd_hijack_clip_old
return modules.sd_hijack_clip_old.forward_old(self, texts)
return super().forward(texts)
class FrozenCLIPEmbedderWithCustomWords(FrozenCLIPEmbedderWithCustomWordsBase): class FrozenCLIPEmbedderWithCustomWords(FrozenCLIPEmbedderWithCustomWordsBase):
def __init__(self, wrapped, hijack): def __init__(self, wrapped, hijack):
super().__init__(wrapped, hijack) super().__init__(wrapped, hijack)
...@@ -353,7 +374,9 @@ class FrozenCLIPEmbedderForSDXLWithCustomWords(FrozenCLIPEmbedderWithCustomWords ...@@ -353,7 +374,9 @@ class FrozenCLIPEmbedderForSDXLWithCustomWords(FrozenCLIPEmbedderWithCustomWords
def encode_with_transformers(self, tokens): def encode_with_transformers(self, tokens):
outputs = self.wrapped.transformer(input_ids=tokens, output_hidden_states=self.wrapped.layer == "hidden") outputs = self.wrapped.transformer(input_ids=tokens, output_hidden_states=self.wrapped.layer == "hidden")
if self.wrapped.layer == "last": if opts.sdxl_clip_l_skip is True:
z = outputs.hidden_states[-opts.CLIP_stop_at_last_layers]
elif self.wrapped.layer == "last":
z = outputs.last_hidden_state z = outputs.last_hidden_state
else: else:
z = outputs.hidden_states[self.wrapped.layer_idx] z = outputs.hidden_states[self.wrapped.layer_idx]
......
...@@ -486,7 +486,8 @@ def xformers_attention_forward(self, x, context=None, mask=None, **kwargs): ...@@ -486,7 +486,8 @@ def xformers_attention_forward(self, x, context=None, mask=None, **kwargs):
k_in = self.to_k(context_k) k_in = self.to_k(context_k)
v_in = self.to_v(context_v) v_in = self.to_v(context_v)
q, k, v = (rearrange(t, 'b n (h d) -> b n h d', h=h) for t in (q_in, k_in, v_in)) q, k, v = (t.reshape(t.shape[0], t.shape[1], h, -1) for t in (q_in, k_in, v_in))
del q_in, k_in, v_in del q_in, k_in, v_in
dtype = q.dtype dtype = q.dtype
...@@ -497,7 +498,8 @@ def xformers_attention_forward(self, x, context=None, mask=None, **kwargs): ...@@ -497,7 +498,8 @@ def xformers_attention_forward(self, x, context=None, mask=None, **kwargs):
out = out.to(dtype) out = out.to(dtype)
out = rearrange(out, 'b n h d -> b n (h d)', h=h) b, n, h, d = out.shape
out = out.reshape(b, n, h * d)
return self.to_out(out) return self.to_out(out)
......
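The attention change above replaces the einops `rearrange` calls with plain `reshape`, which is equivalent here because splitting or merging the last dimension preserves the row-major memory layout. A quick equivalence check with dummy tensors:

```python
import torch
from einops import rearrange

b, n, h, d = 2, 16, 8, 40
t = torch.randn(b, n, h * d)

split_einops = rearrange(t, 'b n (h d) -> b n h d', h=h)
split_reshape = t.reshape(t.shape[0], t.shape[1], h, -1)
print(torch.equal(split_einops, split_reshape))  # True

out = torch.randn(b, n, h, d)
merged_einops = rearrange(out, 'b n h d -> b n (h d)')
merged_reshape = out.reshape(b, n, h * d)
print(torch.equal(merged_einops, merged_reshape))  # True
```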
import torch import torch
from packaging import version from packaging import version
from einops import repeat
import math
from modules import devices from modules import devices
from modules.sd_hijack_utils import CondFunc from modules.sd_hijack_utils import CondFunc
...@@ -36,7 +38,7 @@ th = TorchHijackForUnet() ...@@ -36,7 +38,7 @@ th = TorchHijackForUnet()
# Below are monkey patches to enable upcasting a float16 UNet for float32 sampling # Below are monkey patches to enable upcasting a float16 UNet for float32 sampling
def apply_model(orig_func, self, x_noisy, t, cond, **kwargs): def apply_model(orig_func, self, x_noisy, t, cond, **kwargs):
"""Always make sure inputs to unet are in correct dtype."""
if isinstance(cond, dict): if isinstance(cond, dict):
for y in cond.keys(): for y in cond.keys():
if isinstance(cond[y], list): if isinstance(cond[y], list):
...@@ -45,7 +47,59 @@ def apply_model(orig_func, self, x_noisy, t, cond, **kwargs): ...@@ -45,7 +47,59 @@ def apply_model(orig_func, self, x_noisy, t, cond, **kwargs):
cond[y] = cond[y].to(devices.dtype_unet) if isinstance(cond[y], torch.Tensor) else cond[y] cond[y] = cond[y].to(devices.dtype_unet) if isinstance(cond[y], torch.Tensor) else cond[y]
with devices.autocast(): with devices.autocast():
return orig_func(self, x_noisy.to(devices.dtype_unet), t.to(devices.dtype_unet), cond, **kwargs).float() result = orig_func(self, x_noisy.to(devices.dtype_unet), t.to(devices.dtype_unet), cond, **kwargs)
if devices.unet_needs_upcast:
return result.float()
else:
return result
# Monkey patch to create the timestep embedding tensor directly on the target device, avoiding a blocking host-to-device copy.
def timestep_embedding(_, timesteps, dim, max_period=10000, repeat_only=False):
"""
Create sinusoidal timestep embeddings.
:param timesteps: a 1-D Tensor of N indices, one per batch element.
These may be fractional.
:param dim: the dimension of the output.
:param max_period: controls the minimum frequency of the embeddings.
:return: an [N x dim] Tensor of positional embeddings.
"""
if not repeat_only:
half = dim // 2
freqs = torch.exp(
-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device) / half
)
args = timesteps[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
if dim % 2:
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
else:
embedding = repeat(timesteps, 'b -> b d', d=dim)
return embedding
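The patched `timestep_embedding` above builds the standard sinusoidal embedding, with frequencies exp(-ln(max_period) * i / half) for i in [0, half), directly on the timesteps' device so no extra host-to-device copy is needed. A tiny shape check of the same math (CPU here for portability):

```python
import math
import torch

timesteps = torch.tensor([0.0, 10.0, 999.0])
dim, max_period = 8, 10000
half = dim // 2

freqs = torch.exp(
    -math.log(max_period) * torch.arange(half, dtype=torch.float32, device=timesteps.device) / half
)
args = timesteps[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
print(embedding.shape)  # torch.Size([3, 8])
```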
# Monkey patch for SpatialTransformer that removes unnecessary contiguous calls.
# Prevents a lot of unnecessary aten::copy_ calls.
def spatial_transformer_forward(_, self, x: torch.Tensor, context=None):
# note: if no context is given, cross-attention defaults to self-attention
if not isinstance(context, list):
context = [context]
b, c, h, w = x.shape
x_in = x
x = self.norm(x)
if not self.use_linear:
x = self.proj_in(x)
x = x.permute(0, 2, 3, 1).reshape(b, h * w, c)
if self.use_linear:
x = self.proj_in(x)
for i, block in enumerate(self.transformer_blocks):
x = block(x, context=context[i])
if self.use_linear:
x = self.proj_out(x)
x = x.view(b, h, w, c).permute(0, 3, 1, 2)
if not self.use_linear:
x = self.proj_out(x)
return x + x_in
class GELUHijack(torch.nn.GELU, torch.nn.Module):
@@ -64,12 +118,15 @@ def hijack_ddpm_edit():
if not ddpm_edit_hijack:
CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.decode_first_stage', first_stage_sub, first_stage_cond)
CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.encode_first_stage', first_stage_sub, first_stage_cond)
ddpm_edit_hijack = CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.apply_model', apply_model, unet_needs_upcast)
ddpm_edit_hijack = CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.apply_model', apply_model)
unet_needs_upcast = lambda *args, **kwargs: devices.unet_needs_upcast
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.apply_model', apply_model, unet_needs_upcast)
CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding)
CondFunc('ldm.modules.attention.SpatialTransformer.forward', spatial_transformer_forward)
CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', lambda orig_func, timesteps, *args, **kwargs: orig_func(timesteps, *args, **kwargs).to(torch.float32 if timesteps.dtype == torch.int64 else devices.dtype_unet), unet_needs_upcast)
if version.parse(torch.__version__) <= version.parse("1.13.2") or torch.cuda.is_available():
CondFunc('ldm.modules.diffusionmodules.util.GroupNorm32.forward', lambda orig_func, self, *args, **kwargs: orig_func(self.float(), *args, **kwargs), unet_needs_upcast)
CondFunc('ldm.modules.attention.GEGLU.forward', lambda orig_func, self, x: orig_func(self.float(), x.float()).to(devices.dtype_unet), unet_needs_upcast)
@@ -81,5 +138,17 @@ CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.decode_first_stage', first_s
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.encode_first_stage', first_stage_sub, first_stage_cond)
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding', lambda orig_func, *args, **kwargs: orig_func(*args, **kwargs).float(), first_stage_cond)
CondFunc('sgm.modules.diffusionmodules.wrappers.OpenAIWrapper.forward', apply_model, unet_needs_upcast)
CondFunc('sgm.modules.diffusionmodules.openaimodel.timestep_embedding', lambda orig_func, timesteps, *args, **kwargs: orig_func(timesteps, *args, **kwargs).to(torch.float32 if timesteps.dtype == torch.int64 else devices.dtype_unet), unet_needs_upcast)
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.apply_model', apply_model)
CondFunc('sgm.modules.diffusionmodules.wrappers.OpenAIWrapper.forward', apply_model)
def timestep_embedding_cast_result(orig_func, timesteps, *args, **kwargs):
if devices.unet_needs_upcast and timesteps.dtype == torch.int64:
dtype = torch.float32
else:
dtype = devices.dtype_unet
return orig_func(timesteps, *args, **kwargs).to(dtype=dtype)
CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding_cast_result)
CondFunc('sgm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding_cast_result)
import importlib
always_true_func = lambda *args, **kwargs: True
class CondFunc:
def __new__(cls, orig_func, sub_func, cond_func):
def __new__(cls, orig_func, sub_func, cond_func=always_true_func):
self = super(CondFunc, cls).__new__(cls)
if isinstance(orig_func, str):
func_path = orig_func.split('.')
@@ -20,13 +24,13 @@ class CondFunc:
print(f"Warning: Failed to resolve {orig_func} for CondFunc hijack")
pass
self.__init__(orig_func, sub_func, cond_func)
return lambda *args, **kwargs: self(*args, **kwargs)
def __init__(self, orig_func, sub_func, cond_func):
self.__orig_func = orig_func
self.__sub_func = sub_func
self.__cond_func = cond_func
def __call__(self, *args, **kwargs):
if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs):
return self.__sub_func(self.__orig_func, *args, **kwargs)
else:
return self.__orig_func(*args, **kwargs)
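To illustrate how CondFunc is meant to be used (a hedged, self-contained sketch with made-up names, not code from the repository): the first argument is a dotted path to the function to hijack, the second runs in its place and receives the original as its first argument, and the optional third decides per call whether to substitute.

```python
import sys
import types

# a throwaway module standing in for something like 'ldm.modules....'
demo = types.ModuleType("demo_module")
demo.square = lambda x: x * x
sys.modules["demo_module"] = demo

def patched_square(orig_func, x):
    return orig_func(x) + 1            # pretend this is the optimized path

def only_for_ints(orig_func, x):
    return isinstance(x, int)          # substitute only for int inputs

CondFunc("demo_module.square", patched_square, only_for_ints)

print(demo.square(3))    # 10 -> routed through patched_square
print(demo.square(2.5))  # 6.25 -> condition failed, the original ran
```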
@@ -23,6 +23,8 @@ config_inpainting = os.path.join(sd_configs_path, "v1-inpainting-inference.yaml"
config_instruct_pix2pix = os.path.join(sd_configs_path, "instruct-pix2pix.yaml")
config_alt_diffusion = os.path.join(sd_configs_path, "alt-diffusion-inference.yaml")
config_alt_diffusion_m18 = os.path.join(sd_configs_path, "alt-diffusion-m18-inference.yaml")
config_sd3 = os.path.join(sd_configs_path, "sd3-inference.yaml")
def is_using_v_parameterization_for_sd2(state_dict):
"""
@@ -31,11 +33,11 @@ def is_using_v_parameterization_for_sd2(state_dict):
import ldm.modules.diffusionmodules.openaimodel
device = devices.cpu
device = devices.device
with sd_disable_initialization.DisableInitialization():
unet = ldm.modules.diffusionmodules.openaimodel.UNetModel(
use_checkpoint=True,
use_checkpoint=False,
use_fp16=False,
image_size=32,
in_channels=4,
@@ -56,12 +58,13 @@ def is_using_v_parameterization_for_sd2(state_dict):
with torch.no_grad():
unet_sd = {k.replace("model.diffusion_model.", ""): v for k, v in state_dict.items() if "model.diffusion_model." in k}
unet.load_state_dict(unet_sd, strict=True)
unet.to(device=device, dtype=torch.float)
unet.to(device=device, dtype=devices.dtype_unet)
test_cond = torch.ones((1, 2, 1024), device=device) * 0.5
x_test = torch.ones((1, 4, 8, 8), device=device) * 0.5
out = (unet(x_test, torch.asarray([999], device=device), context=test_cond) - x_test).mean().item()
with devices.autocast():
out = (unet(x_test, torch.asarray([999], device=device), context=test_cond) - x_test).mean().cpu().item()
return out < -1
@@ -71,11 +74,15 @@ def guess_model_config_from_state_dict(sd, filename):
diffusion_model_input = sd.get('model.diffusion_model.input_blocks.0.0.weight', None)
sd2_variations_weight = sd.get('embedder.model.ln_final.weight', None)
if "model.diffusion_model.x_embedder.proj.weight" in sd:
return config_sd3
if sd.get('conditioner.embedders.1.model.ln_final.weight', None) is not None:
if diffusion_model_input.shape[1] == 9:
return config_sdxl_inpainting
else:
return config_sdxl
if sd.get('conditioner.embedders.0.model.ln_final.weight', None) is not None:
return config_sdxl_refiner
elif sd.get('depth_model.model.pretrained.act_postprocess3.0.project.0.bias', None) is not None:
@@ -99,7 +106,6 @@ def guess_model_config_from_state_dict(sd, filename):
if diffusion_model_input.shape[1] == 8:
return config_instruct_pix2pix
if sd.get('cond_stage_model.roberta.embeddings.word_embeddings.weight', None) is not None:
if sd.get('cond_stage_model.transformation.weight').size()[0] == 1024:
return config_alt_diffusion_m18
......
@@ -32,3 +32,9 @@ class WebuiSdModel(LatentDiffusion):
is_sd1: bool
"""True if the model's architecture is SD 1.x"""
is_sd3: bool
"""True if the model's architecture is SD 3"""
latent_channels: int
"""number of channels in the latent image representation; will be 16 for SD3 and 4 for other versions"""
@@ -35,11 +35,10 @@ def get_learned_conditioning(self: sgm.models.diffusion.DiffusionEngine, batch:
def apply_model(self: sgm.models.diffusion.DiffusionEngine, x, t, cond):
sd = self.model.state_dict()
diffusion_model_input = sd.get('diffusion_model.input_blocks.0.0.weight', None)
if diffusion_model_input is not None:
if diffusion_model_input.shape[1] == 9:
x = torch.cat([x] + cond['c_concat'], dim=1)
"""WARNING: This function is called once per denoising iteration. DO NOT add
expensive function calls such as `model.state_dict`. """
if self.is_sdxl_inpaint:
x = torch.cat([x] + cond['c_concat'], dim=1)
return self.model(x, t, cond)
......
from __future__ import annotations
import functools
import logging
from modules import sd_samplers_kdiffusion, sd_samplers_timesteps, sd_samplers_lcm, shared, sd_samplers_common, sd_schedulers
# imports for functions that previously were here and are used by other modules
@@ -98,7 +98,7 @@ def get_hr_scheduler_from_infotext(d: dict):
@functools.cache
def get_sampler_and_scheduler(sampler_name, scheduler_name):
def get_sampler_and_scheduler(sampler_name, scheduler_name, *, convert_automatic=True):
default_sampler = samplers[0]
found_scheduler = sd_schedulers.schedulers_map.get(scheduler_name, sd_schedulers.schedulers[0])
@@ -116,10 +116,17 @@ def get_sampler_and_scheduler(sampler_name, scheduler_name):
sampler = all_samplers_map.get(name, default_sampler)
# revert back to Automatic if it's the default scheduler for the selected sampler
if sampler.options.get('scheduler', None) == found_scheduler.name:
if convert_automatic and sampler.options.get('scheduler', None) == found_scheduler.name:
found_scheduler = sd_schedulers.schedulers[0]
return sampler.name, found_scheduler.label
def fix_p_invalid_sampler_and_scheduler(p):
i_sampler_name, i_scheduler = p.sampler_name, p.scheduler
p.sampler_name, p.scheduler = get_sampler_and_scheduler(p.sampler_name, p.scheduler, convert_automatic=False)
if p.sampler_name != i_sampler_name or i_scheduler != p.scheduler:
logging.warning(f'Sampler Scheduler autocorrection: "{i_sampler_name}" -> "{p.sampler_name}", "{i_scheduler}" -> "{p.scheduler}"')
set_samplers()
import torch
from modules import prompt_parser, devices, sd_samplers_common
from modules import prompt_parser, sd_samplers_common
from modules.shared import opts, state
import modules.shared as shared
@@ -58,6 +58,11 @@ class CFGDenoiser(torch.nn.Module):
self.model_wrap = None
self.p = None
self.cond_scale_miltiplier = 1.0
self.need_last_noise_uncond = False
self.last_noise_uncond = None
# NOTE: masking before denoising can cause the original latents to be oversmoothed
# as the original latents do not have noise
self.mask_before_denoising = False
@@ -212,9 +217,16 @@ class CFGDenoiser(torch.nn.Module):
uncond = denoiser_params.text_uncond
skip_uncond = False
# alternating uncond allows for higher thresholds without the quality loss normally expected from raising it
if self.step % 2 and s_min_uncond > 0 and sigma[0] < s_min_uncond and not is_edit_model:
if shared.opts.skip_early_cond != 0. and self.step / self.total_steps <= shared.opts.skip_early_cond:
skip_uncond = True
self.p.extra_generation_params["Skip Early CFG"] = shared.opts.skip_early_cond
elif (self.step % 2 or shared.opts.s_min_uncond_all) and s_min_uncond > 0 and sigma[0] < s_min_uncond and not is_edit_model:
skip_uncond = True
self.p.extra_generation_params["NGMS"] = s_min_uncond
if shared.opts.s_min_uncond_all:
self.p.extra_generation_params["NGMS all steps"] = shared.opts.s_min_uncond_all
if skip_uncond:
x_in = x_in[:-batch_size]
sigma_in = sigma_in[:-batch_size]
@@ -266,14 +278,15 @@ class CFGDenoiser(torch.nn.Module):
denoised_params = CFGDenoisedParams(x_out, state.sampling_step, state.sampling_steps, self.inner_model)
cfg_denoised_callback(denoised_params)
devices.test_for_nans(x_out, "unet")
if self.need_last_noise_uncond:
self.last_noise_uncond = torch.clone(x_out[-uncond.shape[0]:])
if is_edit_model:
denoised = self.combine_denoised_for_edit_model(x_out, cond_scale)
denoised = self.combine_denoised_for_edit_model(x_out, cond_scale * self.cond_scale_miltiplier)
elif skip_uncond:
denoised = self.combine_denoised(x_out, conds_list, uncond, 1.0)
else:
denoised = self.combine_denoised(x_out, conds_list, uncond, cond_scale)
denoised = self.combine_denoised(x_out, conds_list, uncond, cond_scale * self.cond_scale_miltiplier)
# Blend in the original latents (after)
if not self.mask_before_denoising and self.mask is not None:
......
@@ -54,7 +54,7 @@ def samples_to_images_tensor(sample, approximation=None, model=None):
else:
if model is None:
model = shared.sd_model
with devices.without_autocast(): # fixes an issue with unstable VAEs that are flaky even in fp32
with torch.no_grad(), devices.without_autocast(): # fixes an issue with unstable VAEs that are flaky even in fp32
x_sample = model.decode_first_stage(sample.to(model.first_stage_model.dtype))
return x_sample
@@ -163,7 +163,7 @@ def apply_refiner(cfg_denoiser, sigma=None):
else:
# torch.max(sigma) only to handle rare case where we might have different sigmas in the same batch
try:
timestep = torch.argmin(torch.abs(cfg_denoiser.inner_model.sigmas - torch.max(sigma)))
timestep = torch.argmin(torch.abs(cfg_denoiser.inner_model.sigmas.to(sigma.device) - torch.max(sigma)))
except AttributeError: # for samplers that don't use sigmas (DDIM) sigma is actually the timestep
timestep = torch.max(sigma).to(dtype=int)
completed_ratio = (999 - timestep) / 1000
@@ -246,7 +246,7 @@ class Sampler:
self.eta_infotext_field = 'Eta'
self.eta_default = 1.0
self.conditioning_key = shared.sd_model.model.conditioning_key
self.conditioning_key = getattr(shared.sd_model.model, 'conditioning_key', 'crossattn')
self.p = None
self.model_wrap_cfg = None
......
import torch
import inspect
import k_diffusion.sampling
from modules import sd_samplers_common, sd_samplers_extra, sd_samplers_cfg_denoiser, sd_schedulers
from modules import sd_samplers_common, sd_samplers_extra, sd_samplers_cfg_denoiser, sd_schedulers, devices
from modules.sd_samplers_cfg_denoiser import CFGDenoiser # noqa: F401
from modules.script_callbacks import ExtraNoiseParams, extra_noise_callback
@@ -53,8 +53,13 @@ class CFGDenoiserKDiffusion(sd_samplers_cfg_denoiser.CFGDenoiser):
@property
def inner_model(self):
if self.model_wrap is None:
denoiser = k_diffusion.external.CompVisVDenoiser if shared.sd_model.parameterization == "v" else k_diffusion.external.CompVisDenoiser
self.model_wrap = denoiser(shared.sd_model, quantize=shared.opts.enable_quantization)
denoiser_constructor = getattr(shared.sd_model, 'create_denoiser', None)
if denoiser_constructor is not None:
self.model_wrap = denoiser_constructor()
else:
denoiser = k_diffusion.external.CompVisVDenoiser if shared.sd_model.parameterization == "v" else k_diffusion.external.CompVisDenoiser
self.model_wrap = denoiser(shared.sd_model, quantize=shared.opts.enable_quantization)
return self.model_wrap
@@ -115,12 +120,16 @@ class KDiffusionSampler(sd_samplers_common.Sampler):
if scheduler.need_inner_model:
sigmas_kwargs['inner_model'] = self.model_wrap
sigmas = scheduler.function(n=steps, **sigmas_kwargs, device=shared.device)
if scheduler.label == 'Beta':
p.extra_generation_params["Beta schedule alpha"] = opts.beta_dist_alpha
p.extra_generation_params["Beta schedule beta"] = opts.beta_dist_beta
sigmas = scheduler.function(n=steps, **sigmas_kwargs, device=devices.cpu)
if discard_next_to_last_sigma:
sigmas = torch.cat([sigmas[:-2], sigmas[-1:]])
return sigmas
return sigmas.cpu()
def sample_img2img(self, p, x, noise, conditioning, unconditional_conditioning, steps=None, image_conditioning=None):
steps, t_enc = sd_samplers_common.setup_img2img_steps(p, steps)
@@ -128,7 +137,10 @@ class KDiffusionSampler(sd_samplers_common.Sampler):
sigmas = self.get_sigmas(p, steps)
sigma_sched = sigmas[steps - t_enc - 1:]
xi = x + noise * sigma_sched[0]
if hasattr(shared.sd_model, 'add_noise_to_latent'):
xi = shared.sd_model.add_noise_to_latent(x, noise, sigma_sched[0])
else:
xi = x + noise * sigma_sched[0]
if opts.img2img_extra_noise > 0:
p.extra_generation_params["Extra noise"] = opts.img2img_extra_noise
......
@@ -10,6 +10,7 @@ import modules.shared as shared
samplers_timesteps = [
('DDIM', sd_samplers_timesteps_impl.ddim, ['ddim'], {}),
('DDIM CFG++', sd_samplers_timesteps_impl.ddim_cfgpp, ['ddim_cfgpp'], {}),
('PLMS', sd_samplers_timesteps_impl.plms, ['plms'], {}),
('UniPC', sd_samplers_timesteps_impl.unipc, ['unipc'], {}),
]
......
@@ -5,13 +5,14 @@ import numpy as np
from modules import shared
from modules.models.diffusion.uni_pc import uni_pc
from modules.torch_utils import float64
@torch.no_grad()
def ddim(model, x, timesteps, extra_args=None, callback=None, disable=None, eta=0.0):
alphas_cumprod = model.inner_model.inner_model.alphas_cumprod
alphas = alphas_cumprod[timesteps]
alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(torch.float64 if x.device.type != 'mps' and x.device.type != 'xpu' else torch.float32)
alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(float64(x))
sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
sigmas = eta * np.sqrt((1 - alphas_prev.cpu().numpy()) / (1 - alphas.cpu()) * (1 - alphas.cpu() / alphas_prev.cpu().numpy()))
@@ -39,11 +40,51 @@ def ddim(model, x, timesteps, extra_args=None, callback=None, disable=None, eta=
return x
@torch.no_grad()
def ddim_cfgpp(model, x, timesteps, extra_args=None, callback=None, disable=None, eta=0.0):
""" Implements CFG++: Manifold-constrained Classifier Free Guidance For Diffusion Models (2024).
Uses the unconditional noise prediction instead of the conditional noise to guide the denoising direction.
The CFG scale is divided by 12.5 to map CFG from [0.0, 12.5] to [0, 1.0].
"""
alphas_cumprod = model.inner_model.inner_model.alphas_cumprod
alphas = alphas_cumprod[timesteps]
alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(float64(x))
sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
sigmas = eta * np.sqrt((1 - alphas_prev.cpu().numpy()) / (1 - alphas.cpu()) * (1 - alphas.cpu() / alphas_prev.cpu().numpy()))
model.cond_scale_miltiplier = 1 / 12.5
model.need_last_noise_uncond = True
extra_args = {} if extra_args is None else extra_args
s_in = x.new_ones((x.shape[0]))
s_x = x.new_ones((x.shape[0], 1, 1, 1))
for i in tqdm.trange(len(timesteps) - 1, disable=disable):
index = len(timesteps) - 1 - i
e_t = model(x, timesteps[index].item() * s_in, **extra_args)
last_noise_uncond = model.last_noise_uncond
a_t = alphas[index].item() * s_x
a_prev = alphas_prev[index].item() * s_x
sigma_t = sigmas[index].item() * s_x
sqrt_one_minus_at = sqrt_one_minus_alphas[index].item() * s_x
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * last_noise_uncond
noise = sigma_t * k_diffusion.sampling.torch.randn_like(x)
x = a_prev.sqrt() * pred_x0 + dir_xt + noise
if callback is not None:
callback({'x': x, 'i': i, 'sigma': 0, 'sigma_hat': 0, 'denoised': pred_x0})
return x
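As a quick arithmetic check of the remapping described in the docstring: `cond_scale_miltiplier` is applied to the UI-facing CFG value before `combine_denoised`, so typical slider values land in [0, 1].

```python
cond_scale_miltiplier = 1 / 12.5
for ui_cfg in (1.0, 7.0, 12.5):
    print(ui_cfg, "->", ui_cfg * cond_scale_miltiplier)
# 1.0 -> 0.08, 7.0 -> 0.56, 12.5 -> 1.0
```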
@torch.no_grad()
def plms(model, x, timesteps, extra_args=None, callback=None, disable=None):
alphas_cumprod = model.inner_model.inner_model.alphas_cumprod
alphas = alphas_cumprod[timesteps]
alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(torch.float64 if x.device.type != 'mps' and x.device.type != 'xpu' else torch.float32)
alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(float64(x))
sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
extra_args = {} if extra_args is None else extra_args
......
import dataclasses
import torch
import k_diffusion
import numpy as np
from scipy import stats
from modules import shared
def to_d(x, sigma, denoised):
"""Converts a denoiser output to a Karras ODE derivative."""
return (x - denoised) / sigma
k_diffusion.sampling.to_d = to_d
@dataclasses.dataclass
@@ -17,7 +27,7 @@ class Scheduler:
def uniform(n, sigma_min, sigma_max, inner_model, device):
return inner_model.get_sigmas(n)
return inner_model.get_sigmas(n).to(device)
def sgm_uniform(n, sigma_min, sigma_max, inner_model, device):
@@ -31,6 +41,92 @@ def sgm_uniform(n, sigma_min, sigma_max, inner_model, device):
return torch.FloatTensor(sigs).to(device)
def get_align_your_steps_sigmas(n, sigma_min, sigma_max, device):
# https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html
def loglinear_interp(t_steps, num_steps):
"""
Performs log-linear interpolation of a given array of decreasing numbers.
"""
xs = np.linspace(0, 1, len(t_steps))
ys = np.log(t_steps[::-1])
new_xs = np.linspace(0, 1, num_steps)
new_ys = np.interp(new_xs, xs, ys)
interped_ys = np.exp(new_ys)[::-1].copy()
return interped_ys
if shared.sd_model.is_sdxl:
sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.029]
else:
# Default to SD 1.5 sigmas.
sigmas = [14.615, 6.475, 3.861, 2.697, 1.886, 1.396, 0.963, 0.652, 0.399, 0.152, 0.029]
if n != len(sigmas):
sigmas = np.append(loglinear_interp(sigmas, n), [0.0])
else:
sigmas.append(0.0)
return torch.FloatTensor(sigmas).to(device)
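A standalone sketch (numpy only, not part of the commit) of the log-linear interpolation above, useful for seeing how the 11 reference sigmas are stretched to an arbitrary step count; the SD 1.5 values are copied from the function.

```python
import numpy as np

def loglinear_interp(t_steps, num_steps):
    xs = np.linspace(0, 1, len(t_steps))
    ys = np.log(np.asarray(t_steps)[::-1])
    new_xs = np.linspace(0, 1, num_steps)
    return np.exp(np.interp(new_xs, xs, ys))[::-1].copy()

sd15_sigmas = [14.615, 6.475, 3.861, 2.697, 1.886, 1.396, 0.963, 0.652, 0.399, 0.152, 0.029]
print(loglinear_interp(sd15_sigmas, 20).round(3))  # 20 decreasing sigmas, endpoints preserved
```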
def kl_optimal(n, sigma_min, sigma_max, device):
alpha_min = torch.arctan(torch.tensor(sigma_min, device=device))
alpha_max = torch.arctan(torch.tensor(sigma_max, device=device))
step_indices = torch.arange(n + 1, device=device)
sigmas = torch.tan(step_indices / n * alpha_min + (1.0 - step_indices / n) * alpha_max)
return sigmas
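A quick sanity check of the schedule above (illustrative; 14.6146 and 0.0292 are typical SD 1.5 sigma bounds): step 0 yields sigma_max, the last step yields sigma_min, with an arctan interpolation in between.

```python
sigmas = kl_optimal(n=10, sigma_min=0.0292, sigma_max=14.6146, device="cpu")
print(sigmas[0].item(), sigmas[-1].item())  # ~14.6146 ... ~0.0292
print(len(sigmas))                          # n + 1 values
```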
def simple_scheduler(n, sigma_min, sigma_max, inner_model, device):
sigs = []
ss = len(inner_model.sigmas) / n
for x in range(n):
sigs += [float(inner_model.sigmas[-(1 + int(x * ss))])]
sigs += [0.0]
return torch.FloatTensor(sigs).to(device)
def normal_scheduler(n, sigma_min, sigma_max, inner_model, device, sgm=False, floor=False):
start = inner_model.sigma_to_t(torch.tensor(sigma_max))
end = inner_model.sigma_to_t(torch.tensor(sigma_min))
if sgm:
timesteps = torch.linspace(start, end, n + 1)[:-1]
else:
timesteps = torch.linspace(start, end, n)
sigs = []
for x in range(len(timesteps)):
ts = timesteps[x]
sigs.append(inner_model.t_to_sigma(ts))
sigs += [0.0]
return torch.FloatTensor(sigs).to(device)
def ddim_scheduler(n, sigma_min, sigma_max, inner_model, device):
sigs = []
ss = max(len(inner_model.sigmas) // n, 1)
x = 1
while x < len(inner_model.sigmas):
sigs += [float(inner_model.sigmas[x])]
x += ss
sigs = sigs[::-1]
sigs += [0.0]
return torch.FloatTensor(sigs).to(device)
def beta_scheduler(n, sigma_min, sigma_max, inner_model, device):
# From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et al., 2024)
alpha = shared.opts.beta_dist_alpha
beta = shared.opts.beta_dist_beta
timesteps = 1 - np.linspace(0, 1, n)
timesteps = [stats.beta.ppf(x, alpha, beta) for x in timesteps]
sigmas = [sigma_min + (x * (sigma_max-sigma_min)) for x in timesteps]
sigmas += [0.0]
return torch.FloatTensor(sigmas).to(device)
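A self-contained sketch of the same beta schedule with explicit alpha/beta values (in the webui these come from `shared.opts.beta_dist_alpha` / `beta_dist_beta`; 0.6/0.6 is used here purely as an example):

```python
import numpy as np
from scipy import stats

def beta_sigmas(n, sigma_min, sigma_max, alpha=0.6, beta=0.6):
    timesteps = 1 - np.linspace(0, 1, n)
    timesteps = [stats.beta.ppf(x, alpha, beta) for x in timesteps]
    return np.asarray([sigma_min + (x * (sigma_max - sigma_min)) for x in timesteps] + [0.0])

# clusters steps toward the high- and low-sigma ends of the schedule
print(beta_sigmas(10, 0.0292, 14.6146).round(3))
```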
schedulers = [
Scheduler('automatic', 'Automatic', None),
Scheduler('uniform', 'Uniform', uniform, need_inner_model=True),
@@ -38,6 +134,12 @@ schedulers = [
Scheduler('exponential', 'Exponential', k_diffusion.sampling.get_sigmas_exponential),
Scheduler('polyexponential', 'Polyexponential', k_diffusion.sampling.get_sigmas_polyexponential, default_rho=1.0),
Scheduler('sgm_uniform', 'SGM Uniform', sgm_uniform, need_inner_model=True, aliases=["SGMUniform"]),
Scheduler('kl_optimal', 'KL Optimal', kl_optimal),
Scheduler('align_your_steps', 'Align Your Steps', get_align_your_steps_sigmas),
Scheduler('simple', 'Simple', simple_scheduler, need_inner_model=True),
Scheduler('normal', 'Normal', normal_scheduler, need_inner_model=True),
Scheduler('ddim', 'DDIM', ddim_scheduler, need_inner_model=True),
Scheduler('beta', 'Beta', beta_scheduler, need_inner_model=True),
]
schedulers_map = {**{x.name: x for x in schedulers}, **{x.label: x for x in schedulers}}
@@ -8,9 +8,9 @@ sd_vae_approx_models = {}
class VAEApprox(nn.Module):
def __init__(self):
def __init__(self, latent_channels=4):
super(VAEApprox, self).__init__()
self.conv1 = nn.Conv2d(4, 8, (7, 7))
self.conv1 = nn.Conv2d(latent_channels, 8, (7, 7))
self.conv2 = nn.Conv2d(8, 16, (5, 5))
self.conv3 = nn.Conv2d(16, 32, (3, 3))
self.conv4 = nn.Conv2d(32, 64, (3, 3))
@@ -40,7 +40,13 @@ def download_model(model_path, model_url):
def model():
model_name = "vaeapprox-sdxl.pt" if getattr(shared.sd_model, 'is_sdxl', False) else "model.pt"
if shared.sd_model.is_sd3:
model_name = "vaeapprox-sd3.pt"
elif shared.sd_model.is_sdxl:
model_name = "vaeapprox-sdxl.pt"
else:
model_name = "model.pt"
loaded_model = sd_vae_approx_models.get(model_name)
if loaded_model is None:
@@ -52,7 +58,7 @@ def model():
model_path = os.path.join(paths.models_path, "VAE-approx", model_name)
download_model(model_path, 'https://github.com/AUTOMATIC1111/stable-diffusion-webui/releases/download/v1.0.0-pre/' + model_name)
loaded_model = VAEApprox()
loaded_model = VAEApprox(latent_channels=shared.sd_model.latent_channels)
loaded_model.load_state_dict(torch.load(model_path, map_location='cpu' if devices.device.type != 'cuda' else None))
loaded_model.eval()
loaded_model.to(devices.device, devices.dtype)
@@ -64,7 +70,18 @@ def model():
def cheap_approximation(sample):
# https://discuss.huggingface.co/t/decoding-latents-to-rgb-without-upscaling/23204/2
if shared.sd_model.is_sdxl:
if shared.sd_model.is_sd3:
coeffs = [
[-0.0645, 0.0177, 0.1052], [ 0.0028, 0.0312, 0.0650],
[ 0.1848, 0.0762, 0.0360], [ 0.0944, 0.0360, 0.0889],
[ 0.0897, 0.0506, -0.0364], [-0.0020, 0.1203, 0.0284],
[ 0.0855, 0.0118, 0.0283], [-0.0539, 0.0658, 0.1047],
[-0.0057, 0.0116, 0.0700], [-0.0412, 0.0281, -0.0039],
[ 0.1106, 0.1171, 0.1220], [-0.0248, 0.0682, -0.0481],
[ 0.0815, 0.0846, 0.1207], [-0.0120, -0.0055, -0.0867],
[-0.0749, -0.0634, -0.0456], [-0.1418, -0.1457, -0.1259],
]
elif shared.sd_model.is_sdxl:
coeffs = [
[ 0.3448, 0.4168, 0.4395],
[-0.1953, -0.0290, 0.0250],
......
@@ -34,9 +34,9 @@ class Block(nn.Module):
return self.fuse(self.conv(x) + self.skip(x))
def decoder():
def decoder(latent_channels=4):
return nn.Sequential(
Clamp(), conv(4, 64), nn.ReLU(),
Clamp(), conv(latent_channels, 64), nn.ReLU(),
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
@@ -44,13 +44,13 @@ def decoder():
)
def encoder():
def encoder(latent_channels=4):
return nn.Sequential(
conv(3, 64), Block(64, 64),
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
conv(64, 4),
conv(64, latent_channels),
)
@@ -58,10 +58,14 @@ class TAESDDecoder(nn.Module):
latent_magnitude = 3
latent_shift = 0.5
def __init__(self, decoder_path="taesd_decoder.pth"):
def __init__(self, decoder_path="taesd_decoder.pth", latent_channels=None):
"""Initialize pretrained TAESD on the given device from the given checkpoints."""
super().__init__()
self.decoder = decoder()
if latent_channels is None:
latent_channels = 16 if "taesd3" in str(decoder_path) else 4
self.decoder = decoder(latent_channels)
self.decoder.load_state_dict(
torch.load(decoder_path, map_location='cpu' if devices.device.type != 'cuda' else None))
@@ -70,10 +74,14 @@ class TAESDEncoder(nn.Module):
latent_magnitude = 3
latent_shift = 0.5
def __init__(self, encoder_path="taesd_encoder.pth"):
def __init__(self, encoder_path="taesd_encoder.pth", latent_channels=None):
"""Initialize pretrained TAESD on the given device from the given checkpoints."""
super().__init__()
self.encoder = encoder()
if latent_channels is None:
latent_channels = 16 if "taesd3" in str(encoder_path) else 4
self.encoder = encoder(latent_channels)
self.encoder.load_state_dict(
torch.load(encoder_path, map_location='cpu' if devices.device.type != 'cuda' else None))
@@ -87,7 +95,13 @@ def download_model(model_path, model_url):
def decoder_model():
model_name = "taesdxl_decoder.pth" if getattr(shared.sd_model, 'is_sdxl', False) else "taesd_decoder.pth"
if shared.sd_model.is_sd3:
model_name = "taesd3_decoder.pth"
elif shared.sd_model.is_sdxl:
model_name = "taesdxl_decoder.pth"
else:
model_name = "taesd_decoder.pth"
loaded_model = sd_vae_taesd_models.get(model_name)
if loaded_model is None:
@@ -106,7 +120,13 @@ def decoder_model():
def encoder_model():
model_name = "taesdxl_encoder.pth" if getattr(shared.sd_model, 'is_sdxl', False) else "taesd_encoder.pth"
if shared.sd_model.is_sd3:
model_name = "taesd3_encoder.pth"
elif shared.sd_model.is_sdxl:
model_name = "taesdxl_encoder.pth"
else:
model_name = "taesd_encoder.pth"
loaded_model = sd_vae_taesd_models.get(model_name)
if loaded_model is None:
......
@@ -47,7 +47,7 @@ restricted_opts: set[str] = None
sd_model: sd_models_types.WebuiSdModel = None
settings_components: dict = None
"""assigned from ui.py, a mapping on setting names to gradio components repsponsible for those settings"""
"""assigned from ui.py, a mapping of setting names to the gradio components responsible for those settings"""
tab_names = []
......
@@ -69,3 +69,44 @@ def reload_gradio_theme(theme_name=None):
# append additional values gradio_theme
shared.gradio_theme.sd_webui_modal_lightbox_toolbar_opacity = shared.opts.sd_webui_modal_lightbox_toolbar_opacity
shared.gradio_theme.sd_webui_modal_lightbox_icon_opacity = shared.opts.sd_webui_modal_lightbox_icon_opacity
def resolve_var(name: str, gradio_theme=None, history=None):
"""
Attempt to resolve a theme variable name to its value
Parameters:
name (str): The name of the theme variable
e.g. "background_fill_primary", "background_fill_primary_dark";
spaces and a leading asterisk (*) are removed from the name before lookup
gradio_theme (gradio.themes.ThemeClass): The theme object to resolve the variable from
blank to use the webui default shared.gradio_theme
history (list): A list of previously resolved variables to prevent circular references
for regular use leave blank
Returns:
str: The resolved value
Error handling:
returns either #000000 or #ffffff, depending on whether the initial name ends with "_dark"
"""
try:
if history is None:
history = []
if gradio_theme is None:
gradio_theme = shared.gradio_theme
name = name.strip()
name = name[1:] if name.startswith("*") else name
if name in history:
raise ValueError(f'Circular references: name "{name}" in {history}')
if value := getattr(gradio_theme, name, None):
return resolve_var(value, gradio_theme, history + [name])
else:
return name
except Exception:
name = history[0] if history else name
errors.report(f'resolve_color({name})', exc_info=True)
return '#000000' if name.endswith("_dark") else '#ffffff'
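An illustrative use of resolve_var with a dummy theme object (attribute names and colors are made up; real lookups fall back to shared.gradio_theme): chained "*name" references are followed until a literal value is reached.

```python
class DummyTheme:
    background_fill_primary = "*neutral_950"   # gradio-style reference to another variable
    neutral_950 = "#0b0f19"                    # literal value ends the chain

print(resolve_var("background_fill_primary", gradio_theme=DummyTheme()))  # -> #0b0f19
print(resolve_var("*neutral_950", gradio_theme=DummyTheme()))             # leading '*' is stripped
```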
@@ -31,6 +31,14 @@ def initialize():
devices.dtype_vae = torch.float32 if cmd_opts.no_half or cmd_opts.no_half_vae else torch.float16
devices.dtype_inference = torch.float32 if cmd_opts.precision == 'full' else devices.dtype
if cmd_opts.precision == "half":
msg = "--no-half and --no-half-vae conflict with --precision half"
assert devices.dtype == torch.float16, msg
assert devices.dtype_vae == torch.float16, msg
assert devices.dtype_inference == torch.float16, msg
devices.force_fp16 = True
devices.force_model_fp16()
shared.device = devices.device
shared.weight_load_location = None if cmd_opts.lowram else "cpu"
......
@@ -162,7 +162,7 @@ class State:
errors.record_exception()
def assign_current_image(self, image):
if shared.opts.live_previews_image_format == 'jpeg' and image.mode == 'RGBA':
if shared.opts.live_previews_image_format == 'jpeg' and image.mode in ('RGBA', 'P'):
image = image.convert('RGB')
self.current_image = image
self.id_live_preview += 1
@@ -194,7 +194,7 @@ class UserMetadataEditor:
def setup_ui(self, gallery):
self.button_replace_preview.click(
fn=self.save_preview,
_js="function(x, y, z){return [selected_gallery_index(), y, z]}",
_js=f"function(x, y, z){{return [selected_gallery_index_id('{self.tabname + '_gallery_container'}'), y, z]}}",
inputs=[self.edit_name_input, gallery, self.edit_name_input],
outputs=[self.html_preview, self.html_status]
).then(
......