From 9d678a6f41d0015f9bf8767ab614051e51f7405b Mon Sep 17 00:00:00 2001
From: Symbiomatrix
Date: Wed, 16 Aug 2023 00:08:09 +0300
Subject: [PATCH 01/31] Update resize_lora.py

---
 networks/resize_lora.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/networks/resize_lora.py b/networks/resize_lora.py
index 7b7406347..90e5dffc6 100644
--- a/networks/resize_lora.py
+++ b/networks/resize_lora.py
@@ -219,16 +219,16 @@ def resize_lora_model(lora_sd, new_rank, save_dtype, device, dynamic_method, dyn
   for key, value in tqdm(lora_sd.items()):
     weight_name = None
     if 'lora_down' in key:
-      block_down_name = key.split(".")[0]
-      weight_name = key.split(".")[-1]
+      block_down_name = key.rsplit('lora_down', 1)[0]
+      weight_name = key.rsplit(".", 1)[-1]
       lora_down_weight = value
     else:
       continue

     # find corresponding lora_up and alpha
     block_up_name = block_down_name
-    lora_up_weight = lora_sd.get(block_up_name + '.lora_up.' + weight_name, None)
-    lora_alpha = lora_sd.get(block_down_name + '.alpha', None)
+    lora_up_weight = lora_sd.get(block_up_name + 'lora_up.' + weight_name, None)
+    lora_alpha = lora_sd.get(block_down_name + 'alpha', None)

     weights_loaded = (lora_down_weight is not None and lora_up_weight is not None)

@@ -263,9 +263,9 @@ def resize_lora_model(lora_sd, new_rank, save_dtype, device, dynamic_method, dyn
         verbose_str+=f"\n"

       new_alpha = param_dict['new_alpha']
-      o_lora_sd[block_down_name + "." + "lora_down.weight"] = param_dict["lora_down"].to(save_dtype).contiguous()
-      o_lora_sd[block_up_name + "." + "lora_up.weight"] = param_dict["lora_up"].to(save_dtype).contiguous()
-      o_lora_sd[block_up_name + "." "alpha"] = torch.tensor(param_dict['new_alpha']).to(save_dtype)
+      o_lora_sd[block_down_name + "lora_down.weight"] = param_dict["lora_down"].to(save_dtype).contiguous()
+      o_lora_sd[block_up_name + "lora_up.weight"] = param_dict["lora_up"].to(save_dtype).contiguous()
+      o_lora_sd[block_up_name + "alpha"] = torch.tensor(param_dict['new_alpha']).to(save_dtype)

       block_down_name = None
       block_up_name = None

From 80aca1ccc72679e658fb24d2fb7ea1998c1f240a Mon Sep 17 00:00:00 2001
From: rockerBOO
Date: Tue, 5 Sep 2023 15:20:15 -0400
Subject: [PATCH 02/31] Add ip_noise_gamma metadata

---
 train_network.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/train_network.py b/train_network.py
index f752607e9..0f2946751 100644
--- a/train_network.py
+++ b/train_network.py
@@ -509,6 +509,7 @@ def train(self, args):
             "ss_prior_loss_weight": args.prior_loss_weight,
             "ss_min_snr_gamma": args.min_snr_gamma,
             "ss_scale_weight_norms": args.scale_weight_norms,
+            "ss_ip_noise_gamma": args.ip_noise_gamma,
         }

         if use_user_config:

From e33c007cd0be3d3fa341ca8e0dcc132184bfd7d7 Mon Sep 17 00:00:00 2001
From: jvkap <83289567+jvkap@users.noreply.github.com>
Date: Mon, 11 Sep 2023 11:29:06 -0300
Subject: [PATCH 03/31] Update resize_lora.py

---
 networks/resize_lora.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/networks/resize_lora.py b/networks/resize_lora.py
index 7b7406347..9f0207b98 100644
--- a/networks/resize_lora.py
+++ b/networks/resize_lora.py
@@ -283,7 +283,12 @@ def resize_lora_model(lora_sd, new_rank, save_dtype, device, dynamic_method, dyn


 def resize(args):
-
+    if args.save_to is None or not (args.save_to.endswith('.ckpt') or args.save_to.endswith('.safetensors')):
+        raise Exception("The --save_to argument must be specified and must be a .ckpt or .safetensors file.")
+
+    if args.model is None or not (args.model.endswith('.ckpt') or args.model.endswith('.safetensors')):
+        raise Exception("The --model argument must be specified and must be a .ckpt or .safetensors file.")
+
   def str_to_dtype(p):
     if p == 'float':
       return torch.float

From a0e05fa291039f7a81a34cdf565d20219e8c77c6 Mon Sep 17 00:00:00 2001
From: jvkap <83289567+jvkap@users.noreply.github.com>
Date: Mon, 11 Sep 2023 11:41:33 -0300
Subject: [PATCH 04/31] Update resize_lora.py

---
 networks/resize_lora.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/networks/resize_lora.py b/networks/resize_lora.py
index 9f0207b98..41585f79c 100644
--- a/networks/resize_lora.py
+++ b/networks/resize_lora.py
@@ -285,9 +285,7 @@ def resize_lora_model(lora_sd, new_rank, save_dtype, device, dynamic_method, dyn
 def resize(args):
     if args.save_to is None or not (args.save_to.endswith('.ckpt') or args.save_to.endswith('.safetensors')):
         raise Exception("The --save_to argument must be specified and must be a .ckpt or .safetensors file.")
-
-    if args.model is None or not (args.model.endswith('.ckpt') or args.model.endswith('.safetensors')):
-        raise Exception("The --model argument must be specified and must be a .ckpt or .safetensors file.")
+    

   def str_to_dtype(p):
     if p == 'float':
       return torch.float

From 0ecfd91a208f845f45fdc39cae83d6be71e23720 Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Wed, 13 Sep 2023 17:59:14 +0900
Subject: [PATCH 05/31] fix VAE becoming the last one

---
 tools/merge_models.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/merge_models.py b/tools/merge_models.py
index dd04ea463..391bfe677 100644
--- a/tools/merge_models.py
+++ b/tools/merge_models.py
@@ -51,7 +51,7 @@ def merge(args):
             print(f"Model {model} does not exist")
             exit()

-    assert len(args.models) == len(args.ratios) or args.ratios is None, "ratios must be the same length as models"
+    assert args.ratios is None or len(args.models) == len(args.ratios), "ratios must be the same length as models"

     # load and merge
     ratio = 1.0 / len(args.models)  # default
@@ -113,13 +113,13 @@ def merge(args):
     # add supplementary keys' value (including VAE and TextEncoder)
     if len(supplementary_key_ratios) > 0:
         print("add first model's value")
-        with safe_open(model, framework="pt", device=args.device) as f:
+        with safe_open(args.models[0], framework="pt", device=args.device) as f:
             for key in tqdm(f.keys()):
                 _, new_key = replace_text_encoder_key(key)
                 if new_key not in supplementary_key_ratios:
                     continue

-                if is_unet_key(new_key): # not VAE or TextEncoder
+                if is_unet_key(new_key):  # not VAE or TextEncoder
                     print(f"Key {new_key} not in all models, ratio = {supplementary_key_ratios[new_key]}")

                 value = f.get_tensor(key)  # original key

From 90c47140b8d969c7ba55b1b85e0d518826a9b464 Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Wed, 13 Sep 2023 17:59:34 +0900
Subject: [PATCH 06/31] add support for models without position_ids

---
 library/sdxl_model_util.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/library/sdxl_model_util.py b/library/sdxl_model_util.py
index 6647b4394..2f0154cae 100644
--- a/library/sdxl_model_util.py
+++ b/library/sdxl_model_util.py
@@ -258,6 +258,10 @@ def load_models_from_sdxl_checkpoint(model_version, ckpt_path, map_location, dty
             te1_sd[k.replace("conditioner.embedders.0.transformer.", "")] = state_dict.pop(k)
         elif k.startswith("conditioner.embedders.1.model."):
             te2_sd[k] = state_dict.pop(k)
+
+    # 一部のposition_idsがないモデルへの対応 / add position_ids for some models
+    if "text_model.embeddings.position_ids" not in te1_sd:
+        te1_sd["text_model.embeddings.position_ids"] = torch.arange(77).unsqueeze(0)

     info1 = _load_state_dict_on_device(text_model1, te1_sd, device=map_location)  # remain fp32
     print("text encoder 1:", info1)

From d337bbf8a08bce18e302b7e8403c70d58d632610 Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Wed, 13 Sep 2023 20:58:37 +0900
Subject: [PATCH 07/31] get pool from CLIPVisionModel in img2img

---
 sdxl_gen_img.py | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/sdxl_gen_img.py b/sdxl_gen_img.py
index c506ad3fc..7d9c68bfb 100755
--- a/sdxl_gen_img.py
+++ b/sdxl_gen_img.py
@@ -37,7 +37,7 @@
 from einops import rearrange
 from tqdm import tqdm
 from torchvision import transforms
-from transformers import CLIPTextModel, CLIPTokenizer, CLIPModel, CLIPTextConfig
+from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection, CLIPImageProcessor
 import PIL
 from PIL import Image
 from PIL.PngImagePlugin import PngInfo
@@ -61,6 +61,8 @@
 LATENT_CHANNELS = 4
 DOWNSAMPLING_FACTOR = 8

+CLIP_VISION_MODEL = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+
 # region モジュール入れ替え部
 """
 高速化のためのモジュール入れ替え
@@ -320,6 +322,10 @@ def __init__(
         self.scheduler = scheduler
         self.safety_checker = None

+        self.clip_vision_model: CLIPVisionModelWithProjection = None
+        self.clip_vision_processor: CLIPImageProcessor = None
+        self.clip_vision_strength = 0.0
+
         # Textual Inversion
         self.token_replacements_list = []
         for _ in range(len(self.text_encoders)):
@@ -535,6 +541,21 @@ def __call__(
                     num_sub_prompts = len(text_pool) // batch_size
                     text_pool = text_pool[num_sub_prompts - 1 :: num_sub_prompts]  # last subprompt

+        if init_image is not None and self.clip_vision_model is not None:
+            print(f"encode by clip_vision_model and apply clip_vision_strength={self.clip_vision_strength}")
+            vision_input = self.clip_vision_processor(init_image, return_tensors="pt", device=self.device)
+            pixel_values = vision_input["pixel_values"].to(self.device, dtype=text_embeddings.dtype)
+
+            clip_vision_embeddings = self.clip_vision_model(pixel_values=pixel_values, output_hidden_states=True, return_dict=True)
+            clip_vision_embeddings = clip_vision_embeddings.image_embeds
+
+            if len(clip_vision_embeddings) == 1 and batch_size > 1:
+                clip_vision_embeddings = clip_vision_embeddings.repeat((batch_size, 1))
+
+            clip_vision_embeddings = clip_vision_embeddings * self.clip_vision_strength
+            assert clip_vision_embeddings.shape == text_pool.shape, f"{clip_vision_embeddings.shape} != {text_pool.shape}"
+            text_pool = clip_vision_embeddings  # replace: same as ComfyUI (?)
+
         c_vector = torch.cat([text_pool, c_vector], dim=1)
         uc_vector = torch.cat([uncond_pool, uc_vector], dim=1)

@@ -1767,6 +1788,19 @@ def resize_images(imgs, size):
         init_images = load_images(args.image_path)
         assert len(init_images) > 0, f"No image / 画像がありません: {args.image_path}"
         print(f"loaded {len(init_images)} images for img2img")
+
+        # CLIP Vision
+        if args.clip_vision_strength is not None:
+            print(f"load CLIP Vision model: {CLIP_VISION_MODEL}")
+            vision_model = CLIPVisionModelWithProjection.from_pretrained(CLIP_VISION_MODEL, projection_dim=1280)
+            vision_model.to(device, dtype)
+            processor = CLIPImageProcessor.from_pretrained(CLIP_VISION_MODEL)
+
+            pipe.clip_vision_model = vision_model
+            pipe.clip_vision_processor = processor
+            pipe.clip_vision_strength = args.clip_vision_strength
+            print(f"CLIP Vision model loaded.")
+
     else:
         init_images = None

@@ -2656,6 +2690,12 @@ def setup_parser() -> argparse.ArgumentParser:
         nargs="*",
         help="ControlNet guidance ratio for steps / ControlNetでガイドするステップ比率",
     )
+    parser.add_argument(
+        "--clip_vision_strength",
+        type=float,
+        default=None,
+        help="enable CLIP Vision Conditioning for img2img with this strength / img2imgでCLIP Vision Conditioningを有効にしてこのstrengthで処理する",
+    )
     # # parser.add_argument(
     #     "--control_net_image_path", type=str, default=None, nargs="*", help="image for ControlNet guidance / ControlNetでガイドに使う画像"
     # )

From db7a28ac25514eb7c318d7f1486abe7c8914ada7 Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Mon, 18 Sep 2023 21:12:41 +0900
Subject: [PATCH 08/31] fix highres_fix_latents_upscaling to work

---
 sdxl_gen_img.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdxl_gen_img.py b/sdxl_gen_img.py
index 7d9c68bfb..3500df575 100755
--- a/sdxl_gen_img.py
+++ b/sdxl_gen_img.py
@@ -766,7 +766,7 @@ def __call__(
             return None

         if return_latents:
-            return (latents, False)
+            return latents

         latents = 1 / sdxl_model_util.VAE_SCALE_FACTOR * latents
         if vae_batch_size >= batch_size:
@@ -1814,7 +1814,7 @@ def resize_images(imgs, size):

     # promptがないとき、画像のPngInfoから取得する
     if init_images is not None and len(prompt_list) == 0 and not args.interactive:
-        print("get prompts from images' meta data")
+        print("get prompts from images' metadata")
         for img in init_images:
             if "prompt" in img.text:
                 prompt = img.text["prompt"]

From b64389c8a942c9b1b3a919f676d58770e7219a56 Mon Sep 17 00:00:00 2001
From: Disty0
Date: Tue, 19 Sep 2023 18:05:05 +0300
Subject: [PATCH 09/31] Intel ARC support with IPEX

---
 XTI_hijack.py                        |   7 +
 fine_tune.py                         |   7 +
 gen_img_diffusers.py                 |   7 +
 library/ipex/__init__.py             | 170 +++++++++++++++++++++
 library/ipex/attention.py            | 152 +++++++++++++++++++
 library/ipex/diffusers.py            | 119 ++++++++++++++++
 library/ipex/gradscaler.py           | 179 ++++++++++++++++++++++
 library/ipex/hijacks.py              | 196 +++++++++++++++++++++++
 library/model_util.py                |   7 +
 sdxl_gen_img.py                      |   7 +
 sdxl_minimal_inference.py            |   7 +
 sdxl_train.py                        |   7 +
 sdxl_train_control_net_lllite.py     |   7 +
 sdxl_train_control_net_lllite_alt.py |   7 +
 sdxl_train_network.py                |   7 +
 sdxl_train_textual_inversion.py      |   7 +
 train_controlnet.py                  |   7 +
 train_db.py                          |   7 +
 train_network.py                     |   7 +
 train_textual_inversion.py           |   7 +
 train_textual_inversion_XTI.py       |   7 +
 21 files changed, 928 insertions(+)
 create mode 100644 library/ipex/__init__.py
 create mode 100644 library/ipex/attention.py
 create mode 100644 library/ipex/diffusers.py
 create mode 100644 library/ipex/gradscaler.py
 create mode 100644 library/ipex/hijacks.py

diff --git a/XTI_hijack.py b/XTI_hijack.py
index 36b5d3f2b..ec0849455 100644
--- a/XTI_hijack.py
+++ b/XTI_hijack.py
@@ -1,4 +1,11 @@
 import torch
+try:
+    import intel_extension_for_pytorch as ipex
+    if torch.xpu.is_available():
+        from library.ipex import ipex_init
+        ipex_init()
+except Exception:
+    pass
 from typing import Union, List, Optional, Dict, Any, Tuple
 from diffusers.models.unet_2d_condition import UNet2DConditionOutput

diff --git a/fine_tune.py b/fine_tune.py
index f89e897a8..f300d4688 100644
--- a/fine_tune.py
+++ b/fine_tune.py
@@ -10,6 +10,13 @@
 from tqdm import tqdm

 import torch
+try:
+    import intel_extension_for_pytorch as ipex
+    if torch.xpu.is_available():
+        from library.ipex import ipex_init
+        ipex_init()
+except Exception:
+    pass
 from accelerate.utils import set_seed
 from diffusers import DDPMScheduler

diff --git a/gen_img_diffusers.py b/gen_img_diffusers.py
index 273c0dd86..0ea66cde2 100644
--- a/gen_img_diffusers.py
+++ b/gen_img_diffusers.py
@@ -65,6 +65,13 @@
 import diffusers
 import numpy as np
 import torch
+try:
+    import intel_extension_for_pytorch as ipex
+    if torch.xpu.is_available():
+        from library.ipex import ipex_init
+        ipex_init()
+except Exception:
+    pass
 import torchvision
 from diffusers import (
     AutoencoderKL,

diff --git a/library/ipex/__init__.py b/library/ipex/__init__.py
new file mode 100644
index 000000000..9ec69012f
--- /dev/null
+++ b/library/ipex/__init__.py
@@ -0,0 +1,170 @@
+import os
+import sys
+import contextlib
+import torch
+import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
+from .hijacks import ipex_hijacks
+from .attention import attention_init
+
+# pylint: disable=protected-access, missing-function-docstring, line-too-long
+
+def ipex_init(): # pylint: disable=too-many-statements
+    try:
+        #Replace cuda with xpu:
+        torch.cuda.current_device = torch.xpu.current_device
+        torch.cuda.current_stream = torch.xpu.current_stream
+        torch.cuda.device = torch.xpu.device
+        torch.cuda.device_count = torch.xpu.device_count
+        torch.cuda.device_of = torch.xpu.device_of
+        torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
+        torch.cuda.get_device_name = torch.xpu.get_device_name
+        torch.cuda.get_device_properties = torch.xpu.get_device_properties
+        torch.cuda.init = torch.xpu.init
+        torch.cuda.is_available = torch.xpu.is_available
+        torch.cuda.is_initialized = torch.xpu.is_initialized
+        torch.cuda.is_current_stream_capturing = lambda: False
+        torch.cuda.set_device = torch.xpu.set_device
+        torch.cuda.stream = torch.xpu.stream
+        torch.cuda.synchronize = torch.xpu.synchronize
+        torch.cuda.Event = torch.xpu.Event
+        torch.cuda.Stream = torch.xpu.Stream
+        torch.cuda.FloatTensor = torch.xpu.FloatTensor
+        torch.Tensor.cuda = torch.Tensor.xpu
+        torch.Tensor.is_cuda = torch.Tensor.is_xpu
+        torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock
+        torch.cuda._initialized = torch.xpu.lazy_init._initialized
+        torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker
+        torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls
+        torch.cuda._tls = torch.xpu.lazy_init._tls
+        torch.cuda.threading = torch.xpu.lazy_init.threading
+        torch.cuda.traceback = torch.xpu.lazy_init.traceback
+        torch.cuda.Optional = torch.xpu.Optional
+        torch.cuda.__cached__ = torch.xpu.__cached__
+        torch.cuda.__loader__ = torch.xpu.__loader__
+        torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage
+        torch.cuda.Tuple = torch.xpu.Tuple
+        torch.cuda.streams = torch.xpu.streams
+        torch.cuda._lazy_new = torch.xpu._lazy_new
+        torch.cuda.FloatStorage = torch.xpu.FloatStorage
+        torch.cuda.Any = 
torch.xpu.Any + torch.cuda.__doc__ = torch.xpu.__doc__ + torch.cuda.default_generators = torch.xpu.default_generators + torch.cuda.HalfTensor = torch.xpu.HalfTensor + torch.cuda._get_device_index = torch.xpu._get_device_index + torch.cuda.__path__ = torch.xpu.__path__ + torch.cuda.Device = torch.xpu.Device + torch.cuda.IntTensor = torch.xpu.IntTensor + torch.cuda.ByteStorage = torch.xpu.ByteStorage + torch.cuda.set_stream = torch.xpu.set_stream + torch.cuda.BoolStorage = torch.xpu.BoolStorage + torch.cuda.os = torch.xpu.os + torch.cuda.torch = torch.xpu.torch + torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage + torch.cuda.Union = torch.xpu.Union + torch.cuda.DoubleTensor = torch.xpu.DoubleTensor + torch.cuda.ShortTensor = torch.xpu.ShortTensor + torch.cuda.LongTensor = torch.xpu.LongTensor + torch.cuda.IntStorage = torch.xpu.IntStorage + torch.cuda.LongStorage = torch.xpu.LongStorage + torch.cuda.__annotations__ = torch.xpu.__annotations__ + torch.cuda.__package__ = torch.xpu.__package__ + torch.cuda.__builtins__ = torch.xpu.__builtins__ + torch.cuda.CharTensor = torch.xpu.CharTensor + torch.cuda.List = torch.xpu.List + torch.cuda._lazy_init = torch.xpu._lazy_init + torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor + torch.cuda.DoubleStorage = torch.xpu.DoubleStorage + torch.cuda.ByteTensor = torch.xpu.ByteTensor + torch.cuda.StreamContext = torch.xpu.StreamContext + torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage + torch.cuda.ShortStorage = torch.xpu.ShortStorage + torch.cuda._lazy_call = torch.xpu._lazy_call + torch.cuda.HalfStorage = torch.xpu.HalfStorage + torch.cuda.random = torch.xpu.random + torch.cuda._device = torch.xpu._device + torch.cuda.classproperty = torch.xpu.classproperty + torch.cuda.__name__ = torch.xpu.__name__ + torch.cuda._device_t = torch.xpu._device_t + torch.cuda.warnings = torch.xpu.warnings + torch.cuda.__spec__ = torch.xpu.__spec__ + torch.cuda.BoolTensor = torch.xpu.BoolTensor + torch.cuda.CharStorage = torch.xpu.CharStorage + torch.cuda.__file__ = torch.xpu.__file__ + torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork + #torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing + + #Memory: + torch.cuda.memory = torch.xpu.memory + if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): + torch.xpu.empty_cache = lambda: None + torch.cuda.empty_cache = torch.xpu.empty_cache + torch.cuda.memory_stats = torch.xpu.memory_stats + torch.cuda.memory_summary = torch.xpu.memory_summary + torch.cuda.memory_snapshot = torch.xpu.memory_snapshot + torch.cuda.memory_allocated = torch.xpu.memory_allocated + torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated + torch.cuda.memory_reserved = torch.xpu.memory_reserved + torch.cuda.memory_cached = torch.xpu.memory_reserved + torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved + torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved + torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats + torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict + torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats + + #RNG: + torch.cuda.get_rng_state = torch.xpu.get_rng_state + torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all + torch.cuda.set_rng_state = torch.xpu.set_rng_state + torch.cuda.set_rng_state_all = 
torch.xpu.set_rng_state_all + torch.cuda.manual_seed = torch.xpu.manual_seed + torch.cuda.manual_seed_all = torch.xpu.manual_seed_all + torch.cuda.seed = torch.xpu.seed + torch.cuda.seed_all = torch.xpu.seed_all + torch.cuda.initial_seed = torch.xpu.initial_seed + + #AMP: + torch.cuda.amp = torch.xpu.amp + if not hasattr(torch.cuda.amp, "common"): + torch.cuda.amp.common = contextlib.nullcontext() + torch.cuda.amp.common.amp_definitely_not_available = lambda: False + try: + torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler + except Exception: # pylint: disable=broad-exception-caught + try: + from .gradscaler import gradscaler_init # pylint: disable=import-outside-toplevel, import-error + gradscaler_init() + torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler + except Exception: # pylint: disable=broad-exception-caught + torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler + + #C + torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream + ipex._C._DeviceProperties.major = 2023 + ipex._C._DeviceProperties.minor = 2 + + #Fix functions with ipex: + torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory] + torch._utils._get_available_device_type = lambda: "xpu" + torch.has_cuda = True + torch.cuda.has_half = True + torch.cuda.is_bf16_supported = lambda *args, **kwargs: True + torch.cuda.is_fp16_supported = lambda *args, **kwargs: True + torch.version.cuda = "11.7" + torch.cuda.get_device_capability = lambda *args, **kwargs: [11,7] + torch.cuda.get_device_properties.major = 11 + torch.cuda.get_device_properties.minor = 7 + torch.cuda.ipc_collect = lambda *args, **kwargs: None + torch.cuda.utilization = lambda *args, **kwargs: 0 + + ipex_hijacks() + attention_init() + try: + from .diffusers import ipex_diffusers + ipex_diffusers() + except Exception: # pylint: disable=broad-exception-caught + pass + except Exception as e: + return False, e + return True, None diff --git a/library/ipex/attention.py b/library/ipex/attention.py new file mode 100644 index 000000000..e38689f21 --- /dev/null +++ b/library/ipex/attention.py @@ -0,0 +1,152 @@ +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +original_torch_bmm = torch.bmm +def torch_bmm(input, mat2, *, out=None): + if input.dtype != mat2.dtype: + mat2 = mat2.to(input.dtype) + + #ARC GPUs can't allocate more than 4GB to a single block, Slice it: + batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2] + block_multiply = 2.4 if input.dtype == torch.float32 else 1.2 + block_size = (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply #MB + split_slice_size = batch_size_attention + if block_size >= 4000: + do_split = True + #Find something divisible with the input_tokens + while ((split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply) > 4000: + split_slice_size = split_slice_size // 2 + if split_slice_size <= 1: + split_slice_size = 1 + break + else: + do_split = False + + split_block_size = (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply #MB + split_2_slice_size = input_tokens + if split_block_size >= 4000: + do_split_2 = True + #Find something divisible with the input_tokens + while ((split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply) 
> 4000: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + if do_split: + hidden_states = torch.zeros(input.shape[0], input.shape[1], mat2.shape[2], device=input.device, dtype=input.dtype) + for i in range(batch_size_attention // split_slice_size): + start_idx = i * split_slice_size + end_idx = (i + 1) * split_slice_size + if do_split_2: + for i2 in range(input_tokens // split_2_slice_size): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = original_torch_bmm( + input[start_idx:end_idx, start_idx_2:end_idx_2], + mat2[start_idx:end_idx, start_idx_2:end_idx_2], + out=out + ) + else: + hidden_states[start_idx:end_idx] = original_torch_bmm( + input[start_idx:end_idx], + mat2[start_idx:end_idx], + out=out + ) + else: + return original_torch_bmm(input, mat2, out=out) + return hidden_states + +original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention +def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False): + #ARC GPUs can't allocate more than 4GB to a single block, Slice it: + if len(query.shape) == 3: + batch_size_attention, query_tokens, shape_four = query.shape + shape_one = 1 + no_shape_one = True + else: + shape_one, batch_size_attention, query_tokens, shape_four = query.shape + no_shape_one = False + block_multiply = 3.6 if query.dtype == torch.float32 else 1.8 + block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB + split_slice_size = batch_size_attention + if block_size >= 4000: + do_split = True + #Find something divisible with the shape_one + while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000: + split_slice_size = split_slice_size // 2 + if split_slice_size <= 1: + split_slice_size = 1 + break + else: + do_split = False + + split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB + split_2_slice_size = query_tokens + if split_block_size >= 4000: + do_split_2 = True + #Find something divisible with the batch_size_attention + while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + if do_split: + hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype) + for i in range(batch_size_attention // split_slice_size): + start_idx = i * split_slice_size + end_idx = (i + 1) * split_slice_size + if do_split_2: + for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + if no_shape_one: + hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( + query[start_idx:end_idx, start_idx_2:end_idx_2], + key[start_idx:end_idx, start_idx_2:end_idx_2], + value[start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx, start_idx_2:end_idx_2], + 
key[:, start_idx:end_idx, start_idx_2:end_idx_2], + value[:, start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + if no_shape_one: + hidden_states[start_idx:end_idx] = original_scaled_dot_product_attention( + query[start_idx:end_idx], + key[start_idx:end_idx], + value[start_idx:end_idx], + attn_mask=attn_mask[start_idx:end_idx] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + hidden_states[:, start_idx:end_idx] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx], + key[:, start_idx:end_idx], + value[:, start_idx:end_idx], + attn_mask=attn_mask[:, start_idx:end_idx] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + return original_scaled_dot_product_attention( + query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal + ) + return hidden_states + +def attention_init(): + #ARC GPUs can't allocate more than 4GB to a single block: + torch.bmm = torch_bmm + torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention diff --git a/library/ipex/diffusers.py b/library/ipex/diffusers.py new file mode 100644 index 000000000..4c39896ed --- /dev/null +++ b/library/ipex/diffusers.py @@ -0,0 +1,119 @@ +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import +import diffusers #0.21.1 # pylint: disable=import-error +from diffusers.models.attention_processor import Attention + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +class SlicedAttnProcessor: # pylint: disable=too-few-public-methods + r""" + Processor for implementing sliced attention. + + Args: + slice_size (`int`, *optional*): + The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and + `attention_head_dim` must be a multiple of the `slice_size`. 
+ """ + + def __init__(self, slice_size): + self.slice_size = slice_size + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): # pylint: disable=too-many-statements, too-many-locals, too-many-branches + residual = hidden_states + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + dim = query.shape[-1] + query = attn.head_to_batch_dim(query) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + batch_size_attention, query_tokens, shape_three = query.shape + hidden_states = torch.zeros( + (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype + ) + + #ARC GPUs can't allocate more than 4GB to a single block, Slice it: + block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 + block_size = (batch_size_attention * query_tokens * shape_three) / 1024 * block_multiply #MB + split_2_slice_size = query_tokens + if block_size >= 4000: + do_split_2 = True + #Find something divisible with the query_tokens + while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + for i in range(batch_size_attention // self.slice_size): + start_idx = i * self.slice_size + end_idx = (i + 1) * self.slice_size + + if do_split_2: + for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + + query_slice = query[start_idx:end_idx, start_idx_2:end_idx_2] + key_slice = key[start_idx:end_idx, start_idx_2:end_idx_2] + attn_mask_slice = attention_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attention_mask is not None else None + + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) + attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx, start_idx_2:end_idx_2]) + + hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = attn_slice + else: + query_slice = query[start_idx:end_idx] + key_slice = key[start_idx:end_idx] + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None + + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) + + attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx]) + + hidden_states[start_idx:end_idx] = attn_slice + + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, 
width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + +def ipex_diffusers(): + #ARC GPUs can't allocate more than 4GB to a single block: + diffusers.models.attention_processor.SlicedAttnProcessor = SlicedAttnProcessor diff --git a/library/ipex/gradscaler.py b/library/ipex/gradscaler.py new file mode 100644 index 000000000..530212101 --- /dev/null +++ b/library/ipex/gradscaler.py @@ -0,0 +1,179 @@ +from collections import defaultdict +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import +import intel_extension_for_pytorch._C as core # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +OptState = ipex.cpu.autocast._grad_scaler.OptState +_MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator +_refresh_per_optimizer_state = ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state + +def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16): # pylint: disable=unused-argument + per_device_inv_scale = _MultiDeviceReplicator(inv_scale) + per_device_found_inf = _MultiDeviceReplicator(found_inf) + + # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype. + # There could be hundreds of grads, so we'd like to iterate through them just once. + # However, we don't know their devices or dtypes in advance. + + # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict + # Google says mypy struggles with defaultdicts type annotations. + per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated] + # sync grad to master weight + if hasattr(optimizer, "sync_grad"): + optimizer.sync_grad() + with torch.no_grad(): + for group in optimizer.param_groups: + for param in group["params"]: + if param.grad is None: + continue + if (not allow_fp16) and param.grad.dtype == torch.float16: + raise ValueError("Attempting to unscale FP16 gradients.") + if param.grad.is_sparse: + # is_coalesced() == False means the sparse grad has values with duplicate indices. + # coalesce() deduplicates indices and adds all values that have the same index. + # For scaled fp16 values, there's a good chance coalescing will cause overflow, + # so we should check the coalesced _values(). + if param.grad.dtype is torch.float16: + param.grad = param.grad.coalesce() + to_unscale = param.grad._values() + else: + to_unscale = param.grad + + # -: is there a way to split by device and dtype without appending in the inner loop? + to_unscale = to_unscale.to("cpu") + per_device_and_dtype_grads[to_unscale.device][ + to_unscale.dtype + ].append(to_unscale) + + for _, per_dtype_grads in per_device_and_dtype_grads.items(): + for grads in per_dtype_grads.values(): + core._amp_foreach_non_finite_check_and_unscale_( + grads, + per_device_found_inf.get("cpu"), + per_device_inv_scale.get("cpu"), + ) + + return per_device_found_inf._per_device_tensors + +def unscale_(self, optimizer): + """ + Divides ("unscales") the optimizer's gradient tensors by the scale factor. + :meth:`unscale_` is optional, serving cases where you need to + :ref:`modify or inspect gradients` + between the backward pass(es) and :meth:`step`. + If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`. 
+ Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients:: + ... + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + scaler.step(optimizer) + scaler.update() + Args: + optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled. + .. warning:: + :meth:`unscale_` should only be called once per optimizer per :meth:`step` call, + and only after all gradients for that optimizer's assigned parameters have been accumulated. + Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError. + .. warning:: + :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute. + """ + if not self._enabled: + return + + self._check_scale_growth_tracker("unscale_") + + optimizer_state = self._per_optimizer_states[id(optimizer)] + + if optimizer_state["stage"] is OptState.UNSCALED: # pylint: disable=no-else-raise + raise RuntimeError( + "unscale_() has already been called on this optimizer since the last update()." + ) + elif optimizer_state["stage"] is OptState.STEPPED: + raise RuntimeError("unscale_() is being called after step().") + + # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64. + assert self._scale is not None + inv_scale = self._scale.to("cpu").double().reciprocal().float().to(self._scale.device) + found_inf = torch.full( + (1,), 0.0, dtype=torch.float32, device=self._scale.device + ) + + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + optimizer, inv_scale, found_inf, False + ) + optimizer_state["stage"] = OptState.UNSCALED + +def update(self, new_scale=None): + """ + Updates the scale factor. + If any optimizer steps were skipped the scale is multiplied by ``backoff_factor`` + to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively, + the scale is multiplied by ``growth_factor`` to increase it. + Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not + used directly, it's used to fill GradScaler's internal scale tensor. So if + ``new_scale`` was a tensor, later in-place changes to that tensor will not further + affect the scale GradScaler uses internally.) + Args: + new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor. + .. warning:: + :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has + been invoked for all optimizers used this iteration. + """ + if not self._enabled: + return + + _scale, _growth_tracker = self._check_scale_growth_tracker("update") + + if new_scale is not None: + # Accept a new user-defined scale. + if isinstance(new_scale, float): + self._scale.fill_(new_scale) # type: ignore[union-attr] + else: + reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False." + assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined] + assert new_scale.numel() == 1, reason + assert new_scale.requires_grad is False, reason + self._scale.copy_(new_scale) # type: ignore[union-attr] + else: + # Consume shared inf/nan data collected from optimizers to update the scale. + # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous. 
+ found_infs = [ + found_inf.to(device="cpu", non_blocking=True) + for state in self._per_optimizer_states.values() + for found_inf in state["found_inf_per_device"].values() + ] + + assert len(found_infs) > 0, "No inf checks were recorded prior to update." + + found_inf_combined = found_infs[0] + if len(found_infs) > 1: + for i in range(1, len(found_infs)): + found_inf_combined += found_infs[i] + + to_device = _scale.device + _scale = _scale.to("cpu") + _growth_tracker = _growth_tracker.to("cpu") + + core._amp_update_scale_( + _scale, + _growth_tracker, + found_inf_combined, + self._growth_factor, + self._backoff_factor, + self._growth_interval, + ) + + _scale = _scale.to(to_device) + _growth_tracker = _growth_tracker.to(to_device) + # To prepare for next iteration, clear the data collected from optimizers this iteration. + self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state) + +def gradscaler_init(): + torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler + torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_ + torch.xpu.amp.GradScaler.unscale_ = unscale_ + torch.xpu.amp.GradScaler.update = update + return torch.xpu.amp.GradScaler diff --git a/library/ipex/hijacks.py b/library/ipex/hijacks.py new file mode 100644 index 000000000..77ed5419a --- /dev/null +++ b/library/ipex/hijacks.py @@ -0,0 +1,196 @@ +import contextlib +import importlib +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return + +class CondFunc: # pylint: disable=missing-class-docstring + def __new__(cls, orig_func, sub_func, cond_func): + self = super(CondFunc, cls).__new__(cls) + if isinstance(orig_func, str): + func_path = orig_func.split('.') + for i in range(len(func_path)-1, -1, -1): + try: + resolved_obj = importlib.import_module('.'.join(func_path[:i])) + break + except ImportError: + pass + for attr_name in func_path[i:-1]: + resolved_obj = getattr(resolved_obj, attr_name) + orig_func = getattr(resolved_obj, func_path[-1]) + setattr(resolved_obj, func_path[-1], lambda *args, **kwargs: self(*args, **kwargs)) + self.__init__(orig_func, sub_func, cond_func) + return lambda *args, **kwargs: self(*args, **kwargs) + def __init__(self, orig_func, sub_func, cond_func): + self.__orig_func = orig_func + self.__sub_func = sub_func + self.__cond_func = cond_func + def __call__(self, *args, **kwargs): + if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs): + return self.__sub_func(self.__orig_func, *args, **kwargs) + else: + return self.__orig_func(*args, **kwargs) + +_utils = torch.utils.data._utils +def _shutdown_workers(self): + if torch.utils.data._utils is None or torch.utils.data._utils.python_exit_status is True or torch.utils.data._utils.python_exit_status is None: + return + if hasattr(self, "_shutdown") and not self._shutdown: + self._shutdown = True + try: + if hasattr(self, '_pin_memory_thread'): + self._pin_memory_thread_done_event.set() + self._worker_result_queue.put((None, None)) + self._pin_memory_thread.join() + self._worker_result_queue.cancel_join_thread() + self._worker_result_queue.close() + self._workers_done_event.set() + for worker_id in range(len(self._workers)): + if self._persistent_workers or self._workers_status[worker_id]: + self._mark_worker_as_unavailable(worker_id, shutdown=True) + for w in self._workers: # pylint: disable=invalid-name + 
w.join(timeout=torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL) + for q in self._index_queues: # pylint: disable=invalid-name + q.cancel_join_thread() + q.close() + finally: + if self._worker_pids_set: + torch.utils.data._utils.signal_handling._remove_worker_pids(id(self)) + self._worker_pids_set = False + for w in self._workers: # pylint: disable=invalid-name + if w.is_alive(): + w.terminate() + +class DummyDataParallel(torch.nn.Module): # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods + def __new__(cls, module, device_ids=None, output_device=None, dim=0): # pylint: disable=unused-argument + if isinstance(device_ids, list) and len(device_ids) > 1: + print("IPEX backend doesn't support DataParallel on multiple XPU devices") + return module.to("xpu") + +def return_null_context(*args, **kwargs): # pylint: disable=unused-argument + return contextlib.nullcontext() + +def check_device(device): + return bool((isinstance(device, torch.device) and device.type == "cuda") or (isinstance(device, str) and "cuda" in device) or isinstance(device, int)) + +def return_xpu(device): + return f"xpu:{device.split(':')[-1]}" if isinstance(device, str) and ":" in device else f"xpu:{device}" if isinstance(device, int) else torch.device("xpu") if isinstance(device, torch.device) else "xpu" + +def ipex_no_cuda(orig_func, *args, **kwargs): + torch.cuda.is_available = lambda: False + orig_func(*args, **kwargs) + torch.cuda.is_available = torch.xpu.is_available + +original_autocast = torch.autocast +def ipex_autocast(*args, **kwargs): + if len(args) > 0 and args[0] == "cuda": + return original_autocast("xpu", *args[1:], **kwargs) + else: + return original_autocast(*args, **kwargs) + +original_torch_cat = torch.cat +def torch_cat(tensor, *args, **kwargs): + if len(tensor) == 3 and (tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype): + return original_torch_cat([tensor[0].to(tensor[1].dtype), tensor[1], tensor[2].to(tensor[1].dtype)], *args, **kwargs) + else: + return original_torch_cat(tensor, *args, **kwargs) + +original_interpolate = torch.nn.functional.interpolate +def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments + if antialias or align_corners is not None: + return_device = tensor.device + return_dtype = tensor.dtype + return original_interpolate(tensor.to("cpu", dtype=torch.float32), size=size, scale_factor=scale_factor, mode=mode, + align_corners=align_corners, recompute_scale_factor=recompute_scale_factor, antialias=antialias).to(return_device, dtype=return_dtype) + else: + return original_interpolate(tensor, size=size, scale_factor=scale_factor, mode=mode, + align_corners=align_corners, recompute_scale_factor=recompute_scale_factor, antialias=antialias) + +original_linalg_solve = torch.linalg.solve +def linalg_solve(A, B, *args, **kwargs): # pylint: disable=invalid-name + if A.device != torch.device("cpu") or B.device != torch.device("cpu"): + return_device = A.device + return original_linalg_solve(A.to("cpu"), B.to("cpu"), *args, **kwargs).to(return_device) + else: + return original_linalg_solve(A, B, *args, **kwargs) + +def ipex_hijacks(): + CondFunc('torch.Tensor.to', + lambda orig_func, self, device=None, *args, **kwargs: orig_func(self, return_xpu(device), *args, **kwargs), + lambda orig_func, self, device=None, *args, **kwargs: check_device(device)) + CondFunc('torch.Tensor.cuda', + lambda orig_func, self, device=None, 
*args, **kwargs: orig_func(self, return_xpu(device), *args, **kwargs), + lambda orig_func, self, device=None, *args, **kwargs: check_device(device)) + CondFunc('torch.empty', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + CondFunc('torch.load', + lambda orig_func, *args, map_location=None, **kwargs: orig_func(*args, return_xpu(map_location), **kwargs), + lambda orig_func, *args, map_location=None, **kwargs: map_location is None or check_device(map_location)) + CondFunc('torch.randn', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + CondFunc('torch.ones', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + CondFunc('torch.zeros', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + CondFunc('torch.tensor', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + CondFunc('torch.linspace', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + + CondFunc('torch.Generator', + lambda orig_func, device=None: torch.xpu.Generator(device), + lambda orig_func, device=None: device is not None and device != torch.device("cpu") and device != "cpu") + + CondFunc('torch.batch_norm', + lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(input, + weight if weight is not None else torch.ones(input.size()[1], device=input.device), + bias if bias is not None else torch.zeros(input.size()[1], device=input.device), *args, **kwargs), + lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu")) + CondFunc('torch.instance_norm', + lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(input, + weight if weight is not None else torch.ones(input.size()[1], device=input.device), + bias if bias is not None else torch.zeros(input.size()[1], device=input.device), *args, **kwargs), + lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu")) + + #Functions with dtype errors: + CondFunc('torch.nn.modules.GroupNorm.forward', + lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype) + CondFunc('torch.nn.modules.linear.Linear.forward', + lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype) + CondFunc('torch.nn.modules.conv.Conv2d.forward', + lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype) + CondFunc('torch.nn.functional.layer_norm', + lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: + orig_func(input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs), + lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: + weight is not None and input.dtype != 
weight.data.dtype) + + #Diffusers Float64 (ARC GPUs doesn't support double or Float64): + if not torch.xpu.has_fp64_dtype(): + CondFunc('torch.from_numpy', + lambda orig_func, ndarray: orig_func(ndarray.astype('float32')), + lambda orig_func, ndarray: ndarray.dtype == float) + + #Broken functions when torch.cuda.is_available is True: + CondFunc('torch.utils.data.dataloader._BaseDataLoaderIter.__init__', + lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs), + lambda orig_func, *args, **kwargs: True) + + #Functions that make compile mad with CondFunc: + torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = _shutdown_workers + torch.nn.DataParallel = DummyDataParallel + torch.autocast = ipex_autocast + torch.cat = torch_cat + torch.linalg.solve = linalg_solve + torch.nn.functional.interpolate = interpolate + torch.backends.cuda.sdp_kernel = return_null_context diff --git a/library/model_util.py b/library/model_util.py index 860c170b2..00a3c0495 100644 --- a/library/model_util.py +++ b/library/model_util.py @@ -4,6 +4,13 @@ import math import os import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass import diffusers from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextConfig, logging from diffusers import AutoencoderKL, DDIMScheduler, StableDiffusionPipeline # , UNet2DConditionModel diff --git a/sdxl_gen_img.py b/sdxl_gen_img.py index 3500df575..ac01b76e0 100755 --- a/sdxl_gen_img.py +++ b/sdxl_gen_img.py @@ -17,6 +17,13 @@ import diffusers import numpy as np import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass import torchvision from diffusers import ( AutoencoderKL, diff --git a/sdxl_minimal_inference.py b/sdxl_minimal_inference.py index 5c8a0bd89..ff865629e 100644 --- a/sdxl_minimal_inference.py +++ b/sdxl_minimal_inference.py @@ -9,6 +9,13 @@ from einops import repeat import numpy as np import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from tqdm import tqdm from transformers import CLIPTokenizer from diffusers import EulerDiscreteScheduler diff --git a/sdxl_train.py b/sdxl_train.py index 195467b00..6b255d679 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -10,6 +10,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from accelerate.utils import set_seed from diffusers import DDPMScheduler from library import sdxl_model_util diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 09cf16438..f8169bdbf 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -11,6 +11,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed from diffusers import DDPMScheduler, ControlNetModel diff --git a/sdxl_train_control_net_lllite_alt.py b/sdxl_train_control_net_lllite_alt.py index 757194a10..61ebfb581 100644 --- 
a/sdxl_train_control_net_lllite_alt.py +++ b/sdxl_train_control_net_lllite_alt.py @@ -14,6 +14,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed import accelerate diff --git a/sdxl_train_network.py b/sdxl_train_network.py index 8d3a81c3a..2de57c0ac 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -1,5 +1,12 @@ import argparse import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from library import sdxl_model_util, sdxl_train_util, train_util import train_network diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py index 123ca35a1..f5cca17b2 100644 --- a/sdxl_train_textual_inversion.py +++ b/sdxl_train_textual_inversion.py @@ -3,6 +3,13 @@ import regex import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass import open_clip from library import sdxl_model_util, sdxl_train_util, train_util diff --git a/train_controlnet.py b/train_controlnet.py index 988304f62..42da44125 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -11,6 +11,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed from diffusers import DDPMScheduler, ControlNetModel diff --git a/train_db.py b/train_db.py index 6dde7e9bf..feb147787 100644 --- a/train_db.py +++ b/train_db.py @@ -11,6 +11,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from accelerate.utils import set_seed from diffusers import DDPMScheduler diff --git a/train_network.py b/train_network.py index f752607e9..200fc2cfe 100644 --- a/train_network.py +++ b/train_network.py @@ -12,6 +12,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from accelerate.utils import set_seed from diffusers import DDPMScheduler from library import model_util diff --git a/train_textual_inversion.py b/train_textual_inversion.py index b65d524cf..1c7b7fcb2 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -7,6 +7,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from accelerate.utils import set_seed from diffusers import DDPMScheduler from transformers import CLIPTokenizer diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 79c64cbeb..2c5673be1 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -8,6 +8,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except 
Exception: + pass from accelerate.utils import set_seed import diffusers from diffusers import DDPMScheduler From b99cd2a9209a6bd46c43dc6d46965be06348fd01 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Wed, 20 Sep 2023 17:16:06 +0300 Subject: [PATCH 10/31] Update getDeviceIdListForCard --- library/ipex/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/library/ipex/__init__.py b/library/ipex/__init__.py index 9ec69012f..19ec8eea1 100644 --- a/library/ipex/__init__.py +++ b/library/ipex/__init__.py @@ -16,7 +16,6 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.device = torch.xpu.device torch.cuda.device_count = torch.xpu.device_count torch.cuda.device_of = torch.xpu.device_of - torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard torch.cuda.get_device_name = torch.xpu.get_device_name torch.cuda.get_device_properties = torch.xpu.get_device_properties torch.cuda.init = torch.xpu.init @@ -157,6 +156,13 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.get_device_properties.minor = 7 torch.cuda.ipc_collect = lambda *args, **kwargs: None torch.cuda.utilization = lambda *args, **kwargs: 0 + # getDeviceIdListForCard was renamed in https://github.com/intel/intel-extension-for-pytorch/commit/835b41fd5c8b6facf9efee8312f20699850ee592 + if hasattr(torch.xpu, 'getDeviceIdListForCard'): + torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard + torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard + else: + torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card + torch.cuda.get_device_id_list_per_card = torch.xpu.get_device_id_list_per_card ipex_hijacks() attention_init() From d5be8125b0b007eb702a33822147e986ea449071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= <865105819@qq.com> Date: Sun, 24 Sep 2023 09:51:47 +0800 Subject: [PATCH 11/31] update bitsandbytes to 0.41.1 and fix bugs with generate_controlnet_subsets_config for training (#823) * update for bnb 0.41.1 * fixed generate_controlnet_subsets_config for training * Revert "update for bnb 0.41.1" This reverts commit 70bd3612d84778d491fc8006b8b9f9e21c4d2eb8. 
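For readers skimming the series, the net effect of this patch on generate_controlnet_subsets_config is easier to see outside the hunk below. The following is a minimal standalone sketch, not the committed code; the name generate_subsets_sketch and the plain-dict return shape are illustrative assumptions, and the real helper lives in library/config_util.py:

from typing import List, Optional

def generate_subsets_sketch(
    train_data_dir: Optional[str],
    conditioning_data_dir: Optional[str],
    caption_extension: str,
) -> List[dict]:
    # After the fix, a single subset entry is emitted for train_data_dir
    # itself, instead of one entry per subdirectory of train_data_dir.
    if train_data_dir is None:
        return []
    return [
        {
            "image_dir": train_data_dir,
            "conditioning_data_dir": conditioning_data_dir,
            "caption_extension": caption_extension,
            "num_repeats": 1,
        }
    ]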
--- library/config_util.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 813483e7e..e8e0fda7c 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -547,17 +547,13 @@ def generate(base_dir: Optional[str]): return [] subsets_config = [] - for subdir in base_dir.iterdir(): - if not subdir.is_dir(): - continue - - subset_config = {"image_dir": str(subdir), "conditioning_data_dir": conditioning_data_dir, "caption_extension": caption_extension, "num_repeats": 1} - subsets_config.append(subset_config) + subset_config = {"image_dir": train_data_dir, "conditioning_data_dir": conditioning_data_dir, "caption_extension": caption_extension, "num_repeats": 1} + subsets_config.append(subset_config) return subsets_config subsets_config = [] - subsets_config += generate(train_data_dir, False) + subsets_config += generate(train_data_dir) return subsets_config From 55886a01168bcd921145ab19f4635bf987acc472 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Sep 2023 11:25:54 +0900 Subject: [PATCH 12/31] add .pt and .pth as available extensions --- networks/resize_lora.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/networks/resize_lora.py b/networks/resize_lora.py index 41585f79c..0bc263991 100644 --- a/networks/resize_lora.py +++ b/networks/resize_lora.py @@ -283,8 +283,8 @@ def resize_lora_model(lora_sd, new_rank, save_dtype, device, dynamic_method, dyn def resize(args): - if args.save_to is None or not (args.save_to.endswith('.ckpt') or args.save_to.endswith('.safetensors')): - raise Exception("The --save_to argument must be specified and must be a .ckpt or .safetensors file.") + if args.save_to is None or not (args.save_to.endswith('.ckpt') or args.save_to.endswith('.pt') or args.save_to.endswith('.pth') or args.save_to.endswith('.safetensors')): + raise Exception("The --save_to argument must be specified and must be a .ckpt , .pt, .pth or .safetensors file.") def str_to_dtype(p): From 8052bcd5cd1f690a9899da93e3a516b7a7fda9f8 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Sep 2023 11:26:28 +0900 Subject: [PATCH 13/31] format by black --- networks/resize_lora.py | 474 +++++++++++++++++++++------------------- 1 file changed, 248 insertions(+), 226 deletions(-) diff --git a/networks/resize_lora.py b/networks/resize_lora.py index 0bc263991..8d4dafb96 100644 --- a/networks/resize_lora.py +++ b/networks/resize_lora.py @@ -13,69 +13,71 @@ # Model save and load functions + def load_state_dict(file_name, dtype): - if model_util.is_safetensors(file_name): - sd = load_file(file_name) - with safe_open(file_name, framework="pt") as f: - metadata = f.metadata() - else: - sd = torch.load(file_name, map_location='cpu') - metadata = None + if model_util.is_safetensors(file_name): + sd = load_file(file_name) + with safe_open(file_name, framework="pt") as f: + metadata = f.metadata() + else: + sd = torch.load(file_name, map_location="cpu") + metadata = None - for key in list(sd.keys()): - if type(sd[key]) == torch.Tensor: - sd[key] = sd[key].to(dtype) + for key in list(sd.keys()): + if type(sd[key]) == torch.Tensor: + sd[key] = sd[key].to(dtype) - return sd, metadata + return sd, metadata def save_to_file(file_name, model, state_dict, dtype, metadata): - if dtype is not None: - for key in list(state_dict.keys()): - if type(state_dict[key]) == 
torch.Tensor: + state_dict[key] = state_dict[key].to(dtype) - if model_util.is_safetensors(file_name): - save_file(model, file_name, metadata) - else: - torch.save(model, file_name) + if model_util.is_safetensors(file_name): + save_file(model, file_name, metadata) + else: + torch.save(model, file_name) # Indexing functions + def index_sv_cumulative(S, target): - original_sum = float(torch.sum(S)) - cumulative_sums = torch.cumsum(S, dim=0)/original_sum - index = int(torch.searchsorted(cumulative_sums, target)) + 1 - index = max(1, min(index, len(S)-1)) + original_sum = float(torch.sum(S)) + cumulative_sums = torch.cumsum(S, dim=0) / original_sum + index = int(torch.searchsorted(cumulative_sums, target)) + 1 + index = max(1, min(index, len(S) - 1)) - return index + return index def index_sv_fro(S, target): - S_squared = S.pow(2) - s_fro_sq = float(torch.sum(S_squared)) - sum_S_squared = torch.cumsum(S_squared, dim=0)/s_fro_sq - index = int(torch.searchsorted(sum_S_squared, target**2)) + 1 - index = max(1, min(index, len(S)-1)) + S_squared = S.pow(2) + s_fro_sq = float(torch.sum(S_squared)) + sum_S_squared = torch.cumsum(S_squared, dim=0) / s_fro_sq + index = int(torch.searchsorted(sum_S_squared, target**2)) + 1 + index = max(1, min(index, len(S) - 1)) - return index + return index def index_sv_ratio(S, target): - max_sv = S[0] - min_sv = max_sv/target - index = int(torch.sum(S > min_sv).item()) - index = max(1, min(index, len(S)-1)) + max_sv = S[0] + min_sv = max_sv / target + index = int(torch.sum(S > min_sv).item()) + index = max(1, min(index, len(S) - 1)) - return index + return index # Modified from Kohaku-blueleaf's extract/merge functions def extract_conv(weight, lora_rank, dynamic_method, dynamic_param, device, scale=1): out_size, in_size, kernel_size, _ = weight.size() U, S, Vh = torch.linalg.svd(weight.reshape(out_size, -1).to(device)) - + param_dict = rank_resize(S, lora_rank, dynamic_method, dynamic_param, scale) lora_rank = param_dict["new_rank"] @@ -92,17 +94,17 @@ def extract_conv(weight, lora_rank, dynamic_method, dynamic_param, device, scale def extract_linear(weight, lora_rank, dynamic_method, dynamic_param, device, scale=1): out_size, in_size = weight.size() - + U, S, Vh = torch.linalg.svd(weight.to(device)) - + param_dict = rank_resize(S, lora_rank, dynamic_method, dynamic_param, scale) lora_rank = param_dict["new_rank"] - + U = U[:, :lora_rank] S = S[:lora_rank] U = U @ torch.diag(S) Vh = Vh[:lora_rank, :] - + param_dict["lora_down"] = Vh.reshape(lora_rank, in_size).cpu() param_dict["lora_up"] = U.reshape(out_size, lora_rank).cpu() del U, S, Vh, weight @@ -113,7 +115,7 @@ def merge_conv(lora_down, lora_up, device): in_rank, in_size, kernel_size, k_ = lora_down.shape out_size, out_rank, _, _ = lora_up.shape assert in_rank == out_rank and kernel_size == k_, f"rank {in_rank} {out_rank} or kernel {kernel_size} {k_} mismatch" - + lora_down = lora_down.to(device) lora_up = lora_up.to(device) @@ -127,236 +129,256 @@ def merge_linear(lora_down, lora_up, device): in_rank, in_size = lora_down.shape out_size, out_rank = lora_up.shape assert in_rank == out_rank, f"rank {in_rank} {out_rank} mismatch" - + lora_down = lora_down.to(device) lora_up = lora_up.to(device) - + weight = lora_up @ lora_down del lora_up, lora_down return weight - + # Calculate new rank + def rank_resize(S, rank, dynamic_method, dynamic_param, scale=1): param_dict = {} - if dynamic_method=="sv_ratio": + if dynamic_method == "sv_ratio": # Calculate new dim and alpha based off ratio new_rank = index_sv_ratio(S, 
dynamic_param) + 1 - new_alpha = float(scale*new_rank) + new_alpha = float(scale * new_rank) - elif dynamic_method=="sv_cumulative": + elif dynamic_method == "sv_cumulative": # Calculate new dim and alpha based off cumulative sum new_rank = index_sv_cumulative(S, dynamic_param) + 1 - new_alpha = float(scale*new_rank) + new_alpha = float(scale * new_rank) - elif dynamic_method=="sv_fro": + elif dynamic_method == "sv_fro": # Calculate new dim and alpha based off sqrt sum of squares new_rank = index_sv_fro(S, dynamic_param) + 1 - new_alpha = float(scale*new_rank) + new_alpha = float(scale * new_rank) else: new_rank = rank - new_alpha = float(scale*new_rank) + new_alpha = float(scale * new_rank) - - if S[0] <= MIN_SV: # Zero matrix, set dim to 1 + if S[0] <= MIN_SV: # Zero matrix, set dim to 1 new_rank = 1 - new_alpha = float(scale*new_rank) - elif new_rank > rank: # cap max rank at rank + new_alpha = float(scale * new_rank) + elif new_rank > rank: # cap max rank at rank new_rank = rank - new_alpha = float(scale*new_rank) - + new_alpha = float(scale * new_rank) # Calculate resize info s_sum = torch.sum(torch.abs(S)) s_rank = torch.sum(torch.abs(S[:new_rank])) - + S_squared = S.pow(2) s_fro = torch.sqrt(torch.sum(S_squared)) s_red_fro = torch.sqrt(torch.sum(S_squared[:new_rank])) - fro_percent = float(s_red_fro/s_fro) + fro_percent = float(s_red_fro / s_fro) param_dict["new_rank"] = new_rank param_dict["new_alpha"] = new_alpha - param_dict["sum_retained"] = (s_rank)/s_sum + param_dict["sum_retained"] = (s_rank) / s_sum param_dict["fro_retained"] = fro_percent - param_dict["max_ratio"] = S[0]/S[new_rank - 1] + param_dict["max_ratio"] = S[0] / S[new_rank - 1] return param_dict def resize_lora_model(lora_sd, new_rank, save_dtype, device, dynamic_method, dynamic_param, verbose): - network_alpha = None - network_dim = None - verbose_str = "\n" - fro_list = [] - - # Extract loaded lora dim and alpha - for key, value in lora_sd.items(): - if network_alpha is None and 'alpha' in key: - network_alpha = value - if network_dim is None and 'lora_down' in key and len(value.size()) == 2: - network_dim = value.size()[0] - if network_alpha is not None and network_dim is not None: - break - if network_alpha is None: - network_alpha = network_dim - - scale = network_alpha/network_dim - - if dynamic_method: - print(f"Dynamically determining new alphas and dims based off {dynamic_method}: {dynamic_param}, max rank is {new_rank}") - - lora_down_weight = None - lora_up_weight = None - - o_lora_sd = lora_sd.copy() - block_down_name = None - block_up_name = None - - with torch.no_grad(): - for key, value in tqdm(lora_sd.items()): - weight_name = None - if 'lora_down' in key: - block_down_name = key.split(".")[0] - weight_name = key.split(".")[-1] - lora_down_weight = value - else: - continue - - # find corresponding lora_up and alpha - block_up_name = block_down_name - lora_up_weight = lora_sd.get(block_up_name + '.lora_up.' 
+ weight_name, None) - lora_alpha = lora_sd.get(block_down_name + '.alpha', None) - - weights_loaded = (lora_down_weight is not None and lora_up_weight is not None) - - if weights_loaded: - - conv2d = (len(lora_down_weight.size()) == 4) - if lora_alpha is None: - scale = 1.0 - else: - scale = lora_alpha/lora_down_weight.size()[0] - - if conv2d: - full_weight_matrix = merge_conv(lora_down_weight, lora_up_weight, device) - param_dict = extract_conv(full_weight_matrix, new_rank, dynamic_method, dynamic_param, device, scale) - else: - full_weight_matrix = merge_linear(lora_down_weight, lora_up_weight, device) - param_dict = extract_linear(full_weight_matrix, new_rank, dynamic_method, dynamic_param, device, scale) - - if verbose: - max_ratio = param_dict['max_ratio'] - sum_retained = param_dict['sum_retained'] - fro_retained = param_dict['fro_retained'] - if not np.isnan(fro_retained): - fro_list.append(float(fro_retained)) - - verbose_str+=f"{block_down_name:75} | " - verbose_str+=f"sum(S) retained: {sum_retained:.1%}, fro retained: {fro_retained:.1%}, max(S) ratio: {max_ratio:0.1f}" - - if verbose and dynamic_method: - verbose_str+=f", dynamic | dim: {param_dict['new_rank']}, alpha: {param_dict['new_alpha']}\n" - else: - verbose_str+=f"\n" - - new_alpha = param_dict['new_alpha'] - o_lora_sd[block_down_name + "." + "lora_down.weight"] = param_dict["lora_down"].to(save_dtype).contiguous() - o_lora_sd[block_up_name + "." + "lora_up.weight"] = param_dict["lora_up"].to(save_dtype).contiguous() - o_lora_sd[block_up_name + "." "alpha"] = torch.tensor(param_dict['new_alpha']).to(save_dtype) - - block_down_name = None - block_up_name = None - lora_down_weight = None - lora_up_weight = None - weights_loaded = False - del param_dict - - if verbose: - print(verbose_str) - - print(f"Average Frobenius norm retention: {np.mean(fro_list):.2%} | std: {np.std(fro_list):0.3f}") - print("resizing complete") - return o_lora_sd, network_dim, new_alpha + network_alpha = None + network_dim = None + verbose_str = "\n" + fro_list = [] + + # Extract loaded lora dim and alpha + for key, value in lora_sd.items(): + if network_alpha is None and "alpha" in key: + network_alpha = value + if network_dim is None and "lora_down" in key and len(value.size()) == 2: + network_dim = value.size()[0] + if network_alpha is not None and network_dim is not None: + break + if network_alpha is None: + network_alpha = network_dim + + scale = network_alpha / network_dim + + if dynamic_method: + print(f"Dynamically determining new alphas and dims based off {dynamic_method}: {dynamic_param}, max rank is {new_rank}") + + lora_down_weight = None + lora_up_weight = None + + o_lora_sd = lora_sd.copy() + block_down_name = None + block_up_name = None + + with torch.no_grad(): + for key, value in tqdm(lora_sd.items()): + weight_name = None + if "lora_down" in key: + block_down_name = key.split(".")[0] + weight_name = key.split(".")[-1] + lora_down_weight = value + else: + continue + + # find corresponding lora_up and alpha + block_up_name = block_down_name + lora_up_weight = lora_sd.get(block_up_name + ".lora_up." 
+ weight_name, None) + lora_alpha = lora_sd.get(block_down_name + ".alpha", None) + + weights_loaded = lora_down_weight is not None and lora_up_weight is not None + + if weights_loaded: + conv2d = len(lora_down_weight.size()) == 4 + if lora_alpha is None: + scale = 1.0 + else: + scale = lora_alpha / lora_down_weight.size()[0] + + if conv2d: + full_weight_matrix = merge_conv(lora_down_weight, lora_up_weight, device) + param_dict = extract_conv(full_weight_matrix, new_rank, dynamic_method, dynamic_param, device, scale) + else: + full_weight_matrix = merge_linear(lora_down_weight, lora_up_weight, device) + param_dict = extract_linear(full_weight_matrix, new_rank, dynamic_method, dynamic_param, device, scale) + + if verbose: + max_ratio = param_dict["max_ratio"] + sum_retained = param_dict["sum_retained"] + fro_retained = param_dict["fro_retained"] + if not np.isnan(fro_retained): + fro_list.append(float(fro_retained)) + + verbose_str += f"{block_down_name:75} | " + verbose_str += ( + f"sum(S) retained: {sum_retained:.1%}, fro retained: {fro_retained:.1%}, max(S) ratio: {max_ratio:0.1f}" + ) + + if verbose and dynamic_method: + verbose_str += f", dynamic | dim: {param_dict['new_rank']}, alpha: {param_dict['new_alpha']}\n" + else: + verbose_str += f"\n" + + new_alpha = param_dict["new_alpha"] + o_lora_sd[block_down_name + "." + "lora_down.weight"] = param_dict["lora_down"].to(save_dtype).contiguous() + o_lora_sd[block_up_name + "." + "lora_up.weight"] = param_dict["lora_up"].to(save_dtype).contiguous() + o_lora_sd[block_up_name + "." "alpha"] = torch.tensor(param_dict["new_alpha"]).to(save_dtype) + + block_down_name = None + block_up_name = None + lora_down_weight = None + lora_up_weight = None + weights_loaded = False + del param_dict + + if verbose: + print(verbose_str) + + print(f"Average Frobenius norm retention: {np.mean(fro_list):.2%} | std: {np.std(fro_list):0.3f}") + print("resizing complete") + return o_lora_sd, network_dim, new_alpha def resize(args): - if args.save_to is None or not (args.save_to.endswith('.ckpt') or args.save_to.endswith('.pt') or args.save_to.endswith('.pth') or args.save_to.endswith('.safetensors')): - raise Exception("The --save_to argument must be specified and must be a .ckpt , .pt, .pth or .safetensors file.") - - - def str_to_dtype(p): - if p == 'float': - return torch.float - if p == 'fp16': - return torch.float16 - if p == 'bf16': - return torch.bfloat16 - return None - - if args.dynamic_method and not args.dynamic_param: - raise Exception("If using dynamic_method, then dynamic_param is required") - - merge_dtype = str_to_dtype('float') # matmul method above only seems to work in float32 - save_dtype = str_to_dtype(args.save_precision) - if save_dtype is None: - save_dtype = merge_dtype - - print("loading Model...") - lora_sd, metadata = load_state_dict(args.model, merge_dtype) - - print("Resizing Lora...") - state_dict, old_dim, new_alpha = resize_lora_model(lora_sd, args.new_rank, save_dtype, args.device, args.dynamic_method, args.dynamic_param, args.verbose) - - # update metadata - if metadata is None: - metadata = {} - - comment = metadata.get("ss_training_comment", "") - - if not args.dynamic_method: - metadata["ss_training_comment"] = f"dimension is resized from {old_dim} to {args.new_rank}; {comment}" - metadata["ss_network_dim"] = str(args.new_rank) - metadata["ss_network_alpha"] = str(new_alpha) - else: - metadata["ss_training_comment"] = f"Dynamic resize with {args.dynamic_method}: {args.dynamic_param} from {old_dim}; {comment}" - 
metadata["ss_network_dim"] = 'Dynamic' - metadata["ss_network_alpha"] = 'Dynamic' + if args.save_to is None or not ( + args.save_to.endswith(".ckpt") + or args.save_to.endswith(".pt") + or args.save_to.endswith(".pth") + or args.save_to.endswith(".safetensors") + ): + raise Exception("The --save_to argument must be specified and must be a .ckpt , .pt, .pth or .safetensors file.") + + def str_to_dtype(p): + if p == "float": + return torch.float + if p == "fp16": + return torch.float16 + if p == "bf16": + return torch.bfloat16 + return None + + if args.dynamic_method and not args.dynamic_param: + raise Exception("If using dynamic_method, then dynamic_param is required") + + merge_dtype = str_to_dtype("float") # matmul method above only seems to work in float32 + save_dtype = str_to_dtype(args.save_precision) + if save_dtype is None: + save_dtype = merge_dtype + + print("loading Model...") + lora_sd, metadata = load_state_dict(args.model, merge_dtype) + + print("Resizing Lora...") + state_dict, old_dim, new_alpha = resize_lora_model( + lora_sd, args.new_rank, save_dtype, args.device, args.dynamic_method, args.dynamic_param, args.verbose + ) + + # update metadata + if metadata is None: + metadata = {} + + comment = metadata.get("ss_training_comment", "") + + if not args.dynamic_method: + metadata["ss_training_comment"] = f"dimension is resized from {old_dim} to {args.new_rank}; {comment}" + metadata["ss_network_dim"] = str(args.new_rank) + metadata["ss_network_alpha"] = str(new_alpha) + else: + metadata[ + "ss_training_comment" + ] = f"Dynamic resize with {args.dynamic_method}: {args.dynamic_param} from {old_dim}; {comment}" + metadata["ss_network_dim"] = "Dynamic" + metadata["ss_network_alpha"] = "Dynamic" - model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) - metadata["sshs_model_hash"] = model_hash - metadata["sshs_legacy_hash"] = legacy_hash + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash - print(f"saving model to: {args.save_to}") - save_to_file(args.save_to, state_dict, state_dict, save_dtype, metadata) + print(f"saving model to: {args.save_to}") + save_to_file(args.save_to, state_dict, state_dict, save_dtype, metadata) def setup_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - - parser.add_argument("--save_precision", type=str, default=None, - choices=[None, "float", "fp16", "bf16"], help="precision in saving, float if omitted / 保存時の精度、未指定時はfloat") - parser.add_argument("--new_rank", type=int, default=4, - help="Specify rank of output LoRA / 出力するLoRAのrank (dim)") - parser.add_argument("--save_to", type=str, default=None, - help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors") - parser.add_argument("--model", type=str, default=None, - help="LoRA model to resize at to new rank: ckpt or safetensors file / 読み込むLoRAモデル、ckptまたはsafetensors") - parser.add_argument("--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う") - parser.add_argument("--verbose", action="store_true", - help="Display verbose resizing information / rank変更時の詳細情報を出力する") - parser.add_argument("--dynamic_method", type=str, default=None, choices=[None, "sv_ratio", "sv_fro", "sv_cumulative"], - help="Specify dynamic resizing method, --new_rank is used as a hard limit for max rank") - parser.add_argument("--dynamic_param", type=float, default=None, - 
help="Specify target for dynamic reduction") - - return parser - - -if __name__ == '__main__': - parser = setup_parser() - - args = parser.parse_args() - resize(args) + parser = argparse.ArgumentParser() + + parser.add_argument( + "--save_precision", + type=str, + default=None, + choices=[None, "float", "fp16", "bf16"], + help="precision in saving, float if omitted / 保存時の精度、未指定時はfloat", + ) + parser.add_argument("--new_rank", type=int, default=4, help="Specify rank of output LoRA / 出力するLoRAのrank (dim)") + parser.add_argument( + "--save_to", type=str, default=None, help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors" + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="LoRA model to resize at to new rank: ckpt or safetensors file / 読み込むLoRAモデル、ckptまたはsafetensors", + ) + parser.add_argument("--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う") + parser.add_argument("--verbose", action="store_true", help="Display verbose resizing information / rank変更時の詳細情報を出力する") + parser.add_argument( + "--dynamic_method", + type=str, + default=None, + choices=[None, "sv_ratio", "sv_fro", "sv_cumulative"], + help="Specify dynamic resizing method, --new_rank is used as a hard limit for max rank", + ) + parser.add_argument("--dynamic_param", type=float, default=None, help="Specify target for dynamic reduction") + + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + resize(args) From f8af6f9ba5044570c9e042753e8f235ca88dd6e6 Mon Sep 17 00:00:00 2001 From: chrisbbo Date: Sun, 24 Sep 2023 10:48:08 +0800 Subject: [PATCH 14/31] Add Chinese zh-CN --- .vscode/settings.json | 3 +- localizations/zh-CN.json | 361 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 363 insertions(+), 1 deletion(-) create mode 100644 localizations/zh-CN.json diff --git a/.vscode/settings.json b/.vscode/settings.json index c08f3e9a0..bea2e8fdf 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,5 @@ { "python.linting.enabled": true, - "python.formatting.provider": "yapf" + "python.formatting.provider": "yapf", + "DockerRun.DisableDockerrc": true } \ No newline at end of file diff --git a/localizations/zh-CN.json b/localizations/zh-CN.json new file mode 100644 index 000000000..4d6037538 --- /dev/null +++ b/localizations/zh-CN.json @@ -0,0 +1,361 @@ + + { + + "-Need to add resources here": "-需要在这里添加资源", + "(Experimental, Optional) Since the latent is close to a normal distribution, it may be a good idea to specify a value around 1/10 the noise offset.": "(实验性,可选)由于潜在变量接近正态分布,指定一个接近噪声偏移1/10的值可能是个好主意。", + "(Optional) Add training comment to be included in metadata": "(可选)增加训练注释到元数据", + "(Optional) Enforce number of epoch": "(可选)强制每批数量", + "(Optional) Save only the specified number of models (old models will be deleted)": "(可选)仅保存指定数量的模型(旧模型将被删除)", + "(Optional) Save only the specified number of states (old models will be deleted)": "(可选)仅保存指定数量的状态(旧模型将被删除)", + "(Optional) Stable Diffusion base model": "(可选)稳定扩散基础模型", + "(Optional) Stable Diffusion model": "(可选)稳定扩散模型", + "(Optional) The model is saved every specified steps": "(可选)模型每隔指定的步数保存一次", + "(Optional)": "(可选)", + "About SDXL training": "关于SDXL培训", + "Adaptive noise scale": "自适应噪声比例", + "Additional parameters": "额外参数", + "Advanced options": "高级选项", + "Advanced parameters": "高级参数", + "Advanced": "增强", + "ashleykleynhans runpod docker builds": "ashleykleynhans runpod docker构建", + "Automatically determine the 
dim(rank) from the weight file.": "从权重文件自动确定dim(排名)。", + "Autosave": "自动保存", + "Basic Captioning": "基本字幕", + "Basic": "基础", + "Batch size": "批量大小", + "BLIP Captioning": "BLIP字幕", + "Bucket resolution steps": "桶分辨率步骤", + "Built with Gradio": "使用Gradio构建", + "Cache latents to disk": "缓存潜变量到磁盘", + "Cache latents": "缓存潜变量", + "Caption file extension": "标题文件扩展名", + "Caption text": "标题文本", + "caption": "标题", + "Change History": "更改历史", + "Class prompt": "Class类提示", + "Color augmentation": "颜色增强", + "Configuration file": "配置文件", + "constant_with_warmup": "带预热的常数", + "constant": "常数", + "Conv Dimension (Rank)": "卷积维度(Rank)", + "Conv Dimension": "卷积维度", + "Convert model": "转换模型", + "Copy info to Folders Tab": "复制信息到文件夹", + "cosine_with_restarts": "带重启的余弦函数学习率的方法", + "cosine": "余弦函数", + "CrossAttention": "交叉注意力", + "DANGER!!! -- Insecure folder renaming -- DANGER!!!": "危险!!!-- 不安全的文件夹重命名 -- 危险!!!", + "Dataset folder": "数据集文件夹", + "Dataset preparation": "数据集准备", + "Dataset Preparation": "数据集准备", + "Dataset repeats": "数据集重复", + "Desired LoRA rank": "期望的LoRA秩", + "Destination training directory": "训练结果目录", + "Device": "设备", + "DIM from weights": "从权重获取DIM", + "Directory containing the images to caption": "包含要添加标题的图像的目录", + "Directory containing the training images": "直接包含训练图片", + "Directory where formatted training and regularisation folders will be placed": "训练和正则化文件会被取代", + "Disable CP decomposition": "禁用CP分解", + "Do not copy other files in the input folder to the output folder": "不要将输入文件夹中的其他文件复制到输出文件夹", + "Do not copy other files": "不复制其他文件", + "Don't upscale bucket resolution": "不要放大桶分辨率", + "Dreambooth/LoRA Dataset balancing": "Dreambooth/LoRA数据集平衡", + "Dreambooth/LoRA Folder preparation": "Dreambooth/LoRA文件准备", + "Dropout caption every n epochs": "每n个时代丢弃标题", + "DyLoRA model": "DyLoRA模型", + "Dynamic method": "动态方法", + "Dynamic parameter": "动态参数", + "e.g., \"by some artist\". Leave empty if you only want to add a prefix or postfix.": "例如,\"由某个艺术家创作\"。如果您只想添加前缀或后缀,请留空。", + "e.g., \"by some artist\". Leave empty if you want to replace with nothing.": "例如,\"由某个艺术家创作\"。如果您想用空白替换,请留空。", + "Enable buckets": "启用数据容器buckets", + "Enable for Hugging Face's stabilityai models": "启用Hugging Face的stabilityai模型", + "Enter one sample prompt per line to generate multiple samples per cycle. Optional specifiers include: --w (width), --h (height), --d (seed), --l (cfg scale), --s (sampler steps) and --n (negative prompt). 
To modify sample prompts during training, edit the prompt.txt file in the samples directory.": "每行输入一个样本提示以生成每个周期的多个样本。可选指定符包括:--w(宽度),--h(高度),--d(种子),--l(cfg比例),--s(采样器步骤)和--n(负提示)。要在训练期间修改样本提示,请编辑样本目录中的prompt.txt文件。", + "Epoch": "数量增加", + "Error": "错误", + "Example of the optimizer settings for Adafactor with the fixed learning rate:": "具有固定学习率的Adafactor优化器设置的示例:", + "Extract DyLoRA": "提取DyLoRA", + "Extract LoRA model": "提取LoRA模型", + "Extract LoRA": "提取LoRA", + "Extract LyCORIS LoCon": "提取LyCORIS LoCon", + "Extract LyCORIS LoCON": "提取LyCORIS LoCON", + "FileNotFoundError": "FileNotFoundError", + "Find text": "查找文本", + "Finetune": "微调", + "Finetuned model": "微调模型", + "Finetuning Resource Guide": "微调资源指南", + "fixed": "固定", + "Flip augmentation": "翻转增强", + "float16": "float16", + "Folders": "文件夹", + "Full bf16 training (experimental)": "完全bf16训练(实验性)", + "Full fp16 training (experimental)": "完全fp16训练(实验性)", + "Generate caption files for the grouped images based on their folder name": "根据其文件夹名称为分组图片生成标题文件", + "Generate caption metadata": "生成标题元数据", + "Generate Captions": "生成标题", + "Generate image buckets metadata": "生成图像存储桶元数据", + "GIT Captioning": "GIT字幕", + "Gradient accumulate steps": "渐变积累步骤", + "Gradient checkpointing": "渐变检查点", + "Group size": "Group大小", + "Guidelines for SDXL Finetuning": "SDXL微调指南", + "Guides": "指南", + "How to Create a LoRA Part 1: Dataset Preparation:": "如何创建LoRA第1部分:数据集准备:", + "If unchecked, tensorboard will be used as the default for logging.": "如果未选中,tensorboard将用作日志记录的默认选项。", + "If you have valuable resources to add, kindly create a PR on Github.": "如果您有有价值的资源要添加,请在Github上创建一个PR。", + "Ignore Imported Tags Above Word Count": "忽略高于字数计数的导入标签", + "Image folder to caption": "要添加标题的图像文件夹", + "Image folder": "图片文件夹", + "Include images in subfolders as well": "同时包括子文件夹中的图片", + "Include Subfolders": "包括子文件夹", + "Init word": "初始化词", + "Input folder": "输入文件夹", + "Install Location": "安装位置", + "Installation": "安装", + "Instance prompt": "实例提示", + "Keep n tokens": "保留n个令牌", + "Launching the GUI on Linux and macOS": "在Linux和macOS上启动GUI", + "Launching the GUI on Windows": "在Windows上启动GUI", + "Learning rate": "学习率", + "linear": "线性", + "Linux and macOS Upgrade": "Linux和macOS升级", + "Linux and macOS": "Linux和macOS", + "Linux Pre-requirements": "Linux预先要求", + "Load": "加载", + "Loading...": "载入中...", + "Local docker build": "本地Docker构建", + "Logging folder": "日志文件夹", + "LoRA model \"A\"": "LoRA模型“A”", + "LoRA model \"B\"": "LoRA模型“B”", + "LoRA model \"C\"": "LoRA模型“C”", + "LoRA model \"D\"": "LoRA模型“D”", + "LoRA model": "LoRA模型", + "LoRA network weights": "LoRA网络权重", + "LoRA": "LoRA", + "LR number of cycles": "学习率周期数", + "LR power": "学习率功率", + "LR scheduler extra arguments": "学习率调度器额外参数", + "LR Scheduler": "学习率调度器", + "LR warmup (% of steps)": "学习率预热(%的步数)", + "LyCORIS model": "LyCORIS模型", + "Macos is not great at the moment.": "目前MacOS的支持不是很好。", + "Manual Captioning": "手动字幕", + "Manual installation": "手动安装", + "Max bucket resolution": "最大存储桶分辨率", + "Max length": "最大长度", + "Max num workers for DataLoader": "DataLoader的最大工作人员数量", + "Max resolution": "最大分辨率", + "Max Timestep": "最大时间步", + "Max Token Length": "最大令牌长度", + "Max train epoch": "每批数量", + "Max train steps": "最大训练步数", + "Maximum bucket resolution": "最大数据容器存储桶分辨率", + "Maximum size in pixel a bucket can be (>= 64)": "可以达到的最大像素尺寸(>= 64)", + "Memory efficient attention": "内存高效注意力", + "Merge LoRA (SVD)": "合并LoRA(SVD)", + "Merge LoRA": "合并LoRA", + "Merge LyCORIS": "合并LyCORIS", + "Merge model": "合并模型", + "Merge precision": "合并精度", + 
"Merge ratio model A": "模型A合并比例", + "Merge ratio model B": "模型B合并比例", + "Merge ratio model C": "模型C合并比例", + "Merge ratio model D": "模型D合并比例", + "Min bucket resolution": "最小数据容器存储桶分辨率", + "Min length": "最小长度", + "Min SNR gamma": "最小SNR伽玛", + "Min Timestep": "最小时间步", + "Minimum bucket resolution": "最小数据容器存储桶分辨率", + "Minimum size in pixel a bucket can be": "数据容器存储桶的最小像素大小", + "Mixed precision": "混合精度", + "Mnimum difference": "最小差异", + "Mode": "模式", + "Model A merge ratio (eg: 0.5 mean 50%)": "模型A合并比率(例如:0.5意味着50%)", + "Model B merge ratio (eg: 0.5 mean 50%)": "模型B合并比率(例如:0.5意味着50%)", + "Model C merge ratio (eg: 0.5 mean 50%)": "模型C合并比率(例如:0.5意味着50%)", + "Model D merge ratio (eg: 0.5 mean 50%)": "模型D合并比率(例如:0.5意味着50%)", + "Model output folder": "模型输出文件夹", + "Model output name": "模型输出文件夹", + "Model Quick Pick": "快速选择模型", + "Module dropout": "模块丢失", + "Network Dimension (Rank)": "网络维度(秩)", + "Network Dimension": "网络维度", + "Network dropout": "网络丢失", + "No module called tkinter": "没有名为tkinter的模块", + "No token padding": "无令牌填充", + "Noise offset type": "噪声偏移类型", + "Noise offset": "噪声偏移", + "Number of beams": "beam的数量 - 由于同时考虑多个解决方案,beam搜索能够减少错误累积,从而提高最终解决方案的质量。", + "Number of CPU threads per core": "每个核心的CPU线程数", + "Number of images to group together": "要一起分组的图像数量", + "Number of updates steps to accumulate before performing a backward/update pass": "执行反向/更新传递之前需要积累的更新步骤数", + "object template": "对象模板", + "Only for SD v2 models. By scaling the loss according to the time step, the weights of global noise prediction and local noise prediction become the same, and the improvement of details may be expected.": "仅适用于SD v2模型。通过根据时间步长缩放损失,全局噪声预测和局部噪声预测的权重变得相同,可以期望细节的改进。", + "Open": "打开", + "Optimizer extra arguments": "优化器额外参数", + "Optimizer": "优化器", + "Optional: CUDNN 8.6": "可选:CUDNN 8.6", + "Original": "原始", + "Output folder": "输出文件夹", + "Output": "输出", + "Overwrite existing captions in folder": "覆盖文件夹中现有的标题", + "Page File Limit": "页面文件限制", + "PagedAdamW8bit": "分页AdamW8位", + "PagedLion8bit": "分页Lion8位", + "Parameters": "参数", + "path for the checkpoint file to save...": "保存检查点文件的路径...", + "path for the LoRA file to save...": "保存LoRA文件的路径...", + "path for the new LoRA file to save...": "保存新LoRA文件的路径...", + "path to \"last-state\" state folder to resume from": "从中恢复的“最后状态”状态文件夹的路径", + "Path to the DyLoRA model to extract from": "要从中提取的DyLoRA模型的路径", + "Path to the finetuned model to extract": "要提取的微调模型的路径", + "Path to the LoRA A model": "LoRA A模型的路径", + "Path to the LoRA B model": "LoRA B模型的路径", + "Path to the LoRA C model": "LoRA C模型的路径", + "Path to the LoRA D model": "LoRA D模型的路径", + "Path to the LoRA model to verify": "要验证的LoRA模型的路径", + "Path to the LoRA to resize": "要调整大小的LoRA的路径", + "Path to the LyCORIS model": "LyCORIS模型的路径", + "path where to save the extracted LoRA model...": "保存提取出的LoRA模型的路径...", + "Persistent data loader": "持久数据加载器", + "polynomial": "多项式", + "Postfix to add to BLIP caption": "添加到BLIP标题的后缀", + "Postfix to add to caption": "添加到标题的后缀", + "Pre-built Runpod template": "预构建的Runpod模板", + "Prefix to add to BLIP caption": "添加到BLIP标题的前缀", + "Prefix to add to caption": "添加到标题的前缀", + "Prepare training data": "准备训练数据", + "Print training command": "打印训练命令", + "Prior loss weight": "先验损失权重", + "Prodigy": "神童", + "Provide a SD file path IF you want to merge it with LoRA files": "如果您想将其与LoRA文件合并,请提供SD文件路径", + "Provide a SD file path that you want to merge with the LyCORIS file": "提供您想与LyCORIS文件合并的SD文件路径", + "PyTorch 2 seems to use slightly less GPU memory than PyTorch 1.": "PyTorch 2似乎使用的GPU内存比PyTorch 
1略少。", + "Quick Tags": "快速标签", + "Random crop instead of center crop": "随机裁剪而非中心裁剪", + "Rank dropout": "排名丢失", + "Rate of caption dropout": "标题丢失率", + "Recommended value of 0.5 when used": "使用时推荐值为0.5", + "Recommended value of 5 when used": "使用时推荐值为5", + "recommended values are 0.05 - 0.15": "推荐值为0.05 - 0.15", + "Regularisation folder": "正则化文件夹", + "Regularisation images": "正则化图像", + "Repeats": "重复", + "Replacement text": "替换文本", + "Required bitsandbytes >= 0.36.0": "所需的bitsandbytes >= 0.36.0", + "Resize LoRA": "调整LoRA尺寸", + "Resize model": "调整模型大小", + "Resolution (width,height)": "分辨率(宽度,高度)", + "Resource Contributions": "资源贡献", + "Resume from saved training state": "从保存的训练状态恢复", + "Resume TI training": "恢复TI训练", + "Runpod": "Runpod", + "Sample every n epochs": "每n个时代采样一次", + "Sample every n steps": "每n步采样一次", + "Sample image generation during training": "培训期间的样本图像生成", + "Sample prompts": "样本提示", + "Sample sampler": "样本采样器", + "Samples": "样例", + "Save dtype": "保存数据类型", + "Save every N epochs": "每N个epochs保存", + "Save every N steps": "每N步保存一次", + "Save last N steps state": "保存最后N步状态", + "Save last N steps": "保存最后N步", + "Save precision": "保存精度", + "Save to": "保存到", + "Save trained model as": "保存训练模型为", + "Save training state": "保存训练状态", + "Save": "保存", + "Scale v prediction loss": "缩放v预测损失", + "Scale weight norms": "缩放权重规范", + "SD Model": "SD模型", + "SDXL model": "SDXL模型", + "Set the Max resolution to at least 1024x1024, as this is the standard resolution for SDXL. ": "将 最大分辨率 设置为至少 1024x1024,因为这是 SDXL 的标准分辨率。", + "Set the Max resolution to at least 1024x1024, as this is the standard resolution for SDXL.": "将最大分辨率设置为至少1024x1024,因为这是SDXL的标准分辨率。", + "Setup": "设置", + "SGDNesterov": "SGD Nesterov", + "SGDNesterov8bit": "SGD Nesterov 8位", + "Shuffle caption": "随机标题", + "Source LoRA": "源LoRA", + "Source model type": "源模型类型", + "Source model": "模型来源", + "Sparsity": "稀疏性", + "Stable Diffusion base model": "稳定扩散基础模型", + "Stable Diffusion original model: ckpt or safetensors file": "稳定扩散原始模型:ckpt或safetensors文件", + "Start tensorboard": "开始 tensorboard", + "Start training": "开始训练", + "Starting GUI Service": "启动GUI服务", + "Stop tensorboard": "结束 tensorboard", + "Stop text encoder training": "停止文本编码器训练", + "Stop training": "停止训练", + "style template": "样式模板", + "sv_fro": "sv_fro", + "Target model folder": "目标模型文件夹", + "Target model name": "目标模型名称", + "Target model precision": "目标模型精度", + "Target model type": "目标模型类型", + "Template": "模板", + "Text Encoder learning rate": "文本编码器学习率", + "The fine-tuning can be done with 24GB GPU memory with the batch size of 1.": "微调可以在具有1个批量大小的24GB GPU内存上完成。", + "The GUI allows you to set the training parameters and generate and run the required CLI commands to train the model.": "该GUI允许您设置训练参数,并生成并运行训练模型所需的CLI命令。", + "This guide is a resource compilation to facilitate the development of robust LoRA models.": "该指南是一个资源汇编,以促进强大LoRA模型的开发。", + "This section provide Dreambooth tools to help setup your dataset…": "这些选择帮助设置自己的数据集", + "This section provide LoRA tools to help setup your dataset…": "本节提供LoRA工具以帮助您设置数据集...", + "This section provide Various Finetuning guides and information…": "本节提供各种微调指南和信息", + "This utility allows quick captioning and tagging of images.": "此工具允许快速地为图像添加标题和标签。", + "This utility allows you to create simple caption files for each image in a folder.": "此工具允许您为文件夹中的每个图像创建简单的标题文件。", + "This utility can be used to convert from one stable diffusion model format to another.": "该工具可用于将一个稳定扩散模型格式转换为另一种格式", + "This utility can extract a DyLoRA network from a 
finetuned model.": "该工具可以从微调模型中提取DyLoRA网络。", + "This utility can extract a LoRA network from a finetuned model.": "该工具可以从微调模型中提取LoRA网络。", + "This utility can extract a LyCORIS LoCon network from a finetuned model.": "该工具可以从微调模型中提取LyCORIS LoCon网络。", + "This utility can merge a LyCORIS model into a SD checkpoint.": "该工具可以将LyCORIS模型合并到SD检查点中。", + "This utility can merge two LoRA networks together into a new LoRA.": "该工具可以将两个LoRA网络合并为一个新的LoRA。", + "This utility can merge up to 4 LoRA together or alternatively merge up to 4 LoRA into a SD checkpoint.": "该工具可以合并多达4个LoRA,或者选择性地将多达4个LoRA合并到SD检查点中。", + "This utility can resize a LoRA.": "该工具可以调整LoRA的大小。", + "This utility can verify a LoRA network to make sure it is properly trained.": "该工具可以验证LoRA网络以确保其得到适当的训练。", + "This utility uses BLIP to caption files for each image in a folder.": "此工具使用BLIP为文件夹中的每张图像添加标题。", + "This utility will create the necessary folder structure for the training images and optional regularization images needed for the kohys_ss Dreambooth/LoRA method to function correctly.": "为训练文件创建文件夹", + "This utility will ensure that each concept folder in the dataset folder is used equally during the training process of the dreambooth machine learning model, regardless of the number of images in each folder. It will do this by renaming the concept folders to indicate the number of times they should be repeated during training.": "此工具将确保在训练dreambooth机器学习模型的过程中,数据集文件夹中的每个概念文件夹都将被平等地使用,无论每个文件夹中有多少图像。它将通过重命名概念文件夹来指示在训练期间应重复使用它们的次数。", + "This utility will group images in a folder based on their aspect ratio.": "此工具将根据它们的纵横比将文件夹中的图像分组。", + "This utility will use GIT to caption files for each images in a folder.": "此工具将使用GIT为文件夹中的每张图像添加标题。", + "This utility will use WD14 to caption files for each images in a folder.": "此工具将使用WD14为文件夹中的每张图像添加标题。", + "Tips for SDXL training": "SDXL培训提示", + "Token string": "令牌字符串", + "Train a custom model using kohya finetune python code": "使用kohya微调Python代码训练个性化模型", + "Train a custom model using kohya train network LoRA python code…": "使用kohya训练网络LoRA Python代码训练自定义模型", + "Train batch size": "训练批次大小", + "Train Network": "训练网络", + "Train text encoder": "训练文本编码器", + "Train U-Net only.": "仅训练 U-Net", + "Training config folder": "训练配置文件夹", + "Training Image folder": "训练图像文件夹", + "Training images": "训练图像", + "Training steps per concept per epoch": "每个周期每个概念的训练步骤", + "Training": "训练", + "Troubleshooting": "故障排除", + "Tutorials": "教程", + "Unet learning rate": "Unet学习率", + "UNet linear projection": "UNet 线性投影", + "Upgrading": "升级", + "Use --cache_text_encoder_outputs option and caching latents.": "使用 --cache_text_encoder_outputs 选项和缓存潜在变量。", + "Use Adafactor optimizer. RMSprop 8bit or Adagrad 8bit may work. AdamW 8bit doesn’t seem to work.": "使用 Adafactor 优化器。 RMSprop 8bit 或 Adagrad 8bit 可能有效。 AdamW 8bit 好像不行。", + "Use beam search": "使用beam搜索-启发式图搜索算法,beam搜索可以用来生成更准确和自然的文本。", + "Use gradient checkpointing.": "使用梯度检查点。", + "Use latent files": "使用潜在文件", + "Use sparse biais": "使用稀疏偏见", + "Users can obtain and/or generate an api key in the their user settings on the website: https://wandb.ai/login": "用户可以在以下网站的用户设置中获取和/或生成API密钥:https://wandb.ai/login", + "V Pred like loss": "v预测损失", + "Values greater than 0 will make the model more img2img focussed. 0 = image only": "大于0的值会使模型更加聚焦在 img2img 上。0 = 仅图像。这应该表示时间步参数,大于0会使模型更加侧重 img2img 生成,0则仅关注图像生成。", + "Values lower than 1000 will make the model more img2img focussed. 
1000 = noise only": "小于1000的值会使模型更加聚焦在 img2img 上。1000 = 仅噪声。这也应该表示时间步参数,小于1000会使模型更加侧重 img2img 生成,1000则仅从噪声生成图像。", + "Vectors": "向量", + "Verbose": "详细输出", + "WANDB API Key": "WANDB API 密钥。", + "WARNING! The use of this utility on the wrong folder can lead to unexpected folder renaming!!!": "警告!在错误的文件夹上使用此工具可能导致意外的文件夹重命名!", + "WD14 Captioning": "WD14字幕", + "Windows Upgrade": "Windows升级", + "Train a custom model using kohya dreambooth python code…": "使用kohya的dreambooth Python代码训练个性化模型", + "Training comment": "训练注释", + "Train a TI using kohya textual inversion python code…": "使用kohya的文本反转Python代码训练TI模型", + "Train a custom model using kohya finetune python code…": "使用kohya的微调Python代码训练个性化模型" + +} \ No newline at end of file From 66817992c17836b0ed7f77e7c6be33b09942e46b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Sep 2023 11:50:44 +0900 Subject: [PATCH 15/31] revert formatting --- networks/resize_lora.py | 474 +++++++++++++++++++--------------------- 1 file changed, 226 insertions(+), 248 deletions(-) diff --git a/networks/resize_lora.py b/networks/resize_lora.py index 8d4dafb96..0bc263991 100644 --- a/networks/resize_lora.py +++ b/networks/resize_lora.py @@ -13,71 +13,69 @@ # Model save and load functions - def load_state_dict(file_name, dtype): - if model_util.is_safetensors(file_name): - sd = load_file(file_name) - with safe_open(file_name, framework="pt") as f: - metadata = f.metadata() - else: - sd = torch.load(file_name, map_location="cpu") - metadata = None + if model_util.is_safetensors(file_name): + sd = load_file(file_name) + with safe_open(file_name, framework="pt") as f: + metadata = f.metadata() + else: + sd = torch.load(file_name, map_location='cpu') + metadata = None - for key in list(sd.keys()): - if type(sd[key]) == torch.Tensor: - sd[key] = sd[key].to(dtype) + for key in list(sd.keys()): + if type(sd[key]) == torch.Tensor: + sd[key] = sd[key].to(dtype) - return sd, metadata + return sd, metadata def save_to_file(file_name, model, state_dict, dtype, metadata): - if dtype is not None: - for key in list(state_dict.keys()): - if type(state_dict[key]) == torch.Tensor: - state_dict[key] = state_dict[key].to(dtype) + if dtype is not None: + for key in list(state_dict.keys()): + if type(state_dict[key]) == torch.Tensor: + state_dict[key] = state_dict[key].to(dtype) - if model_util.is_safetensors(file_name): - save_file(model, file_name, metadata) - else: - torch.save(model, file_name) + if model_util.is_safetensors(file_name): + save_file(model, file_name, metadata) + else: + torch.save(model, file_name) # Indexing functions - def index_sv_cumulative(S, target): - original_sum = float(torch.sum(S)) - cumulative_sums = torch.cumsum(S, dim=0) / original_sum - index = int(torch.searchsorted(cumulative_sums, target)) + 1 - index = max(1, min(index, len(S) - 1)) + original_sum = float(torch.sum(S)) + cumulative_sums = torch.cumsum(S, dim=0)/original_sum + index = int(torch.searchsorted(cumulative_sums, target)) + 1 + index = max(1, min(index, len(S)-1)) - return index + return index def index_sv_fro(S, target): - S_squared = S.pow(2) - s_fro_sq = float(torch.sum(S_squared)) - sum_S_squared = torch.cumsum(S_squared, dim=0) / s_fro_sq - index = int(torch.searchsorted(sum_S_squared, target**2)) + 1 - index = max(1, min(index, len(S) - 1)) + S_squared = S.pow(2) + s_fro_sq = float(torch.sum(S_squared)) + sum_S_squared = torch.cumsum(S_squared, dim=0)/s_fro_sq + index = int(torch.searchsorted(sum_S_squared, target**2)) + 1 + index = max(1, min(index, len(S)-1)) - return index + return 
index def index_sv_ratio(S, target): - max_sv = S[0] - min_sv = max_sv / target - index = int(torch.sum(S > min_sv).item()) - index = max(1, min(index, len(S) - 1)) + max_sv = S[0] + min_sv = max_sv/target + index = int(torch.sum(S > min_sv).item()) + index = max(1, min(index, len(S)-1)) - return index + return index # Modified from Kohaku-blueleaf's extract/merge functions def extract_conv(weight, lora_rank, dynamic_method, dynamic_param, device, scale=1): out_size, in_size, kernel_size, _ = weight.size() U, S, Vh = torch.linalg.svd(weight.reshape(out_size, -1).to(device)) - + param_dict = rank_resize(S, lora_rank, dynamic_method, dynamic_param, scale) lora_rank = param_dict["new_rank"] @@ -94,17 +92,17 @@ def extract_conv(weight, lora_rank, dynamic_method, dynamic_param, device, scale def extract_linear(weight, lora_rank, dynamic_method, dynamic_param, device, scale=1): out_size, in_size = weight.size() - + U, S, Vh = torch.linalg.svd(weight.to(device)) - + param_dict = rank_resize(S, lora_rank, dynamic_method, dynamic_param, scale) lora_rank = param_dict["new_rank"] - + U = U[:, :lora_rank] S = S[:lora_rank] U = U @ torch.diag(S) Vh = Vh[:lora_rank, :] - + param_dict["lora_down"] = Vh.reshape(lora_rank, in_size).cpu() param_dict["lora_up"] = U.reshape(out_size, lora_rank).cpu() del U, S, Vh, weight @@ -115,7 +113,7 @@ def merge_conv(lora_down, lora_up, device): in_rank, in_size, kernel_size, k_ = lora_down.shape out_size, out_rank, _, _ = lora_up.shape assert in_rank == out_rank and kernel_size == k_, f"rank {in_rank} {out_rank} or kernel {kernel_size} {k_} mismatch" - + lora_down = lora_down.to(device) lora_up = lora_up.to(device) @@ -129,256 +127,236 @@ def merge_linear(lora_down, lora_up, device): in_rank, in_size = lora_down.shape out_size, out_rank = lora_up.shape assert in_rank == out_rank, f"rank {in_rank} {out_rank} mismatch" - + lora_down = lora_down.to(device) lora_up = lora_up.to(device) - + weight = lora_up @ lora_down del lora_up, lora_down return weight - + # Calculate new rank - def rank_resize(S, rank, dynamic_method, dynamic_param, scale=1): param_dict = {} - if dynamic_method == "sv_ratio": + if dynamic_method=="sv_ratio": # Calculate new dim and alpha based off ratio new_rank = index_sv_ratio(S, dynamic_param) + 1 - new_alpha = float(scale * new_rank) + new_alpha = float(scale*new_rank) - elif dynamic_method == "sv_cumulative": + elif dynamic_method=="sv_cumulative": # Calculate new dim and alpha based off cumulative sum new_rank = index_sv_cumulative(S, dynamic_param) + 1 - new_alpha = float(scale * new_rank) + new_alpha = float(scale*new_rank) - elif dynamic_method == "sv_fro": + elif dynamic_method=="sv_fro": # Calculate new dim and alpha based off sqrt sum of squares new_rank = index_sv_fro(S, dynamic_param) + 1 - new_alpha = float(scale * new_rank) + new_alpha = float(scale*new_rank) else: new_rank = rank - new_alpha = float(scale * new_rank) + new_alpha = float(scale*new_rank) - if S[0] <= MIN_SV: # Zero matrix, set dim to 1 + + if S[0] <= MIN_SV: # Zero matrix, set dim to 1 new_rank = 1 - new_alpha = float(scale * new_rank) - elif new_rank > rank: # cap max rank at rank + new_alpha = float(scale*new_rank) + elif new_rank > rank: # cap max rank at rank new_rank = rank - new_alpha = float(scale * new_rank) + new_alpha = float(scale*new_rank) + # Calculate resize info s_sum = torch.sum(torch.abs(S)) s_rank = torch.sum(torch.abs(S[:new_rank])) - + S_squared = S.pow(2) s_fro = torch.sqrt(torch.sum(S_squared)) s_red_fro = torch.sqrt(torch.sum(S_squared[:new_rank])) - 
fro_percent = float(s_red_fro / s_fro) + fro_percent = float(s_red_fro/s_fro) param_dict["new_rank"] = new_rank param_dict["new_alpha"] = new_alpha - param_dict["sum_retained"] = (s_rank) / s_sum + param_dict["sum_retained"] = (s_rank)/s_sum param_dict["fro_retained"] = fro_percent - param_dict["max_ratio"] = S[0] / S[new_rank - 1] + param_dict["max_ratio"] = S[0]/S[new_rank - 1] return param_dict def resize_lora_model(lora_sd, new_rank, save_dtype, device, dynamic_method, dynamic_param, verbose): - network_alpha = None - network_dim = None - verbose_str = "\n" - fro_list = [] - - # Extract loaded lora dim and alpha - for key, value in lora_sd.items(): - if network_alpha is None and "alpha" in key: - network_alpha = value - if network_dim is None and "lora_down" in key and len(value.size()) == 2: - network_dim = value.size()[0] - if network_alpha is not None and network_dim is not None: - break - if network_alpha is None: - network_alpha = network_dim - - scale = network_alpha / network_dim - - if dynamic_method: - print(f"Dynamically determining new alphas and dims based off {dynamic_method}: {dynamic_param}, max rank is {new_rank}") - - lora_down_weight = None - lora_up_weight = None - - o_lora_sd = lora_sd.copy() - block_down_name = None - block_up_name = None - - with torch.no_grad(): - for key, value in tqdm(lora_sd.items()): - weight_name = None - if "lora_down" in key: - block_down_name = key.split(".")[0] - weight_name = key.split(".")[-1] - lora_down_weight = value - else: - continue - - # find corresponding lora_up and alpha - block_up_name = block_down_name - lora_up_weight = lora_sd.get(block_up_name + ".lora_up." + weight_name, None) - lora_alpha = lora_sd.get(block_down_name + ".alpha", None) - - weights_loaded = lora_down_weight is not None and lora_up_weight is not None - - if weights_loaded: - conv2d = len(lora_down_weight.size()) == 4 - if lora_alpha is None: - scale = 1.0 - else: - scale = lora_alpha / lora_down_weight.size()[0] - - if conv2d: - full_weight_matrix = merge_conv(lora_down_weight, lora_up_weight, device) - param_dict = extract_conv(full_weight_matrix, new_rank, dynamic_method, dynamic_param, device, scale) - else: - full_weight_matrix = merge_linear(lora_down_weight, lora_up_weight, device) - param_dict = extract_linear(full_weight_matrix, new_rank, dynamic_method, dynamic_param, device, scale) - - if verbose: - max_ratio = param_dict["max_ratio"] - sum_retained = param_dict["sum_retained"] - fro_retained = param_dict["fro_retained"] - if not np.isnan(fro_retained): - fro_list.append(float(fro_retained)) - - verbose_str += f"{block_down_name:75} | " - verbose_str += ( - f"sum(S) retained: {sum_retained:.1%}, fro retained: {fro_retained:.1%}, max(S) ratio: {max_ratio:0.1f}" - ) - - if verbose and dynamic_method: - verbose_str += f", dynamic | dim: {param_dict['new_rank']}, alpha: {param_dict['new_alpha']}\n" - else: - verbose_str += f"\n" - - new_alpha = param_dict["new_alpha"] - o_lora_sd[block_down_name + "." + "lora_down.weight"] = param_dict["lora_down"].to(save_dtype).contiguous() - o_lora_sd[block_up_name + "." + "lora_up.weight"] = param_dict["lora_up"].to(save_dtype).contiguous() - o_lora_sd[block_up_name + "." 
"alpha"] = torch.tensor(param_dict["new_alpha"]).to(save_dtype) - - block_down_name = None - block_up_name = None - lora_down_weight = None - lora_up_weight = None - weights_loaded = False - del param_dict - - if verbose: - print(verbose_str) - - print(f"Average Frobenius norm retention: {np.mean(fro_list):.2%} | std: {np.std(fro_list):0.3f}") - print("resizing complete") - return o_lora_sd, network_dim, new_alpha + network_alpha = None + network_dim = None + verbose_str = "\n" + fro_list = [] + + # Extract loaded lora dim and alpha + for key, value in lora_sd.items(): + if network_alpha is None and 'alpha' in key: + network_alpha = value + if network_dim is None and 'lora_down' in key and len(value.size()) == 2: + network_dim = value.size()[0] + if network_alpha is not None and network_dim is not None: + break + if network_alpha is None: + network_alpha = network_dim + + scale = network_alpha/network_dim + + if dynamic_method: + print(f"Dynamically determining new alphas and dims based off {dynamic_method}: {dynamic_param}, max rank is {new_rank}") + + lora_down_weight = None + lora_up_weight = None + + o_lora_sd = lora_sd.copy() + block_down_name = None + block_up_name = None + + with torch.no_grad(): + for key, value in tqdm(lora_sd.items()): + weight_name = None + if 'lora_down' in key: + block_down_name = key.split(".")[0] + weight_name = key.split(".")[-1] + lora_down_weight = value + else: + continue + + # find corresponding lora_up and alpha + block_up_name = block_down_name + lora_up_weight = lora_sd.get(block_up_name + '.lora_up.' + weight_name, None) + lora_alpha = lora_sd.get(block_down_name + '.alpha', None) + + weights_loaded = (lora_down_weight is not None and lora_up_weight is not None) + + if weights_loaded: + + conv2d = (len(lora_down_weight.size()) == 4) + if lora_alpha is None: + scale = 1.0 + else: + scale = lora_alpha/lora_down_weight.size()[0] + + if conv2d: + full_weight_matrix = merge_conv(lora_down_weight, lora_up_weight, device) + param_dict = extract_conv(full_weight_matrix, new_rank, dynamic_method, dynamic_param, device, scale) + else: + full_weight_matrix = merge_linear(lora_down_weight, lora_up_weight, device) + param_dict = extract_linear(full_weight_matrix, new_rank, dynamic_method, dynamic_param, device, scale) + + if verbose: + max_ratio = param_dict['max_ratio'] + sum_retained = param_dict['sum_retained'] + fro_retained = param_dict['fro_retained'] + if not np.isnan(fro_retained): + fro_list.append(float(fro_retained)) + + verbose_str+=f"{block_down_name:75} | " + verbose_str+=f"sum(S) retained: {sum_retained:.1%}, fro retained: {fro_retained:.1%}, max(S) ratio: {max_ratio:0.1f}" + + if verbose and dynamic_method: + verbose_str+=f", dynamic | dim: {param_dict['new_rank']}, alpha: {param_dict['new_alpha']}\n" + else: + verbose_str+=f"\n" + + new_alpha = param_dict['new_alpha'] + o_lora_sd[block_down_name + "." + "lora_down.weight"] = param_dict["lora_down"].to(save_dtype).contiguous() + o_lora_sd[block_up_name + "." + "lora_up.weight"] = param_dict["lora_up"].to(save_dtype).contiguous() + o_lora_sd[block_up_name + "." 
"alpha"] = torch.tensor(param_dict['new_alpha']).to(save_dtype) + + block_down_name = None + block_up_name = None + lora_down_weight = None + lora_up_weight = None + weights_loaded = False + del param_dict + + if verbose: + print(verbose_str) + + print(f"Average Frobenius norm retention: {np.mean(fro_list):.2%} | std: {np.std(fro_list):0.3f}") + print("resizing complete") + return o_lora_sd, network_dim, new_alpha def resize(args): - if args.save_to is None or not ( - args.save_to.endswith(".ckpt") - or args.save_to.endswith(".pt") - or args.save_to.endswith(".pth") - or args.save_to.endswith(".safetensors") - ): - raise Exception("The --save_to argument must be specified and must be a .ckpt , .pt, .pth or .safetensors file.") - - def str_to_dtype(p): - if p == "float": - return torch.float - if p == "fp16": - return torch.float16 - if p == "bf16": - return torch.bfloat16 - return None - - if args.dynamic_method and not args.dynamic_param: - raise Exception("If using dynamic_method, then dynamic_param is required") - - merge_dtype = str_to_dtype("float") # matmul method above only seems to work in float32 - save_dtype = str_to_dtype(args.save_precision) - if save_dtype is None: - save_dtype = merge_dtype - - print("loading Model...") - lora_sd, metadata = load_state_dict(args.model, merge_dtype) - - print("Resizing Lora...") - state_dict, old_dim, new_alpha = resize_lora_model( - lora_sd, args.new_rank, save_dtype, args.device, args.dynamic_method, args.dynamic_param, args.verbose - ) - - # update metadata - if metadata is None: - metadata = {} - - comment = metadata.get("ss_training_comment", "") - - if not args.dynamic_method: - metadata["ss_training_comment"] = f"dimension is resized from {old_dim} to {args.new_rank}; {comment}" - metadata["ss_network_dim"] = str(args.new_rank) - metadata["ss_network_alpha"] = str(new_alpha) - else: - metadata[ - "ss_training_comment" - ] = f"Dynamic resize with {args.dynamic_method}: {args.dynamic_param} from {old_dim}; {comment}" - metadata["ss_network_dim"] = "Dynamic" - metadata["ss_network_alpha"] = "Dynamic" + if args.save_to is None or not (args.save_to.endswith('.ckpt') or args.save_to.endswith('.pt') or args.save_to.endswith('.pth') or args.save_to.endswith('.safetensors')): + raise Exception("The --save_to argument must be specified and must be a .ckpt , .pt, .pth or .safetensors file.") + + + def str_to_dtype(p): + if p == 'float': + return torch.float + if p == 'fp16': + return torch.float16 + if p == 'bf16': + return torch.bfloat16 + return None + + if args.dynamic_method and not args.dynamic_param: + raise Exception("If using dynamic_method, then dynamic_param is required") + + merge_dtype = str_to_dtype('float') # matmul method above only seems to work in float32 + save_dtype = str_to_dtype(args.save_precision) + if save_dtype is None: + save_dtype = merge_dtype + + print("loading Model...") + lora_sd, metadata = load_state_dict(args.model, merge_dtype) + + print("Resizing Lora...") + state_dict, old_dim, new_alpha = resize_lora_model(lora_sd, args.new_rank, save_dtype, args.device, args.dynamic_method, args.dynamic_param, args.verbose) + + # update metadata + if metadata is None: + metadata = {} + + comment = metadata.get("ss_training_comment", "") + + if not args.dynamic_method: + metadata["ss_training_comment"] = f"dimension is resized from {old_dim} to {args.new_rank}; {comment}" + metadata["ss_network_dim"] = str(args.new_rank) + metadata["ss_network_alpha"] = str(new_alpha) + else: + metadata["ss_training_comment"] = f"Dynamic 
resize with {args.dynamic_method}: {args.dynamic_param} from {old_dim}; {comment}" + metadata["ss_network_dim"] = 'Dynamic' + metadata["ss_network_alpha"] = 'Dynamic' - model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) - metadata["sshs_model_hash"] = model_hash - metadata["sshs_legacy_hash"] = legacy_hash + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash - print(f"saving model to: {args.save_to}") - save_to_file(args.save_to, state_dict, state_dict, save_dtype, metadata) + print(f"saving model to: {args.save_to}") + save_to_file(args.save_to, state_dict, state_dict, save_dtype, metadata) def setup_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - - parser.add_argument( - "--save_precision", - type=str, - default=None, - choices=[None, "float", "fp16", "bf16"], - help="precision in saving, float if omitted / 保存時の精度、未指定時はfloat", - ) - parser.add_argument("--new_rank", type=int, default=4, help="Specify rank of output LoRA / 出力するLoRAのrank (dim)") - parser.add_argument( - "--save_to", type=str, default=None, help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors" - ) - parser.add_argument( - "--model", - type=str, - default=None, - help="LoRA model to resize at to new rank: ckpt or safetensors file / 読み込むLoRAモデル、ckptまたはsafetensors", - ) - parser.add_argument("--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う") - parser.add_argument("--verbose", action="store_true", help="Display verbose resizing information / rank変更時の詳細情報を出力する") - parser.add_argument( - "--dynamic_method", - type=str, - default=None, - choices=[None, "sv_ratio", "sv_fro", "sv_cumulative"], - help="Specify dynamic resizing method, --new_rank is used as a hard limit for max rank", - ) - parser.add_argument("--dynamic_param", type=float, default=None, help="Specify target for dynamic reduction") - - return parser - - -if __name__ == "__main__": - parser = setup_parser() - - args = parser.parse_args() - resize(args) + parser = argparse.ArgumentParser() + + parser.add_argument("--save_precision", type=str, default=None, + choices=[None, "float", "fp16", "bf16"], help="precision in saving, float if omitted / 保存時の精度、未指定時はfloat") + parser.add_argument("--new_rank", type=int, default=4, + help="Specify rank of output LoRA / 出力するLoRAのrank (dim)") + parser.add_argument("--save_to", type=str, default=None, + help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors") + parser.add_argument("--model", type=str, default=None, + help="LoRA model to resize at to new rank: ckpt or safetensors file / 読み込むLoRAモデル、ckptまたはsafetensors") + parser.add_argument("--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う") + parser.add_argument("--verbose", action="store_true", + help="Display verbose resizing information / rank変更時の詳細情報を出力する") + parser.add_argument("--dynamic_method", type=str, default=None, choices=[None, "sv_ratio", "sv_fro", "sv_cumulative"], + help="Specify dynamic resizing method, --new_rank is used as a hard limit for max rank") + parser.add_argument("--dynamic_param", type=float, default=None, + help="Specify target for dynamic reduction") + + return parser + + +if __name__ == '__main__': + parser = setup_parser() + + args = parser.parse_args() + resize(args) From 
f2491ee0ac4727b49110764dd2862d8aa316b048 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Sep 2023 12:10:56 +0900 Subject: [PATCH 16/31] change block name doesn't contain '.' at end --- networks/resize_lora.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/networks/resize_lora.py b/networks/resize_lora.py index 39d4c9072..03fc545e7 100644 --- a/networks/resize_lora.py +++ b/networks/resize_lora.py @@ -219,7 +219,7 @@ def resize_lora_model(lora_sd, new_rank, save_dtype, device, dynamic_method, dyn for key, value in tqdm(lora_sd.items()): weight_name = None if 'lora_down' in key: - block_down_name = key.rsplit('lora_down', 1)[0] + block_down_name = key.rsplit('.lora_down', 1)[0] weight_name = key.rsplit(".", 1)[-1] lora_down_weight = value else: @@ -227,8 +227,8 @@ def resize_lora_model(lora_sd, new_rank, save_dtype, device, dynamic_method, dyn # find corresponding lora_up and alpha block_up_name = block_down_name - lora_up_weight = lora_sd.get(block_up_name + 'lora_up.' + weight_name, None) - lora_alpha = lora_sd.get(block_down_name + 'alpha', None) + lora_up_weight = lora_sd.get(block_up_name + '.lora_up.' + weight_name, None) + lora_alpha = lora_sd.get(block_down_name + '.alpha', None) weights_loaded = (lora_down_weight is not None and lora_up_weight is not None) @@ -263,9 +263,9 @@ def resize_lora_model(lora_sd, new_rank, save_dtype, device, dynamic_method, dyn verbose_str+=f"\n" new_alpha = param_dict['new_alpha'] - o_lora_sd[block_down_name + "lora_down.weight"] = param_dict["lora_down"].to(save_dtype).contiguous() - o_lora_sd[block_up_name + "lora_up.weight"] = param_dict["lora_up"].to(save_dtype).contiguous() - o_lora_sd[block_up_name + "alpha"] = torch.tensor(param_dict['new_alpha']).to(save_dtype) + o_lora_sd[block_down_name + "." + "lora_down.weight"] = param_dict["lora_down"].to(save_dtype).contiguous() + o_lora_sd[block_up_name + "." + "lora_up.weight"] = param_dict["lora_up"].to(save_dtype).contiguous() + o_lora_sd[block_up_name + "." 
"alpha"] = torch.tensor(param_dict['new_alpha']).to(save_dtype) block_down_name = None block_up_name = None From 375785523174723a85d3d46170db0145b27c907b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Sep 2023 14:34:31 +0900 Subject: [PATCH 17/31] rename train_lllite_alt to train_lllite --- sdxl_train_control_net_lllite.py | 118 +++++++++++------- ...py => sdxl_train_control_net_lllite_old.py | 118 +++++++----------- 2 files changed, 118 insertions(+), 118 deletions(-) rename sdxl_train_control_net_lllite_alt.py => sdxl_train_control_net_lllite_old.py (87%) diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index f8169bdbf..61ebfb581 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -1,3 +1,6 @@ +# cond_imageをU-Netのforwardで渡すバージョンのControlNet-LLLite検証用学習コード +# training code for ControlNet-LLLite with passing cond_image to U-Net's forward + import argparse import gc import json @@ -20,6 +23,7 @@ pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed +import accelerate from diffusers import DDPMScheduler, ControlNetModel from safetensors.torch import load_file from library import sai_model_spec, sdxl_model_util, sdxl_original_unet, sdxl_train_util @@ -41,7 +45,7 @@ apply_noise_offset, scale_v_prediction_loss_like_noise_prediction, ) -import networks.control_net_lllite as control_net_lllite +import networks.control_net_lllite_for_train as control_net_lllite_for_train # TODO 他のスクリプトと共通化する @@ -148,9 +152,6 @@ def train(args): ckpt_info, ) = sdxl_train_util.load_target_model(args, accelerator, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, weight_dtype) - # モデルに xformers とか memory efficient attention を組み込む - train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) - # 学習を準備する if cache_latents: vae.to(accelerator.device, dtype=vae_dtype) @@ -184,22 +185,53 @@ def train(args): ) accelerator.wait_for_everyone() - # prepare ControlNet - network = control_net_lllite.ControlNetLLLite(unet, args.cond_emb_dim, args.network_dim, args.network_dropout) - network.apply_to() + # prepare ControlNet-LLLite + control_net_lllite_for_train.replace_unet_linear_and_conv2d() if args.network_weights is not None: - info = network.load_weights(args.network_weights) - accelerator.print(f"load ControlNet weights from {args.network_weights}: {info}") + accelerator.print(f"initialize U-Net with ControlNet-LLLite") + with accelerate.init_empty_weights(): + unet_lllite = control_net_lllite_for_train.SdxlUNet2DConditionModelControlNetLLLite() + unet_lllite.to(accelerator.device, dtype=weight_dtype) + + unet_sd = unet.state_dict() + info = unet_lllite.load_lllite_weights(args.network_weights, unet_sd) + accelerator.print(f"load ControlNet-LLLite weights from {args.network_weights}: {info}") + else: + # cosumes large memory, so send to GPU before creating the LLLite model + accelerator.print("sending U-Net to GPU") + unet.to(accelerator.device, dtype=weight_dtype) + unet_sd = unet.state_dict() + + # init LLLite weights + accelerator.print(f"initialize U-Net with ControlNet-LLLite") + + if args.lowram: + with accelerate.init_on_device(accelerator.device): + unet_lllite = control_net_lllite_for_train.SdxlUNet2DConditionModelControlNetLLLite() + else: + unet_lllite = control_net_lllite_for_train.SdxlUNet2DConditionModelControlNetLLLite() + unet_lllite.to(weight_dtype) + + info = unet_lllite.load_lllite_weights(None, unet_sd) + accelerator.print(f"init U-Net with ControlNet-LLLite 
weights: {info}") + del unet_sd, unet + + unet: control_net_lllite_for_train.SdxlUNet2DConditionModelControlNetLLLite = unet_lllite + del unet_lllite + + unet.apply_lllite(args.cond_emb_dim, args.network_dim, args.network_dropout) + + # モデルに xformers とか memory efficient attention を組み込む + train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) if args.gradient_checkpointing: unet.enable_gradient_checkpointing() - network.enable_gradient_checkpointing() # may have no effect # 学習に必要なクラスを準備する accelerator.print("prepare optimizer, data loader etc.") - trainable_params = list(network.prepare_optimizer_params()) + trainable_params = list(unet.prepare_params()) print(f"trainable params count: {len(trainable_params)}") print(f"number of trainable parameters: {sum(p.numel() for p in trainable_params if p.requires_grad)}") @@ -232,37 +264,32 @@ def train(args): lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) # 実験的機能:勾配も含めたfp16/bf16学習を行う モデル全体をfp16/bf16にする - if args.full_fp16: - assert ( - args.mixed_precision == "fp16" - ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" - accelerator.print("enable full fp16 training.") - unet.to(weight_dtype) - network.to(weight_dtype) - elif args.full_bf16: - assert ( - args.mixed_precision == "bf16" - ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" - accelerator.print("enable full bf16 training.") - unet.to(weight_dtype) - network.to(weight_dtype) + # if args.full_fp16: + # assert ( + # args.mixed_precision == "fp16" + # ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" + # accelerator.print("enable full fp16 training.") + # unet.to(weight_dtype) + # elif args.full_bf16: + # assert ( + # args.mixed_precision == "bf16" + # ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" + # accelerator.print("enable full bf16 training.") + # unet.to(weight_dtype) + + unet.to(weight_dtype) # acceleratorがなんかよろしくやってくれるらしい - unet, network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, network, optimizer, train_dataloader, lr_scheduler - ) - network: control_net_lllite.ControlNetLLLite + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) # transform DDP after prepare (train_network here only) - unet, network = train_util.transform_models_if_DDP([unet, network]) + unet = train_util.transform_models_if_DDP([unet])[0] if args.gradient_checkpointing: unet.train() # according to TI example in Diffusers, train is required -> これオリジナルのU-Netしたので本当は外せる else: unet.eval() - network.prepare_grad_etc() - # TextEncoderの出力をキャッシュするときにはCPUへ移動する if args.cache_text_encoder_outputs: # move Text Encoders for sampling images. 
Text Encoder doesn't work on CPU with fp16 @@ -328,7 +355,13 @@ def train(args): del train_dataset_group # function for saving/removing - def save_model(ckpt_name, unwrapped_nw, steps, epoch_no, force_sync_upload=False): + def save_model( + ckpt_name, + unwrapped_nw: control_net_lllite_for_train.SdxlUNet2DConditionModelControlNetLLLite, + steps, + epoch_no, + force_sync_upload=False, + ): os.makedirs(args.output_dir, exist_ok=True) ckpt_file = os.path.join(args.output_dir, ckpt_name) @@ -336,7 +369,7 @@ def save_model(ckpt_name, unwrapped_nw, steps, epoch_no, force_sync_upload=False sai_metadata = train_util.get_sai_model_spec(None, args, True, True, False) sai_metadata["modelspec.architecture"] = sai_model_spec.ARCH_SD_XL_V1_BASE + "/control-net-lllite" - unwrapped_nw.save_weights(ckpt_file, save_dtype, sai_metadata) + unwrapped_nw.save_lllite_weights(ckpt_file, save_dtype, sai_metadata) if args.huggingface_repo_id is not None: huggingface_util.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload) @@ -351,11 +384,9 @@ def remove_model(old_ckpt_name): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 - network.on_epoch_start() # train() - for step, batch in enumerate(train_dataloader): current_step.value = global_step - with accelerator.accumulate(network): + with accelerator.accumulate(unet): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: latents = batch["latents"].to(accelerator.device) @@ -412,10 +443,9 @@ def remove_model(old_ckpt_name): with accelerator.autocast(): # conditioning imageをControlNetに渡す / pass conditioning image to ControlNet # 内部でcond_embに変換される / it will be converted to cond_emb inside - network.set_cond_image(controlnet_image) # それらの値を使いつつ、U-Netでノイズを予測する / predict noise with U-Net using those values - noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding) + noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding, controlnet_image) if args.v_parameterization: # v-parameterization training @@ -440,7 +470,7 @@ def remove_model(old_ckpt_name): accelerator.backward(loss) if accelerator.sync_gradients and args.max_grad_norm != 0.0: - params_to_clip = network.get_trainable_params() + params_to_clip = unet.get_trainable_params() accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) optimizer.step() @@ -459,7 +489,7 @@ def remove_model(old_ckpt_name): accelerator.wait_for_everyone() if accelerator.is_main_process: ckpt_name = train_util.get_step_ckpt_name(args, "." + args.save_model_as, global_step) - save_model(ckpt_name, accelerator.unwrap_model(network), global_step, epoch) + save_model(ckpt_name, accelerator.unwrap_model(unet), global_step, epoch) if args.save_state: train_util.save_and_remove_state_stepwise(args, accelerator, global_step) @@ -498,7 +528,7 @@ def remove_model(old_ckpt_name): saving = (epoch + 1) % args.save_every_n_epochs == 0 and (epoch + 1) < num_train_epochs if is_main_process and saving: ckpt_name = train_util.get_epoch_ckpt_name(args, "." 
+ args.save_model_as, epoch + 1) - save_model(ckpt_name, accelerator.unwrap_model(network), global_step, epoch + 1) + save_model(ckpt_name, accelerator.unwrap_model(unet), global_step, epoch + 1) remove_epoch_no = train_util.get_remove_epoch_no(args, epoch + 1) if remove_epoch_no is not None: @@ -513,7 +543,7 @@ def remove_model(old_ckpt_name): # end of epoch if is_main_process: - network = accelerator.unwrap_model(network) + unet = accelerator.unwrap_model(unet) accelerator.end_training() @@ -522,7 +552,7 @@ def remove_model(old_ckpt_name): if is_main_process: ckpt_name = train_util.get_last_ckpt_name(args, "." + args.save_model_as) - save_model(ckpt_name, network, global_step, num_train_epochs, force_sync_upload=True) + save_model(ckpt_name, unet, global_step, num_train_epochs, force_sync_upload=True) print("model saved.") diff --git a/sdxl_train_control_net_lllite_alt.py b/sdxl_train_control_net_lllite_old.py similarity index 87% rename from sdxl_train_control_net_lllite_alt.py rename to sdxl_train_control_net_lllite_old.py index 61ebfb581..f8169bdbf 100644 --- a/sdxl_train_control_net_lllite_alt.py +++ b/sdxl_train_control_net_lllite_old.py @@ -1,6 +1,3 @@ -# cond_imageをU-Netのforwardで渡すバージョンのControlNet-LLLite検証用学習コード -# training code for ControlNet-LLLite with passing cond_image to U-Net's forward - import argparse import gc import json @@ -23,7 +20,6 @@ pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed -import accelerate from diffusers import DDPMScheduler, ControlNetModel from safetensors.torch import load_file from library import sai_model_spec, sdxl_model_util, sdxl_original_unet, sdxl_train_util @@ -45,7 +41,7 @@ apply_noise_offset, scale_v_prediction_loss_like_noise_prediction, ) -import networks.control_net_lllite_for_train as control_net_lllite_for_train +import networks.control_net_lllite as control_net_lllite # TODO 他のスクリプトと共通化する @@ -152,6 +148,9 @@ def train(args): ckpt_info, ) = sdxl_train_util.load_target_model(args, accelerator, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, weight_dtype) + # モデルに xformers とか memory efficient attention を組み込む + train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) + # 学習を準備する if cache_latents: vae.to(accelerator.device, dtype=vae_dtype) @@ -185,53 +184,22 @@ def train(args): ) accelerator.wait_for_everyone() - # prepare ControlNet-LLLite - control_net_lllite_for_train.replace_unet_linear_and_conv2d() + # prepare ControlNet + network = control_net_lllite.ControlNetLLLite(unet, args.cond_emb_dim, args.network_dim, args.network_dropout) + network.apply_to() if args.network_weights is not None: - accelerator.print(f"initialize U-Net with ControlNet-LLLite") - with accelerate.init_empty_weights(): - unet_lllite = control_net_lllite_for_train.SdxlUNet2DConditionModelControlNetLLLite() - unet_lllite.to(accelerator.device, dtype=weight_dtype) - - unet_sd = unet.state_dict() - info = unet_lllite.load_lllite_weights(args.network_weights, unet_sd) - accelerator.print(f"load ControlNet-LLLite weights from {args.network_weights}: {info}") - else: - # cosumes large memory, so send to GPU before creating the LLLite model - accelerator.print("sending U-Net to GPU") - unet.to(accelerator.device, dtype=weight_dtype) - unet_sd = unet.state_dict() - - # init LLLite weights - accelerator.print(f"initialize U-Net with ControlNet-LLLite") - - if args.lowram: - with accelerate.init_on_device(accelerator.device): - unet_lllite = 
control_net_lllite_for_train.SdxlUNet2DConditionModelControlNetLLLite() - else: - unet_lllite = control_net_lllite_for_train.SdxlUNet2DConditionModelControlNetLLLite() - unet_lllite.to(weight_dtype) - - info = unet_lllite.load_lllite_weights(None, unet_sd) - accelerator.print(f"init U-Net with ControlNet-LLLite weights: {info}") - del unet_sd, unet - - unet: control_net_lllite_for_train.SdxlUNet2DConditionModelControlNetLLLite = unet_lllite - del unet_lllite - - unet.apply_lllite(args.cond_emb_dim, args.network_dim, args.network_dropout) - - # モデルに xformers とか memory efficient attention を組み込む - train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) + info = network.load_weights(args.network_weights) + accelerator.print(f"load ControlNet weights from {args.network_weights}: {info}") if args.gradient_checkpointing: unet.enable_gradient_checkpointing() + network.enable_gradient_checkpointing() # may have no effect # 学習に必要なクラスを準備する accelerator.print("prepare optimizer, data loader etc.") - trainable_params = list(unet.prepare_params()) + trainable_params = list(network.prepare_optimizer_params()) print(f"trainable params count: {len(trainable_params)}") print(f"number of trainable parameters: {sum(p.numel() for p in trainable_params if p.requires_grad)}") @@ -264,32 +232,37 @@ def train(args): lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) # 実験的機能:勾配も含めたfp16/bf16学習を行う モデル全体をfp16/bf16にする - # if args.full_fp16: - # assert ( - # args.mixed_precision == "fp16" - # ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" - # accelerator.print("enable full fp16 training.") - # unet.to(weight_dtype) - # elif args.full_bf16: - # assert ( - # args.mixed_precision == "bf16" - # ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" - # accelerator.print("enable full bf16 training.") - # unet.to(weight_dtype) - - unet.to(weight_dtype) + if args.full_fp16: + assert ( + args.mixed_precision == "fp16" + ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" + accelerator.print("enable full fp16 training.") + unet.to(weight_dtype) + network.to(weight_dtype) + elif args.full_bf16: + assert ( + args.mixed_precision == "bf16" + ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" + accelerator.print("enable full bf16 training.") + unet.to(weight_dtype) + network.to(weight_dtype) # acceleratorがなんかよろしくやってくれるらしい - unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + unet, network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, network, optimizer, train_dataloader, lr_scheduler + ) + network: control_net_lllite.ControlNetLLLite # transform DDP after prepare (train_network here only) - unet = train_util.transform_models_if_DDP([unet])[0] + unet, network = train_util.transform_models_if_DDP([unet, network]) if args.gradient_checkpointing: unet.train() # according to TI example in Diffusers, train is required -> これオリジナルのU-Netしたので本当は外せる else: unet.eval() + network.prepare_grad_etc() + # TextEncoderの出力をキャッシュするときにはCPUへ移動する if args.cache_text_encoder_outputs: # move Text Encoders for sampling images. 
Text Encoder doesn't work on CPU with fp16 @@ -355,13 +328,7 @@ def train(args): del train_dataset_group # function for saving/removing - def save_model( - ckpt_name, - unwrapped_nw: control_net_lllite_for_train.SdxlUNet2DConditionModelControlNetLLLite, - steps, - epoch_no, - force_sync_upload=False, - ): + def save_model(ckpt_name, unwrapped_nw, steps, epoch_no, force_sync_upload=False): os.makedirs(args.output_dir, exist_ok=True) ckpt_file = os.path.join(args.output_dir, ckpt_name) @@ -369,7 +336,7 @@ def save_model( sai_metadata = train_util.get_sai_model_spec(None, args, True, True, False) sai_metadata["modelspec.architecture"] = sai_model_spec.ARCH_SD_XL_V1_BASE + "/control-net-lllite" - unwrapped_nw.save_lllite_weights(ckpt_file, save_dtype, sai_metadata) + unwrapped_nw.save_weights(ckpt_file, save_dtype, sai_metadata) if args.huggingface_repo_id is not None: huggingface_util.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload) @@ -384,9 +351,11 @@ def remove_model(old_ckpt_name): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 + network.on_epoch_start() # train() + for step, batch in enumerate(train_dataloader): current_step.value = global_step - with accelerator.accumulate(unet): + with accelerator.accumulate(network): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: latents = batch["latents"].to(accelerator.device) @@ -443,9 +412,10 @@ def remove_model(old_ckpt_name): with accelerator.autocast(): # conditioning imageをControlNetに渡す / pass conditioning image to ControlNet # 内部でcond_embに変換される / it will be converted to cond_emb inside + network.set_cond_image(controlnet_image) # それらの値を使いつつ、U-Netでノイズを予測する / predict noise with U-Net using those values - noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding, controlnet_image) + noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding) if args.v_parameterization: # v-parameterization training @@ -470,7 +440,7 @@ def remove_model(old_ckpt_name): accelerator.backward(loss) if accelerator.sync_gradients and args.max_grad_norm != 0.0: - params_to_clip = unet.get_trainable_params() + params_to_clip = network.get_trainable_params() accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) optimizer.step() @@ -489,7 +459,7 @@ def remove_model(old_ckpt_name): accelerator.wait_for_everyone() if accelerator.is_main_process: ckpt_name = train_util.get_step_ckpt_name(args, "." + args.save_model_as, global_step) - save_model(ckpt_name, accelerator.unwrap_model(unet), global_step, epoch) + save_model(ckpt_name, accelerator.unwrap_model(network), global_step, epoch) if args.save_state: train_util.save_and_remove_state_stepwise(args, accelerator, global_step) @@ -528,7 +498,7 @@ def remove_model(old_ckpt_name): saving = (epoch + 1) % args.save_every_n_epochs == 0 and (epoch + 1) < num_train_epochs if is_main_process and saving: ckpt_name = train_util.get_epoch_ckpt_name(args, "." 
+ args.save_model_as, epoch + 1) - save_model(ckpt_name, accelerator.unwrap_model(unet), global_step, epoch + 1) + save_model(ckpt_name, accelerator.unwrap_model(network), global_step, epoch + 1) remove_epoch_no = train_util.get_remove_epoch_no(args, epoch + 1) if remove_epoch_no is not None: @@ -543,7 +513,7 @@ def remove_model(old_ckpt_name): # end of epoch if is_main_process: - unet = accelerator.unwrap_model(unet) + network = accelerator.unwrap_model(network) accelerator.end_training() @@ -552,7 +522,7 @@ def remove_model(old_ckpt_name): if is_main_process: ckpt_name = train_util.get_last_ckpt_name(args, "." + args.save_model_as) - save_model(ckpt_name, unet, global_step, num_train_epochs, force_sync_upload=True) + save_model(ckpt_name, network, global_step, num_train_epochs, force_sync_upload=True) print("model saved.") From 477b5260aafb5bd9502c7ddddf25665d83eed6b6 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Sep 2023 14:47:13 +0900 Subject: [PATCH 18/31] fix sai metadata for sdxl closes #824 --- networks/extract_lora_from_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/networks/extract_lora_from_models.py b/networks/extract_lora_from_models.py index eed30350d..dba7cd4e2 100644 --- a/networks/extract_lora_from_models.py +++ b/networks/extract_lora_from_models.py @@ -14,7 +14,7 @@ CLAMP_QUANTILE = 0.99 -MIN_DIFF = 1e-4 +MIN_DIFF = 1e-1 def save_to_file(file_name, model, state_dict, dtype): @@ -200,7 +200,7 @@ def str_to_dtype(p): if not args.no_metadata: title = os.path.splitext(os.path.basename(args.save_to))[0] sai_metadata = sai_model_spec.build_metadata( - None, args.v2, args.v_parameterization, False, True, False, time.time(), title=title + None, args.v2, args.v_parameterization, args.sdxl, True, False, time.time(), title=title ) metadata.update(sai_metadata) From 20e929e27e13b09198814f9c29df935c6c2d9c24 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Sep 2023 16:04:50 +0900 Subject: [PATCH 19/31] fix to work iter_same_seed --- gen_img_diffusers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gen_img_diffusers.py b/gen_img_diffusers.py index 0ea66cde2..70ca67942 100644 --- a/gen_img_diffusers.py +++ b/gen_img_diffusers.py @@ -3152,7 +3152,7 @@ def process_batch(batch: List[BatchData], highres_fix, highres_1st=False): print("predefined seeds are exhausted") seed = None elif args.iter_same_seed: - seeds = iter_seed + seed = iter_seed else: seed = None # 前のを消す From 7e736da30c1f67bc27cfc4f8d3148ed203569d17 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Sep 2023 17:46:57 +0900 Subject: [PATCH 20/31] update versions of accelerate and diffusers --- README.md | 9 ++++++++- networks/lora_diffusers.py | 19 ++++++++++++------- requirements.txt | 6 ++---- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 7d46dc9d8..497baff59 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,14 @@ __Stable Diffusion web UI now seems to support LoRA trained by ``sd-scripts``.__ The feature of SDXL training is now available in sdxl branch as an experimental feature. -Sep 3, 2023: The feature will be merged into the main branch soon. Following are the changes from the previous version. +Sep 24, 2023: The feature will be merged into the main branch very soon. Following are the changes from the previous version. + +- `accelerate` is updated to 0.23.0, and `diffusers` is updated to 0.21.2. Please update them with the upgrade instructions below. +- Intel ARC support with IPEX is added. 
[#825](https://github.com/kohya-ss/sd-scripts/pull/825) +- Other changes and fixes. +- Thanks for contributions from Disty0, sdbds, jvkap, rockerBOO, Symbiomatrix and others! + +Sep 3, 2023: - ControlNet-LLLite is added. See [documentation](./docs/train_lllite_README.md) for details. - JPEG XL is supported. [#786](https://github.com/kohya-ss/sd-scripts/pull/786) diff --git a/networks/lora_diffusers.py b/networks/lora_diffusers.py index c41111bea..47d75ac4d 100644 --- a/networks/lora_diffusers.py +++ b/networks/lora_diffusers.py @@ -117,7 +117,7 @@ def __init__( super().__init__() self.lora_name = lora_name - if org_module.__class__.__name__ == "Conv2d": + if org_module.__class__.__name__ == "Conv2d" or org_module.__class__.__name__ == "LoRACompatibleConv": in_dim = org_module.in_channels out_dim = org_module.out_channels else: @@ -126,7 +126,7 @@ def __init__( self.lora_dim = lora_dim - if org_module.__class__.__name__ == "Conv2d": + if org_module.__class__.__name__ == "Conv2d" or org_module.__class__.__name__ == "LoRACompatibleConv": kernel_size = org_module.kernel_size stride = org_module.stride padding = org_module.padding @@ -166,7 +166,8 @@ def unapply_to(self): self.org_module[0].forward = self.org_forward # forward with lora - def forward(self, x): + # scale is used LoRACompatibleConv, but we ignore it because we have multiplier + def forward(self, x, scale=1.0): if not self.enabled: return self.org_forward(x) return self.org_forward(x) + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale @@ -318,8 +319,12 @@ def create_modules( for name, module in root_module.named_modules(): if module.__class__.__name__ in target_replace_modules: for child_name, child_module in module.named_modules(): - is_linear = child_module.__class__.__name__ == "Linear" - is_conv2d = child_module.__class__.__name__ == "Conv2d" + is_linear = ( + child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "LoRACompatibleLinear" + ) + is_conv2d = ( + child_module.__class__.__name__ == "Conv2d" or child_module.__class__.__name__ == "LoRACompatibleConv" + ) if is_linear or is_conv2d: lora_name = prefix + "." + name + "." 
+ child_name @@ -359,7 +364,7 @@ def create_modules( skipped_te += skipped print(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.") if len(skipped_te) > 0: - print(f"skipped {len(skipped_te)} modules because of missing weight.") + print(f"skipped {len(skipped_te)} modules because of missing weight for text encoder.") # extend U-Net target modules to include Conv2d 3x3 target_modules = LoRANetwork.UNET_TARGET_REPLACE_MODULE + LoRANetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 @@ -368,7 +373,7 @@ def create_modules( self.unet_loras, skipped_un = create_modules(True, None, unet, target_modules) print(f"create LoRA for U-Net: {len(self.unet_loras)} modules.") if len(skipped_un) > 0: - print(f"skipped {len(skipped_un)} modules because of missing weight.") + print(f"skipped {len(skipped_un)} modules because of missing weight for U-Net.") # assertion names = set() diff --git a/requirements.txt b/requirements.txt index 9909ad753..4ca393f52 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -accelerate==0.19.0 +accelerate==0.23.0 transformers==4.30.2 -diffusers[torch]==0.18.2 +diffusers[torch]==0.21.2 ftfy==6.1.1 # albumentations==1.3.0 opencv-python==4.7.0.68 @@ -15,8 +15,6 @@ easygui==0.98.3 toml==0.10.2 voluptuous==0.13.1 huggingface-hub==0.15.1 -# for loading Diffusers' SDXL -invisible-watermark==0.2.0 # for BLIP captioning # requests==2.28.2 # timm==0.6.12 From 28272de97ae1a112216fd1872c989a9506bddc3e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Sep 2023 17:48:51 +0900 Subject: [PATCH 21/31] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 497baff59..0879190cc 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ The feature of SDXL training is now available in sdxl branch as an experimental Sep 24, 2023: The feature will be merged into the main branch very soon. Following are the changes from the previous version. -- `accelerate` is updated to 0.23.0, and `diffusers` is updated to 0.21.2. Please update them with the upgrade instructions below. +- `accelerate` is updated to 0.23.0, and `diffusers` is updated to 0.21.2. Dependency for `invisible-watermark` is removed. Please update them with the upgrade instructions below. - Intel ARC support with IPEX is added. [#825](https://github.com/kohya-ss/sd-scripts/pull/825) - Other changes and fixes. - Thanks for contributions from Disty0, sdbds, jvkap, rockerBOO, Symbiomatrix and others! 
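Because both `accelerate` and `diffusers` are pinned to new versions here, updating an existing checkout means re-resolving dependencies. A typical upgrade of a venv-based install might look like the following sketch (the `venv` location is an assumption about the local setup); since the `accelerate` version changed, `accelerate config` needs to be run again:

```
cd sd-scripts
git pull
.\venv\Scripts\activate
pip install --upgrade -r requirements.txt
accelerate config
```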
From 14aa2923cffe8ba4eddb75b76a55841fe4f15046 Mon Sep 17 00:00:00 2001 From: laksjdjf Date: Thu, 28 Sep 2023 14:39:32 +0900 Subject: [PATCH 22/31] Support concat LoRA --- networks/merge_lora.py | 39 +++++++++++++++++++++++++++++++----- networks/sdxl_merge_lora.py | 40 ++++++++++++++++++++++++++++++++----- 2 files changed, 69 insertions(+), 10 deletions(-) diff --git a/networks/merge_lora.py b/networks/merge_lora.py index c8d743f56..71492621e 100644 --- a/networks/merge_lora.py +++ b/networks/merge_lora.py @@ -110,7 +110,7 @@ def merge_to_sd_model(text_encoder, unet, models, ratios, merge_dtype): module.weight = torch.nn.Parameter(weight) -def merge_lora_models(models, ratios, merge_dtype): +def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): base_alphas = {} # alpha for merged model base_dims = {} @@ -158,6 +158,12 @@ def merge_lora_models(models, ratios, merge_dtype): for key in lora_sd.keys(): if "alpha" in key: continue + if "lora_up" in key and concat: + concat_dim = 1 + elif "lora_down" in key and concat: + concat_dim = 0 + else: + concat_dim = None lora_module_name = key[: key.rfind(".lora_")] @@ -165,12 +171,16 @@ def merge_lora_models(models, ratios, merge_dtype): alpha = alphas[lora_module_name] scale = math.sqrt(alpha / base_alpha) * ratio + scale = abs(scale) if "lora_up" in key else scale # マイナスの重みに対応する。 if key in merged_sd: assert ( - merged_sd[key].size() == lora_sd[key].size() + merged_sd[key].size() == lora_sd[key].size() or concat_dim is not None ), f"weights shape mismatch merging v1 and v2, different dims? / 重みのサイズが合いません。v1とv2、または次元数の異なるモデルはマージできません" - merged_sd[key] = merged_sd[key] + lora_sd[key] * scale + if concat_dim is not None: + merged_sd[key] = torch.cat([merged_sd[key], lora_sd[key] * scale], dim=concat_dim) + else: + merged_sd[key] = merged_sd[key] + lora_sd[key] * scale else: merged_sd[key] = lora_sd[key] * scale @@ -178,6 +188,13 @@ def merge_lora_models(models, ratios, merge_dtype): for lora_module_name, alpha in base_alphas.items(): key = lora_module_name + ".alpha" merged_sd[key] = torch.tensor(alpha) + if shuffle: + key_down = lora_module_name + ".lora_down.weight" + key_up = lora_module_name + ".lora_up.weight" + dim = merged_sd[key_down].shape[0] + perm = torch.randperm(dim) + merged_sd[key_down] = merged_sd[key_down][perm] + merged_sd[key_up] = merged_sd[key_up][:,perm] print("merged model") print(f"dim: {list(set(base_dims.values()))}, alpha: {list(set(base_alphas.values()))}") @@ -256,7 +273,7 @@ def str_to_dtype(p): args.v2, args.save_to, text_encoder, unet, args.sd_model, 0, 0, sai_metadata, save_dtype, vae ) else: - state_dict, metadata, v2 = merge_lora_models(args.models, args.ratios, merge_dtype) + state_dict, metadata, v2 = merge_lora_models(args.models, args.ratios, merge_dtype, args.concat, args.shuffle) print(f"calculating hashes and creating metadata...") @@ -317,7 +334,19 @@ def setup_parser() -> argparse.ArgumentParser: help="do not save sai modelspec metadata (minimum ss_metadata for LoRA is saved) / " + "sai modelspecのメタデータを保存しない(LoRAの最低限のss_metadataは保存される)", ) - + parser.add_argument( + "--concat", + action="store_true", + help="concat lora instead of merge (The dim(rank) of the output LoRA is the sum of the input dims) / " + + "マージの代わりに結合する(LoRAのdim(rank)は入力dimの合計になる)", + ) + parser.add_argument( + "--shuffle", + action="store_true", + help="shuffle lora weights / " + + "LoRAの重みをシャッフルする", + ) + return parser
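The `--concat` path above works because of a simple block-matrix identity: stacking the `lora_down` matrices along dim 0 and the `lora_up` matrices along dim 1 produces one LoRA whose weight delta is the sum of the individual deltas, and `--shuffle` permutes the shared rank axis without changing that product. A minimal sketch of both properties (the shapes and variable names are illustrative, not taken from the script):

```python
import torch

out_dim, in_dim, r1, r2 = 8, 16, 4, 2
down1, up1 = torch.randn(r1, in_dim), torch.randn(out_dim, r1)
down2, up2 = torch.randn(r2, in_dim), torch.randn(out_dim, r2)

# --concat: stack lora_down along dim 0 and lora_up along dim 1
down_cat = torch.cat([down1, down2], dim=0)  # (r1 + r2, in_dim)
up_cat = torch.cat([up1, up2], dim=1)        # (out_dim, r1 + r2)
assert torch.allclose(up_cat @ down_cat, up1 @ down1 + up2 @ down2, atol=1e-5)

# --shuffle: the same permutation applied to down's rows and up's
# columns cancels in the product, so the merged weight is unchanged
perm = torch.randperm(r1 + r2)
assert torch.allclose(up_cat[:, perm] @ down_cat[perm], up_cat @ down_cat, atol=1e-5)
```

Shuffling matters because, without it, the original LoRAs could simply be sliced back out of the concatenated weights; the documentation updated later in this series pairs `--concat` with `--shuffle` for exactly that reason.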
diff --git a/networks/sdxl_merge_lora.py b/networks/sdxl_merge_lora.py index 0608c01f9..c513eb59f 100644 --- a/networks/sdxl_merge_lora.py +++ b/networks/sdxl_merge_lora.py @@ -113,7 +113,7 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ module.weight = torch.nn.Parameter(weight) -def merge_lora_models(models, ratios, merge_dtype): +def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): base_alphas = {} # alpha for merged model base_dims = {} @@ -161,6 +161,13 @@ def merge_lora_models(models, ratios, merge_dtype): for key in tqdm(lora_sd.keys()): if "alpha" in key: continue + + if "lora_up" in key and concat: + concat_dim = 1 + elif "lora_down" in key and concat: + concat_dim = 0 + else: + concat_dim = None lora_module_name = key[: key.rfind(".lora_")] @@ -168,12 +175,16 @@ def merge_lora_models(models, ratios, merge_dtype): alpha = alphas[lora_module_name] scale = math.sqrt(alpha / base_alpha) * ratio - + scale = abs(scale) if "lora_up" in key else scale # マイナスの重みに対応する。 + if key in merged_sd: assert ( - merged_sd[key].size() == lora_sd[key].size() + merged_sd[key].size() == lora_sd[key].size() or concat_dim is not None ), f"weights shape mismatch merging v1 and v2, different dims? / 重みのサイズが合いません。v1とv2、または次元数の異なるモデルはマージできません" - merged_sd[key] = merged_sd[key] + lora_sd[key] * scale + if concat_dim is not None: + merged_sd[key] = torch.cat([merged_sd[key], lora_sd[key] * scale], dim=concat_dim) + else: + merged_sd[key] = merged_sd[key] + lora_sd[key] * scale else: merged_sd[key] = lora_sd[key] * scale @@ -181,6 +192,13 @@ def merge_lora_models(models, ratios, merge_dtype): for lora_module_name, alpha in base_alphas.items(): key = lora_module_name + ".alpha" merged_sd[key] = torch.tensor(alpha) + if shuffle: + key_down = lora_module_name + ".lora_down.weight" + key_up = lora_module_name + ".lora_up.weight" + dim = merged_sd[key_down].shape[0] + perm = torch.randperm(dim) + merged_sd[key_down] = merged_sd[key_down][perm] + merged_sd[key_up] = merged_sd[key_up][:,perm] print("merged model") print(f"dim: {list(set(base_dims.values()))}, alpha: {list(set(base_alphas.values()))}") @@ -252,7 +270,7 @@ def str_to_dtype(p): args.save_to, text_model1, text_model2, unet, 0, 0, ckpt_info, vae, logit_scale, sai_metadata, save_dtype ) else: - state_dict, metadata = merge_lora_models(args.models, args.ratios, merge_dtype) + state_dict, metadata = merge_lora_models(args.models, args.ratios, merge_dtype, args.concat, args.shuffle) print(f"calculating hashes and creating metadata...") @@ -307,6 +325,18 @@ def setup_parser() -> argparse.ArgumentParser: help="do not save sai modelspec metadata (minimum ss_metadata for LoRA is saved) / " + "sai modelspecのメタデータを保存しない(LoRAの最低限のss_metadataは保存される)", ) + parser.add_argument( + "--concat", + action="store_true", + help="concat lora instead of merge (The dim(rank) of the output LoRA is the sum of the input dims) / " + + "マージの代わりに結合する(LoRAのdim(rank)は入力dimの合計になる)", + ) + parser.add_argument( + "--shuffle", + action="store_true", + help="shuffle lora weights / " + + "LoRAの重みをシャッフルする", + ) return parser From 209eafb63163d081d12327a5120eb956971987e5 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Thu, 28 Sep 2023 14:02:25 +0300 Subject: [PATCH 23/31] IPEX attention optimizations --- library/ipex/__init__.py | 3 +-- library/ipex/attention.py | 33 +++++++++++++++++++-------------- library/ipex/diffusers.py | 9 +++++---- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/library/ipex/__init__.py b/library/ipex/__init__.py index 19ec8eea1..43accd9f3 100644 --- a/library/ipex/__init__.py +++ 
b/library/ipex/__init__.py @@ -144,7 +144,7 @@ def ipex_init(): # pylint: disable=too-many-statements ipex._C._DeviceProperties.minor = 2 #Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory] + torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory] torch._utils._get_available_device_type = lambda: "xpu" torch.has_cuda = True torch.cuda.has_half = True @@ -156,7 +156,6 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.get_device_properties.minor = 7 torch.cuda.ipc_collect = lambda *args, **kwargs: None torch.cuda.utilization = lambda *args, **kwargs: 0 - # getDeviceIdListForCard is renamed since https://github.com/intel/intel-extension-for-pytorch/commit/835b41fd5c8b6facf9efee8312f20699850ee592 if hasattr(torch.xpu, 'getDeviceIdListForCard'): torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard diff --git a/library/ipex/attention.py b/library/ipex/attention.py index e38689f21..84848b6a6 100644 --- a/library/ipex/attention.py +++ b/library/ipex/attention.py @@ -10,13 +10,15 @@ def torch_bmm(input, mat2, *, out=None): #ARC GPUs can't allocate more than 4GB to a single block, Slice it: batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2] - block_multiply = 2.4 if input.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply #MB + block_multiply = input.element_size() + slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + split_slice_size = batch_size_attention - if block_size >= 4000: + if block_size > 4: do_split = True #Find something divisible with the input_tokens - while ((split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply) > 4000: + while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: split_slice_size = 1 @@ -24,12 +26,12 @@ def torch_bmm(input, mat2, *, out=None): else: do_split = False - split_block_size = (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply #MB split_2_slice_size = input_tokens - if split_block_size >= 4000: + if split_slice_size * slice_block_size > 4: + slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply do_split_2 = True #Find something divisible with the input_tokens - while ((split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1 @@ -71,13 +73,16 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. 
else: shape_one, batch_size_attention, query_tokens, shape_four = query.shape no_shape_one = False - block_multiply = 3.6 if query.dtype == torch.float32 else 1.8 - block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB + + block_multiply = query.element_size() + slice_block_size = shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + split_slice_size = batch_size_attention - if block_size >= 4000: + if block_size > 4: do_split = True #Find something divisible with the shape_one - while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000: + while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: split_slice_size = 1 @@ -85,12 +90,12 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. else: do_split = False - split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB split_2_slice_size = query_tokens - if split_block_size >= 4000: + if split_slice_size * slice_block_size > 4: + slice_block_size2 = shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply do_split_2 = True #Find something divisible with the batch_size_attention - while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1 diff --git a/library/ipex/diffusers.py b/library/ipex/diffusers.py index 4c39896ed..005ee49f0 100644 --- a/library/ipex/diffusers.py +++ b/library/ipex/diffusers.py @@ -55,13 +55,14 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a ) #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * query_tokens * shape_three) / 1024 * block_multiply #MB + block_multiply = query.element_size() + slice_block_size = self.slice_size * shape_three / 1024 / 1024 * block_multiply + block_size = query_tokens * slice_block_size split_2_slice_size = query_tokens - if block_size >= 4000: + if block_size > 4: do_split_2 = True #Find something divisible with the query_tokens - while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1 From 6bd6cd9c516546b69365f7500793f674cda8c282 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Oct 2023 12:17:54 +0900 Subject: [PATCH 24/31] update doc --- docs/train_network_README-ja.md | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/docs/train_network_README-ja.md b/docs/train_network_README-ja.md index e620a8642..2205a7736 100644 --- a/docs/train_network_README-ja.md +++ b/docs/train_network_README-ja.md @@ -181,6 +181,8 @@ python networks\extract_lora_from_dylora.py --model "foldername/dylora-model.saf 詳細は[PR #355](https://github.com/kohya-ss/sd-scripts/pull/355) をご覧ください。 +SDXLは現在サポートしていません。 + フルモデルの25個のブロックの重みを指定できます。最初のブロックに該当するLoRAは存在しませんが、階層別LoRA適用等との互換性のために25個としています。またconv2d3x3に拡張しない場合も一部のブロックにはLoRAが存在しませんが、記述を統一するため常に25個の値を指定してください。 `--network_args` で以下の引数を指定してください。 @@ -246,6 +248,8 @@ network_args = [ 
"block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8, merge_lora.pyでStable DiffusionのモデルにLoRAの学習結果をマージしたり、複数のLoRAモデルをマージしたりできます。 +SDXL向けにはsdxl_merge_lora.pyを用意しています。オプション等は同一ですので、以下のmerge_lora.pyを読み替えてください。 + ### Stable DiffusionのモデルにLoRAのモデルをマージする マージ後のモデルは通常のStable Diffusionのckptと同様に扱えます。たとえば以下のようなコマンドラインになります。 @@ -276,29 +280,29 @@ python networks\merge_lora.py --sd_model ..\model\model.ckpt ### 複数のLoRAのモデルをマージする -__複数のLoRAをマージする場合は原則として `svd_merge_lora.py` を使用してください。__ 単純なup同士やdown同士のマージでは、計算結果が正しくなくなるためです。 - -`merge_lora.py` によるマージは差分抽出法でLoRAを生成する場合等、ごく限られた場合でのみ有効です。 +--concatオプションを指定すると、複数のLoRAを単純に結合して新しいLoRAモデルを作成できます。ファイルサイズ(およびdim/rank)は指定したLoRAの合計サイズになります(マージ時にdim (rank)を変更する場合は `svd_merge_lora.py` を使用してください)。 たとえば以下のようなコマンドラインになります。 ``` -python networks\merge_lora.py +python networks\merge_lora.py --save_precision bf16 --save_to ..\lora_train1\model-char1-style1-merged.safetensors - --models ..\lora_train1\last.safetensors ..\lora_train2\last.safetensors --ratios 0.6 0.4 + --models ..\lora_train1\last.safetensors ..\lora_train2\last.safetensors + --ratios 1.0 -1.0 --concat --shuffle ``` ---sd_modelオプションは指定不要です。 +--concatオプションを指定します。 + +また--shuffleオプションを追加し、重みをシャッフルします。シャッフルしないとマージ後のLoRAから元のLoRAを取り出せるため、コピー機学習などの場合には学習元データが明らかになります。ご注意ください。 --save_toオプションにマージ後のLoRAモデルの保存先を指定します(.ckptまたは.safetensors、拡張子で自動判定)。 --modelsに学習したLoRAのモデルファイルを指定します。三つ以上も指定可能です。 ---ratiosにそれぞれのモデルの比率(どのくらい重みを元モデルに反映するか)を0~1.0の数値で指定します。二つのモデルを一対一でマージす場合は、「0.5 0.5」になります。「1.0 1.0」では合計の重みが大きくなりすぎて、恐らく結果はあまり望ましくないものになると思われます。 +--ratiosにそれぞれのモデルの比率(どのくらい重みを元モデルに反映するか)を0~1.0の数値で指定します。二つのモデルを一対一でマージする場合は、「0.5 0.5」になります。「1.0 1.0」では合計の重みが大きくなりすぎて、恐らく結果はあまり望ましくないものになると思われます。 v1で学習したLoRAとv2で学習したLoRA、rank(次元数)の異なるLoRAはマージできません。U-NetだけのLoRAとU-Net+Text EncoderのLoRAはマージできるはずですが、結果は未知数です。 - ### その他のオプション * precision @@ -306,6 +310,7 @@ v1で学習したLoRAとv2で学習したLoRA、rank(次元数)の異なるL * save_precision * モデル保存時の精度をfloat、fp16、bf16から指定できます。省略時はprecisionと同じ精度になります。 +他にもいくつかのオプションがありますので、--helpで確認してください。 ## 複数のrankが異なるLoRAのモデルをマージする From 81419f7f320930a0e3deb330f374fca6dffc671f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Oct 2023 16:37:23 +0900 Subject: [PATCH 25/31] Fix to work training U-Net only LoRA for SD1/2 --- train_network.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 0e2e0fa9f..6d98037ca 100644 --- a/train_network.py +++ b/train_network.py @@ -426,7 +426,10 @@ def train(self, args): t_enc.train() # set top parameter requires_grad = True for gradient checkpointing works - t_enc.text_model.embeddings.requires_grad_(True) + if train_text_encoder: + t_enc.text_model.embeddings.requires_grad_(True) + else: + unet.parameters().__next__().requires_grad_(True) else: unet.eval() for t_enc in text_encoders: From 4cc919607a55efa038cc1f4866ee09dcca39e6b5 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Oct 2023 16:41:48 +0900 Subject: [PATCH 26/31] fix placing of requires_grad_ of U-Net --- train_network.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/train_network.py b/train_network.py index 6d98037ca..1a1713259 100644 --- a/train_network.py +++ b/train_network.py @@ -12,10 +12,13 @@ from tqdm import tqdm import torch + try: import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): from library.ipex import ipex_init + ipex_init() except Exception: pass @@ -428,8 +431,10 @@ def train(self, args): # set top parameter requires_grad = True for gradient checkpointing works if train_text_encoder: 
t_enc.text_model.embeddings.requires_grad_(True) - else: - unet.parameters().__next__().requires_grad_(True) + + # set top parameter requires_grad = True for gradient checkpointing works + if not train_text_encoder: # train U-Net only + unet.parameters().__next__().requires_grad_(True) else: unet.eval() for t_enc in text_encoders: From c918489259336eb36b21e88be76b0185e8304000 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Oct 2023 20:34:12 +0900 Subject: [PATCH 27/31] update readme --- README-ja.md | 83 ++++----- README.md | 486 +++++++++++++-------------------------------------- 2 files changed, 155 insertions(+), 414 deletions(-) diff --git a/README-ja.md b/README-ja.md index 865e0d35a..29c33a659 100644 --- a/README-ja.md +++ b/README-ja.md @@ -1,3 +1,7 @@ +SDXLがサポートされました。sdxlブランチはmainブランチにマージされました。リポジトリを更新したときにはUpgradeの手順を実行してください。また accelerate のバージョンが上がっていますので、accelerate config を再度実行してください。 + +SDXL学習については[こちら](./README.md#sdxl-training)をご覧ください(英語です)。 + ## リポジトリについて Stable Diffusionの学習、画像生成、その他のスクリプトを入れたリポジトリです。 @@ -9,13 +13,12 @@ GUIやPowerShellスクリプトなど、より使いやすくする機能が[bma * DreamBooth、U-NetおよびText Encoderの学習をサポート * fine-tuning、同上 +* LoRAの学習をサポート * 画像生成 * モデル変換(Stable Diffision ckpt/safetensorsとDiffusersの相互変換) ## 使用法について -当リポジトリ内およびnote.comに記事がありますのでそちらをご覧ください(将来的にはすべてこちらへ移すかもしれません)。 - * [学習について、共通編](./docs/train_README-ja.md) : データ整備やオプションなど * [データセット設定](./docs/config_README-ja.md) * [DreamBoothの学習について](./docs/train_db_README-ja.md) @@ -41,11 +44,13 @@ PowerShellを使う場合、venvを使えるようにするためには以下の ## Windows環境でのインストール -以下の例ではPyTorchは1.12.1/CUDA 11.6版をインストールします。CUDA 11.3版やPyTorch 1.13を使う場合は適宜書き換えください。 +スクリプトはPyTorch 2.0.1でテストしています。PyTorch 1.12.1でも動作すると思われます。 + +以下の例ではPyTorchは2.0.1/CUDA 11.8版をインストールします。CUDA 11.6版やPyTorch 1.12.1を使う場合は適宜書き換えください。 (なお、python -m venv~の行で「python」とだけ表示された場合、py -m venv~のようにpythonをpyに変更してください。) -通常の(管理者ではない)PowerShellを開き以下を順に実行します。 +PowerShellを使う場合、通常の(管理者ではない)PowerShellを開き以下を順に実行します。 ```powershell git clone https://github.com/kohya-ss/sd-scripts.git @@ -54,43 +59,14 @@ cd sd-scripts python -m venv venv .\venv\Scripts\activate -pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 +pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 --index-url https://download.pytorch.org/whl/cu118 pip install --upgrade -r requirements.txt -pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl - -cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ -cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py -cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py +pip install xformers==0.0.20 accelerate config ``` - - -コマンドプロンプトでは以下になります。 - - -```bat -git clone https://github.com/kohya-ss/sd-scripts.git -cd sd-scripts - -python -m venv venv -.\venv\Scripts\activate - -pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 -pip install --upgrade -r requirements.txt -pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl - -copy /y .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ -copy /y .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py -copy /y .\bitsandbytes_windows\main.py 
.\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py - -accelerate config -``` +コマンドプロンプトでも同一です。 (注:``python -m venv venv`` のほうが ``python -m venv --system-site-packages venv`` より安全そうなため書き換えました。globalなpythonにパッケージがインストールしてあると、後者だといろいろと問題が起きます。) @@ -111,29 +87,40 @@ accelerate configの質問には以下のように答えてください。(bf1 ※場合によって ``ValueError: fp16 mixed precision requires a GPU`` というエラーが出ることがあるようです。この場合、6番目の質問( ``What GPU(s) (by id) should be used for training on this machine as a comma-separated list? [all]:``)に「0」と答えてください。(id `0`のGPUが使われます。) -### PyTorchとxformersのバージョンについて +### オプション:`bitsandbytes`(8bit optimizer)を使う -他のバージョンでは学習がうまくいかない場合があるようです。特に他の理由がなければ指定のバージョンをお使いください。 +`bitsandbytes`はオプションになりました。Linuxでは通常通りpipでインストールできます(0.41.1または以降のバージョンを推奨)。 -### オプション:Lion8bitを使う +Windowsでは0.35.0または0.41.1を推奨します。 -Lion8bitを使う場合には`bitsandbytes`を0.38.0以降にアップグレードする必要があります。`bitsandbytes`をアンインストールし、Windows環境では例えば[こちら](https://github.com/jllllll/bitsandbytes-windows-webui)などからWindows版のwhlファイルをインストールしてください。たとえば以下のような手順になります。 +- `bitsandbytes` 0.35.0: 安定しているとみられるバージョンです。AdamW8bitは使用できますが、他のいくつかの8bit optimizer、学習時の`full_bf16`オプションは使用できません。 +- `bitsandbytes` 0.41.1: Lion8bit、PagedAdamW8bit、PagedLion8bitをサポートします。`full_bf16`が使用できます。 -```powershell -pip install https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl -``` +注:`bitsandbytes` 0.35.0から0.41.0までのバージョンには問題があるようです。 https://github.com/TimDettmers/bitsandbytes/issues/659 -アップグレード時には`pip install .`でこのリポジトリを更新し、必要に応じて他のパッケージもアップグレードしてください。 +以下の手順に従い、`bitsandbytes`をインストールしてください。 -### オプション:PagedAdamW8bitとPagedLion8bitを使う +### 0.35.0を使う場合 -PagedAdamW8bitとPagedLion8bitを使う場合には`bitsandbytes`を0.39.0以降にアップグレードする必要があります。`bitsandbytes`をアンインストールし、Windows環境では例えば[こちら](https://github.com/jllllll/bitsandbytes-windows-webui)などからWindows版のwhlファイルをインストールしてください。たとえば以下のような手順になります。 +PowerShellの例です。コマンドプロンプトではcpの代わりにcopyを使ってください。 ```powershell -pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.39.1-py3-none-win_amd64.whl +cd sd-scripts +.\venv\Scripts\activate +pip install bitsandbytes==0.35.0 + +cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ +cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py +cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py ``` -アップグレード時には`pip install .`でこのリポジトリを更新し、必要に応じて他のパッケージもアップグレードしてください。 +### 0.41.1を使う場合 + +jllllll氏の配布されている[こちら](https://github.com/jllllll/bitsandbytes-windows-webui) または他の場所から、Windows用のwhlファイルをインストールしてください。 + +```powershell +python -m pip install bitsandbytes==0.41.1 --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui +``` ## アップグレード diff --git a/README.md b/README.md index 0879190cc..dc8e25ad6 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ +__SDXL is now supported. The sdxl branch has been merged into the main branch. If you update the repository, please follow the upgrade instructions. Also, the version of accelerate has been updated, so please run accelerate config again.__ The documentation for SDXL training is [here](./README.md#sdxl-training). + This repository contains training, generation and utility scripts for Stable Diffusion. -[__Change History__](#change-history) is moved to the bottom of the page. +[__Change History__](#change-history) is moved to the bottom of the page. 
更新履歴は[ページ末尾](#change-history)に移しました。 -[日本語版README](./README-ja.md) +[日本語版READMEはこちら](./README-ja.md) For easier use (GUI and PowerShell scripts etc...), please visit [the repository maintained by bmaltais](https://github.com/bmaltais/kohya_ss). Thanks to @bmaltais! @@ -16,142 +18,13 @@ This repository contains the scripts for: * Image generation * Model conversion (supports 1.x and 2.x, Stable Diffision ckpt/safetensors and Diffusers) -__Stable Diffusion web UI now seems to support LoRA trained by ``sd-scripts``.__ Thank you for great work!!! - -## About SDXL training - -The feature of SDXL training is now available in sdxl branch as an experimental feature. - -Sep 24, 2023: The feature will be merged into the main branch very soon. Following are the changes from the previous version. - -- `accelerate` is updated to 0.23.0, and `diffusers` is updated to 0.21.2. Dependency for `invisible-watermark` is removed. Please update them with the upgrade instructions below. -- Intel ARC support with IPEX is added. [#825](https://github.com/kohya-ss/sd-scripts/pull/825) -- Other changes and fixes. -- Thanks for contributions from Disty0, sdbds, jvkap, rockerBOO, Symbiomatrix and others! - -Sep 3, 2023: - -- ControlNet-LLLite is added. See [documentation](./docs/train_lllite_README.md) for details. -- JPEG XL is supported. [#786](https://github.com/kohya-ss/sd-scripts/pull/786) -- Peak memory usage is reduced. [#791](https://github.com/kohya-ss/sd-scripts/pull/791) -- Input perturbation noise is added. See [#798](https://github.com/kohya-ss/sd-scripts/pull/798) for details. -- Dataset subset now has `caption_prefix` and `caption_suffix` options. The strings are added to the beginning and the end of the captions before shuffling. You can specify the options in `.toml`. -- Other minor changes. -- Thanks for contributions from Isotr0py, vvern999, lansing and others! - -Aug 13, 2023: - -- LoRA-FA is added experimentally. Specify `--network_module networks.lora_fa` option instead of `--network_module networks.lora`. The trained model can be used as a normal LoRA model. - -Aug 12, 2023: - -- The default value of noise offset when omitted has been changed to 0 from 0.0357. -- The different learning rates for each U-Net block are now supported. Specify with `--block_lr` option. Specify 23 values separated by commas like `--block_lr 1e-3,1e-3 ... 1e-3`. - - 23 values correspond to `0: time/label embed, 1-9: input blocks 0-8, 10-12: mid blocks 0-2, 13-21: output blocks 0-8, 22: out`. - -Aug 6, 2023: - -- [SAI Model Spec](https://github.com/Stability-AI/ModelSpec) metadata is now supported partially. `hash_sha256` is not supported yet. - - The main items are set automatically. - - You can set title, author, description, license and tags with `--metadata_xxx` options in each training script. - - Merging scripts also support minimum SAI Model Spec metadata. See the help message for the usage. - - Metadata editor will be available soon. -- SDXL LoRA has `sdxl_base_v1-0` now for `ss_base_model_version` metadata item, instead of `v0-9`. - -Aug 4, 2023: - -- `bitsandbytes` is now optional. Please install it if you want to use it. The insructions are in the later section. -- `albumentations` is not required anymore. -- An issue for pooled output for Textual Inversion training is fixed. -- `--v_pred_like_loss ratio` option is added. This option adds the loss like v-prediction loss in SDXL training. `0.1` means that the loss is added 10% of the v-prediction loss. The default value is None (disabled). 
- - In v-prediction, the loss is higher in the early timesteps (near the noise). This option can be used to increase the loss in the early timesteps. -- Arbitrary options can be used for Diffusers' schedulers. For example `--lr_scheduler_args "lr_end=1e-8"`. -- `sdxl_gen_imgs.py` supports batch size > 1. -- Fix ControlNet to work with attention couple and reginal LoRA in `gen_img_diffusers.py`. - -Summary of the feature: - -- `tools/cache_latents.py` is added. This script can be used to cache the latents to disk in advance. - - The options are almost the same as `sdxl_train.py'. See the help message for the usage. - - Please launch the script as follows: - `accelerate launch --num_cpu_threads_per_process 1 tools/cache_latents.py ...` - - This script should work with multi-GPU, but it is not tested in my environment. - -- `tools/cache_text_encoder_outputs.py` is added. This script can be used to cache the text encoder outputs to disk in advance. - - The options are almost the same as `cache_latents.py' and `sdxl_train.py'. See the help message for the usage. - -- `sdxl_train.py` is a script for SDXL fine-tuning. The usage is almost the same as `fine_tune.py`, but it also supports DreamBooth dataset. - - `--full_bf16` option is added. Thanks to KohakuBlueleaf! - - This option enables the full bfloat16 training (includes gradients). This option is useful to reduce the GPU memory usage. - - However, bitsandbytes==0.35 doesn't seem to support this. Please use a newer version of bitsandbytes or another optimizer. - - I cannot find bitsandbytes>0.35.0 that works correctly on Windows. - - In addition, the full bfloat16 training might be unstable. Please use it at your own risk. -- `prepare_buckets_latents.py` now supports SDXL fine-tuning. -- `sdxl_train_network.py` is a script for LoRA training for SDXL. The usage is almost the same as `train_network.py`. -- Both scripts has following additional options: - - `--cache_text_encoder_outputs` and `--cache_text_encoder_outputs_to_disk`: Cache the outputs of the text encoders. This option is useful to reduce the GPU memory usage. This option cannot be used with options for shuffling or dropping the captions. - - `--no_half_vae`: Disable the half-precision (mixed-precision) VAE. VAE for SDXL seems to produce NaNs in some cases. This option is useful to avoid the NaNs. -- The image generation during training is now available. `--no_half_vae` option also works to avoid black images. - -- `--weighted_captions` option is not supported yet for both scripts. -- `--min_timestep` and `--max_timestep` options are added to each training script. These options can be used to train U-Net with different timesteps. The default values are 0 and 1000. - -- `sdxl_train_textual_inversion.py` is a script for Textual Inversion training for SDXL. The usage is almost the same as `train_textual_inversion.py`. - - `--cache_text_encoder_outputs` is not supported. - - `token_string` must be alphabet only currently, due to the limitation of the open-clip tokenizer. - - There are two options for captions: - 1. Training with captions. All captions must include the token string. The token string is replaced with multiple tokens. - 2. Use `--use_object_template` or `--use_style_template` option. The captions are generated from the template. The existing captions are ignored. - - See below for the format of the embeddings. - -- `sdxl_gen_img.py` is added. This script can be used to generate images with SDXL, including LoRA. See the help message for the usage. 
- - Textual Inversion is supported, but the name for the embeds in the caption becomes alphabet only. For example, `neg_hand_v1.safetensors` can be activated with `neghandv`. - -`requirements.txt` is updated to support SDXL training. - -### Tips for SDXL training - -- The default resolution of SDXL is 1024x1024. -- The fine-tuning can be done with 24GB GPU memory with the batch size of 1. For 24GB GPU, the following options are recommended __for the fine-tuning with 24GB GPU memory__: - - Train U-Net only. - - Use gradient checkpointing. - - Use `--cache_text_encoder_outputs` option and caching latents. - - Use Adafactor optimizer. RMSprop 8bit or Adagrad 8bit may work. AdamW 8bit doesn't seem to work. -- The LoRA training can be done with 8GB GPU memory (10GB recommended). For reducing the GPU memory usage, the following options are recommended: - - Train U-Net only. - - Use gradient checkpointing. - - Use `--cache_text_encoder_outputs` option and caching latents. - - Use one of 8bit optimizers or Adafactor optimizer. - - Use lower dim (-8 for 8GB GPU). -- `--network_train_unet_only` option is highly recommended for SDXL LoRA. Because SDXL has two text encoders, the result of the training will be unexpected. -- PyTorch 2 seems to use slightly less GPU memory than PyTorch 1. -- `--bucket_reso_steps` can be set to 32 instead of the default value 64. Smaller values than 32 will not work for SDXL training. - -Example of the optimizer settings for Adafactor with the fixed learning rate: -```toml -optimizer_type = "adafactor" -optimizer_args = [ "scale_parameter=False", "relative_step=False", "warmup_init=False" ] -lr_scheduler = "constant_with_warmup" -lr_warmup_steps = 100 -learning_rate = 4e-7 # SDXL original learning rate -``` - -### Format of Textual Inversion embeddings - -```python -from safetensors.torch import save_file - -state_dict = {"clip_g": embs_for_text_encoder_1280, "clip_l": embs_for_text_encoder_768} -save_file(state_dict, file) -``` - ## About requirements.txt These files do not contain requirements for PyTorch. Because the versions of them depend on your environment. Please install PyTorch at first (see installation guide below.) -The scripts are tested with PyTorch 1.12.1 and 2.0.1, Diffusers 0.18.2. +The scripts are tested with Pytorch 2.0.1. 1.12.1 is not tested but should work. -## Links to how-to-use documents +## Links to usage documentation Most of the documents are written in Japanese. @@ -191,9 +64,9 @@ cd sd-scripts python -m venv venv .\venv\Scripts\activate -pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 +pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 --index-url https://download.pytorch.org/whl/cu118 pip install --upgrade -r requirements.txt -pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl +pip install xformers==0.0.20 accelerate config ``` @@ -222,31 +95,6 @@ note: Some user reports ``ValueError: fp16 mixed precision requires a GPU`` is o (Single GPU with id `0` will be used.) -### Experimental: Use PyTorch 2.0 - -In this case, you need to install PyTorch 2.0 and xformers 0.0.20. 
Instead of the above, please type the following:
-
-```powershell
-git clone https://github.com/kohya-ss/sd-scripts.git
-cd sd-scripts
-
-python -m venv venv
-.\venv\Scripts\activate
-
-pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 --index-url https://download.pytorch.org/whl/cu118
-pip install --upgrade -r requirements.txt
-pip install xformers==0.0.20
-
-accelerate config
-```
-
-Answers to accelerate config should be the same as above.
-
-### about PyTorch and xformers
-
-Other versions of PyTorch and xformers seem to have problems with training.
-If there is no other reason, please install the specified version.
-
 ### Optional: Use `bitsandbytes` (8bit optimizer)
 
 For 8bit optimizer, you need to install `bitsandbytes`. For Linux, please install `bitsandbytes` as usual (0.41.1 or later is recommended.)
 
@@ -313,214 +161,120 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
 
 [BLIP](https://github.com/salesforce/BLIP): BSD-3-Clause
 
+
+## SDXL training
+
+The documentation in this section will be moved to a separate document later.
+
+### Training scripts for SDXL
+
+- `sdxl_train.py` is a script for SDXL fine-tuning. The usage is almost the same as `fine_tune.py`, but it also supports DreamBooth datasets.
+  - `--full_bf16` option is added. Thanks to KohakuBlueleaf!
+    - This option enables the full bfloat16 training (includes gradients). This option is useful to reduce the GPU memory usage.
+    - The full bfloat16 training might be unstable. Please use it at your own risk.
+  - The different learning rates for each U-Net block are now supported in sdxl_train.py. Specify with `--block_lr` option. Specify 23 values separated by commas like `--block_lr 1e-3,1e-3 ... 1e-3`.
+    - 23 values correspond to `0: time/label embed, 1-9: input blocks 0-8, 10-12: mid blocks 0-2, 13-21: output blocks 0-8, 22: out`.
+- `prepare_buckets_latents.py` now supports SDXL fine-tuning.
+
+- `sdxl_train_network.py` is a script for LoRA training for SDXL. The usage is almost the same as `train_network.py`.
+
+- Both scripts have the following additional options:
+  - `--cache_text_encoder_outputs` and `--cache_text_encoder_outputs_to_disk`: Cache the outputs of the text encoders. This option is useful to reduce the GPU memory usage. This option cannot be used with options for shuffling or dropping the captions.
+  - `--no_half_vae`: Disable the half-precision (mixed-precision) VAE. VAE for SDXL seems to produce NaNs in some cases. This option is useful to avoid the NaNs.
+
+- `--weighted_captions` option is not supported yet for both scripts.
+
+- `sdxl_train_textual_inversion.py` is a script for Textual Inversion training for SDXL. The usage is almost the same as `train_textual_inversion.py`.
+  - `--cache_text_encoder_outputs` is not supported.
+  - There are two options for captions:
+    1. Training with captions. All captions must include the token string. The token string is replaced with multiple tokens.
+    2. Use `--use_object_template` or `--use_style_template` option. The captions are generated from the template. The existing captions are ignored.
+  - See below for the format of the embeddings.
+
+- `--min_timestep` and `--max_timestep` options are added to each training script. These options can be used to train U-Net with different timesteps. The default values are 0 and 1000.
+
+### Utility scripts for SDXL
+
+- `tools/cache_latents.py` is added. This script can be used to cache the latents to disk in advance.
+  - The options are almost the same as `sdxl_train.py`. 
See the help message for the usage. + - Please launch the script as follows: + `accelerate launch --num_cpu_threads_per_process 1 tools/cache_latents.py ...` + - This script should work with multi-GPU, but it is not tested in my environment. + +- `tools/cache_text_encoder_outputs.py` is added. This script can be used to cache the text encoder outputs to disk in advance. + - The options are almost the same as `cache_latents.py` and `sdxl_train.py`. See the help message for the usage. + +- `sdxl_gen_img.py` is added. This script can be used to generate images with SDXL, including LoRA, Textual Inversion and ControlNet-LLLite. See the help message for the usage. + +### Tips for SDXL training + +- The default resolution of SDXL is 1024x1024. +- The fine-tuning can be done with 24GB GPU memory with the batch size of 1. For 24GB GPU, the following options are recommended __for the fine-tuning with 24GB GPU memory__: + - Train U-Net only. + - Use gradient checkpointing. + - Use `--cache_text_encoder_outputs` option and caching latents. + - Use Adafactor optimizer. RMSprop 8bit or Adagrad 8bit may work. AdamW 8bit doesn't seem to work. +- The LoRA training can be done with 8GB GPU memory (10GB recommended). For reducing the GPU memory usage, the following options are recommended: + - Train U-Net only. + - Use gradient checkpointing. + - Use `--cache_text_encoder_outputs` option and caching latents. + - Use one of 8bit optimizers or Adafactor optimizer. + - Use lower dim (4 to 8 for 8GB GPU). +- `--network_train_unet_only` option is highly recommended for SDXL LoRA. Because SDXL has two text encoders, the result of the training will be unexpected. +- PyTorch 2 seems to use slightly less GPU memory than PyTorch 1. +- `--bucket_reso_steps` can be set to 32 instead of the default value 64. Smaller values than 32 will not work for SDXL training. + +Example of the optimizer settings for Adafactor with the fixed learning rate: +```toml +optimizer_type = "adafactor" +optimizer_args = [ "scale_parameter=False", "relative_step=False", "warmup_init=False" ] +lr_scheduler = "constant_with_warmup" +lr_warmup_steps = 100 +learning_rate = 4e-7 # SDXL original learning rate +``` + +### Format of Textual Inversion embeddings for SDXL + +```python +from safetensors.torch import save_file + +state_dict = {"clip_g": embs_for_text_encoder_1280, "clip_l": embs_for_text_encoder_768} +save_file(state_dict, file) +``` + +### ControlNet-LLLite + +ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [documentation](./docs/train_lllite_README.md) for details. + + ## Change History -### 15 Jun. 2023, 2023/06/15 - -- Prodigy optimizer is supported in each training script. It is a member of D-Adaptation and is effective for DyLoRA training. [PR #585](https://github.com/kohya-ss/sd-scripts/pull/585) Please see the PR for details. Thanks to sdbds! - - Install the package with `pip install prodigyopt`. Then specify the option like `--optimizer_type="prodigy"`. -- Arbitrary Dataset is supported in each training script (except XTI). You can use it by defining a Dataset class that returns images and captions. - - Prepare a Python script and define a class that inherits `train_util.MinimalDataset`. Then specify the option like `--dataset_class package.module.DatasetClass` in each training script. - - Please refer to `MinimalDataset` for implementation. I will prepare a sample later. -- The following features have been added to the generation script. 
- - Added an option `--highres_fix_disable_control_net` to disable ControlNet in the 2nd stage of Highres. Fix. Please try it if the image is disturbed by some ControlNet such as Canny. - - Added Variants similar to sd-dynamic-propmpts in the prompt. - - If you specify `{spring|summer|autumn|winter}`, one of them will be randomly selected. - - If you specify `{2$$chocolate|vanilla|strawberry}`, two of them will be randomly selected. - - If you specify `{1-2$$ and $$chocolate|vanilla|strawberry}`, one or two of them will be randomly selected and connected by ` and `. - - You can specify the number of candidates in the range `0-2`. You cannot omit one side like `-2` or `1-`. - - It can also be specified for the prompt option. - - If you specify `e` or `E`, all candidates will be selected and the prompt will be repeated multiple times (`--images_per_prompt` is ignored). It may be useful for creating X/Y plots. - - You can also specify `--am {e$$0.2|0.4|0.6|0.8|1.0},{e$$0.4|0.7|1.0} --d 1234`. In this case, 15 prompts will be generated with 5*3. - - There is no weighting function. - -- 各学習スクリプトでProdigyオプティマイザがサポートされました。D-Adaptationの仲間でDyLoRAの学習に有効とのことです。 [PR #585](https://github.com/kohya-ss/sd-scripts/pull/585) 詳細はPRをご覧ください。sdbds氏に感謝します。 - - `pip install prodigyopt` としてパッケージをインストールしてください。また `--optimizer_type="prodigy"` のようにオプションを指定します。 -- 各学習スクリプトで任意のDatasetをサポートしました(XTIを除く)。画像とキャプションを返すDatasetクラスを定義することで、学習スクリプトから利用できます。 - - Pythonスクリプトを用意し、`train_util.MinimalDataset`を継承するクラスを定義してください。そして各学習スクリプトのオプションで `--dataset_class package.module.DatasetClass` のように指定してください。 - - 実装方法は `MinimalDataset` を参考にしてください。のちほどサンプルを用意します。 -- 生成スクリプトに以下の機能追加を行いました。 - - Highres. Fixの2nd stageでControlNetを無効化するオプション `--highres_fix_disable_control_net` を追加しました。Canny等一部のControlNetで画像が乱れる場合にお試しください。 - - プロンプトでsd-dynamic-propmptsに似たVariantをサポートしました。 - - `{spring|summer|autumn|winter}` のように指定すると、いずれかがランダムに選択されます。 - - `{2$$chocolate|vanilla|strawberry}` のように指定すると、いずれか2個がランダムに選択されます。 - - `{1-2$$ and $$chocolate|vanilla|strawberry}` のように指定すると、1個か2個がランダムに選択され ` and ` で接続されます。 - - 個数のレンジ指定では`0-2`のように0個も指定可能です。`-2`や`1-`のような片側の省略はできません。 - - プロンプトオプションに対しても指定可能です。 - - `{e$$chocolate|vanilla|strawberry}` のように`e`または`E`を指定すると、すべての候補が選択されプロンプトが複数回繰り返されます(`--images_per_prompt`は無視されます)。X/Y plotの作成に便利かもしれません。 - - `--am {e$$0.2|0.4|0.6|0.8|1.0},{e$$0.4|0.7|1.0} --d 1234`のような指定も可能です。この場合、5*3で15回のプロンプトが生成されます。 - - Weightingの機能はありません。 - -### 8 Jun. 2023, 2023/06/08 - -- Fixed a bug where clip skip did not work when training with weighted captions (`--weighted_captions` specified) and when generating sample images during training. -- 重みづけキャプションでの学習時(`--weighted_captions`指定時)および学習中のサンプル画像生成時にclip skipが機能しない不具合を修正しました。 - -### 6 Jun. 2023, 2023/06/06 - -- Fix `train_network.py` to probably work with older versions of LyCORIS. -- `gen_img_diffusers.py` now supports `BREAK` syntax. -- `train_network.py`がLyCORISの以前のバージョンでも恐らく動作するよう修正しました。 -- `gen_img_diffusers.py` で `BREAK` 構文をサポートしました。 - -### 3 Jun. 2023, 2023/06/03 - -- Max Norm Regularization is now available in `train_network.py`. [PR #545](https://github.com/kohya-ss/sd-scripts/pull/545) Thanks to AI-Casanova! - - Max Norm Regularization is a technique to stabilize network training by limiting the norm of network weights. It may be effective in suppressing overfitting of LoRA and improving stability when used with other LoRAs. See PR for details. - - Specify as `--scale_weight_norms=1.0`. It seems good to try from `1.0`. 
- - The networks other than LoRA in this repository (such as LyCORIS) do not support this option. - -- Three types of dropout have been added to `train_network.py` and LoRA network. - - Dropout is a technique to suppress overfitting and improve network performance by randomly setting some of the network outputs to 0. - - `--network_dropout` is a normal dropout at the neuron level. In the case of LoRA, it is applied to the output of down. Proposed in [PR #545](https://github.com/kohya-ss/sd-scripts/pull/545) Thanks to AI-Casanova! - - `--network_dropout=0.1` specifies the dropout probability to `0.1`. - - Note that the specification method is different from LyCORIS. - - For LoRA network, `--network_args` can specify `rank_dropout` to dropout each rank with specified probability. Also `module_dropout` can be specified to dropout each module with specified probability. - - Specify as `--network_args "rank_dropout=0.2" "module_dropout=0.1"`. - - `--network_dropout`, `rank_dropout`, and `module_dropout` can be specified at the same time. - - Values of 0.1 to 0.3 may be good to try. Values greater than 0.5 should not be specified. - - `rank_dropout` and `module_dropout` are original techniques of this repository. Their effectiveness has not been verified yet. - - The networks other than LoRA in this repository (such as LyCORIS) do not support these options. - -- Added an option `--scale_v_pred_loss_like_noise_pred` to scale v-prediction loss like noise prediction in each training script. - - By scaling the loss according to the time step, the weights of global noise prediction and local noise prediction become the same, and the improvement of details may be expected. - - See [this article](https://xrg.hatenablog.com/entry/2023/06/02/202418) by xrg for details (written in Japanese). Thanks to xrg for the great suggestion! - -- Max Norm Regularizationが`train_network.py`で使えるようになりました。[PR #545](https://github.com/kohya-ss/sd-scripts/pull/545) AI-Casanova氏に感謝します。 - - Max Norm Regularizationは、ネットワークの重みのノルムを制限することで、ネットワークの学習を安定させる手法です。LoRAの過学習の抑制、他のLoRAと併用した時の安定性の向上が期待できるかもしれません。詳細はPRを参照してください。 - - `--scale_weight_norms=1.0`のように `--scale_weight_norms` で指定してください。`1.0`から試すと良いようです。 - - LyCORIS等、当リポジトリ以外のネットワークは現時点では未対応です。 - -- `train_network.py` およびLoRAに計三種類のdropoutを追加しました。 - - dropoutはネットワークの一部の出力をランダムに0にすることで、過学習の抑制、ネットワークの性能向上等を図る手法です。 - - `--network_dropout` はニューロン単位の通常のdropoutです。LoRAの場合、downの出力に対して適用されます。[PR #545](https://github.com/kohya-ss/sd-scripts/pull/545) で提案されました。AI-Casanova氏に感謝します。 - - `--network_dropout=0.1` などとすることで、dropoutの確率を指定できます。 - - LyCORISとは指定方法が異なりますのでご注意ください。 - - LoRAの場合、`--network_args`に`rank_dropout`を指定することで各rankを指定確率でdropoutします。また同じくLoRAの場合、`--network_args`に`module_dropout`を指定することで各モジュールを指定確率でdropoutします。 - - `--network_args "rank_dropout=0.2" "module_dropout=0.1"` のように指定します。 - - `--network_dropout`、`rank_dropout` 、 `module_dropout` は同時に指定できます。 - - それぞれの値は0.1~0.3程度から試してみると良いかもしれません。0.5を超える値は指定しない方が良いでしょう。 - - `rank_dropout`および`module_dropout`は当リポジトリ独自の手法です。有効性の検証はまだ行っていません。 - - これらのdropoutはLyCORIS等、当リポジトリ以外のネットワークは現時点では未対応です。 - -- 各学習スクリプトにv-prediction lossをnoise predictionと同様の値にスケールするオプション`--scale_v_pred_loss_like_noise_pred`を追加しました。 - - タイムステップに応じてlossをスケールすることで、 大域的なノイズの予測と局所的なノイズの予測の重みが同じになり、ディテールの改善が期待できるかもしれません。 - - 詳細はxrg氏のこちらの記事をご参照ください:[noise_predictionモデルとv_predictionモデルの損失 - 勾配降下党青年局](https://xrg.hatenablog.com/entry/2023/06/02/202418) xrg氏の素晴らしい記事に感謝します。 - -### 31 May 2023, 2023/05/31 - -- Show warning when image caption file does not exist during training. 
[PR #533](https://github.com/kohya-ss/sd-scripts/pull/533) Thanks to TingTingin! - - Warning is also displayed when using class+identifier dataset. Please ignore if it is intended. -- `train_network.py` now supports merging network weights before training. [PR #542](https://github.com/kohya-ss/sd-scripts/pull/542) Thanks to u-haru! - - `--base_weights` option specifies LoRA or other model files (multiple files are allowed) to merge. - - `--base_weights_multiplier` option specifies multiplier of the weights to merge (multiple values are allowed). If omitted or less than `base_weights`, 1.0 is used. - - This is useful for incremental learning. See PR for details. -- Show warning and continue training when uploading to HuggingFace fails. - -- 学習時に画像のキャプションファイルが存在しない場合、警告が表示されるようになりました。 [PR #533](https://github.com/kohya-ss/sd-scripts/pull/533) TingTingin氏に感謝します。 - - class+identifier方式のデータセットを利用している場合も警告が表示されます。意図している通りの場合は無視してください。 -- `train_network.py` に学習前にモデルにnetworkの重みをマージする機能が追加されました。 [PR #542](https://github.com/kohya-ss/sd-scripts/pull/542) u-haru氏に感謝します。 - - `--base_weights` オプションでLoRA等のモデルファイル(複数可)を指定すると、それらの重みをマージします。 - - `--base_weights_multiplier` オプションでマージする重みの倍率(複数可)を指定できます。省略時または`base_weights`よりも数が少ない場合は1.0になります。 - - 差分追加学習などにご利用ください。詳細はPRをご覧ください。 -- HuggingFaceへのアップロードに失敗した場合、警告を表示しそのまま学習を続行するよう変更しました。 - -### 25 May 2023, 2023/05/25 - -- [D-Adaptation v3.0](https://github.com/facebookresearch/dadaptation) is now supported. [PR #530](https://github.com/kohya-ss/sd-scripts/pull/530) Thanks to sdbds! - - `--optimizer_type` now accepts `DAdaptAdamPreprint`, `DAdaptAdanIP`, and `DAdaptLion`. - - `DAdaptAdam` is now new. The old `DAdaptAdam` is available with `DAdaptAdamPreprint`. - - Simply specifying `DAdaptation` will use `DAdaptAdamPreprint` (same behavior as before). - - You need to install D-Adaptation v3.0. After activating venv, please do `pip install -U dadaptation`. - - See PR and D-Adaptation documentation for details. -- [D-Adaptation v3.0](https://github.com/facebookresearch/dadaptation)がサポートされました。 [PR #530](https://github.com/kohya-ss/sd-scripts/pull/530) sdbds氏に感謝します。 - - `--optimizer_type`に`DAdaptAdamPreprint`、`DAdaptAdanIP`、`DAdaptLion` が追加されました。 - - `DAdaptAdam`が新しくなりました。今までの`DAdaptAdam`は`DAdaptAdamPreprint`で使用できます。 - - 単に `DAdaptation` を指定すると`DAdaptAdamPreprint`が使用されます(今までと同じ動き)。 - - D-Adaptation v3.0のインストールが必要です。venvを有効にした後 `pip install -U dadaptation` としてください。 - - 詳細はPRおよびD-Adaptationのドキュメントを参照してください。 - -### 22 May 2023, 2023/05/22 - -- Fixed several bugs. - - The state is saved even when the `--save_state` option is not specified in `fine_tune.py` and `train_db.py`. [PR #521](https://github.com/kohya-ss/sd-scripts/pull/521) Thanks to akshaal! - - Cannot load LoRA without `alpha`. [PR #527](https://github.com/kohya-ss/sd-scripts/pull/527) Thanks to Manjiz! - - Minor changes to console output during sample generation. [PR #515](https://github.com/kohya-ss/sd-scripts/pull/515) Thanks to yanhuifair! -- The generation script now uses xformers for VAE as well. 
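
For readers unfamiliar with that switch, the sketch below shows the general idea in Diffusers terms. It assumes a standalone `AutoencoderKL` and a working `xformers` installation, and the `runwayml/stable-diffusion-v1-5` checkpoint is only an example; this illustrates the mechanism, not the generation script's actual call site.

```python
# Minimal sketch: enable xformers memory-efficient attention on a VAE.
# Assumes diffusers and a working xformers build; illustrative only.
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # example checkpoint, not a fixed choice
    subfolder="vae",
    torch_dtype=torch.float16,
).to("cuda")

# Diffusers models expose this toggle; after the call, the VAE's attention
# blocks route through xformers' memory-efficient kernels on encode/decode.
vae.enable_xformers_memory_efficient_attention()

# Decoding a latent now uses the memory-efficient path.
latents = torch.randn(1, 4, 64, 64, dtype=torch.float16, device="cuda")
with torch.no_grad():
    image = vae.decode(latents / 0.18215).sample  # 0.18215: SD 1.x scaling factor
```
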
-- いくつかのバグ修正を行いました。 - - `fine_tune.py`と`train_db.py`で`--save_state`オプション未指定時にもstateが保存される。 [PR #521](https://github.com/kohya-ss/sd-scripts/pull/521) akshaal氏に感謝します。 - - `alpha`を持たないLoRAを読み込めない。[PR #527](https://github.com/kohya-ss/sd-scripts/pull/527) Manjiz氏に感謝します。 - - サンプル生成時のコンソール出力の軽微な変更。[PR #515](https://github.com/kohya-ss/sd-scripts/pull/515) yanhuifair氏に感謝します。 -- 生成スクリプトでVAEについてもxformersを使うようにしました。 - -### 16 May 2023, 2023/05/16 - -- Fixed an issue where an error would occur if the encoding of the prompt file was different from the default. [PR #510](https://github.com/kohya-ss/sd-scripts/pull/510) Thanks to sdbds! - - Please save the prompt file in UTF-8. -- プロンプトファイルのエンコーディングがデフォルトと異なる場合にエラーが発生する問題を修正しました。 [PR #510](https://github.com/kohya-ss/sd-scripts/pull/510) sdbds氏に感謝します。 - - プロンプトファイルはUTF-8で保存してください。 - -### 15 May 2023, 2023/05/15 - -- Added [English translation of documents](https://github.com/darkstorm2150/sd-scripts#links-to-usage-documentation) by darkstorm2150. Thank you very much! -- The prompt for sample generation during training can now be specified in `.toml` or `.json`. [PR #504](https://github.com/kohya-ss/sd-scripts/pull/504) Thanks to Linaqruf! - - For details on prompt description, please see the PR. - -- darkstorm2150氏に[ドキュメント類を英訳](https://github.com/darkstorm2150/sd-scripts#links-to-usage-documentation)していただきました。ありがとうございます! -- 学習中のサンプル生成のプロンプトを`.toml`または`.json`で指定可能になりました。 [PR #504](https://github.com/kohya-ss/sd-scripts/pull/504) Linaqruf氏に感謝します。 - - プロンプト記述の詳細は当該PRをご覧ください。 - -### 11 May 2023, 2023/05/11 - -- Added an option `--dim_from_weights` to `train_network.py` to automatically determine the dim(rank) from the weight file. [PR #491](https://github.com/kohya-ss/sd-scripts/pull/491) Thanks to AI-Casanova! - - It is useful in combination with `resize_lora.py`. Please see the PR for details. -- Fixed a bug where the noise resolution was incorrect with Multires noise. [PR #489](https://github.com/kohya-ss/sd-scripts/pull/489) Thanks to sdbds! - - Please see the PR for details. -- The image generation scripts can now use img2img and highres fix at the same time. -- Fixed a bug where the hint image of ControlNet was incorrectly BGR instead of RGB in the image generation scripts. -- Added a feature to the image generation scripts to use the memory-efficient VAE. - - If you specify a number with the `--vae_slices` option, the memory-efficient VAE will be used. The maximum output size will be larger, but it will be slower. Please specify a value of about `16` or `32`. - - The implementation of the VAE is in `library/slicing_vae.py`. - -- `train_network.py`にdim(rank)を重みファイルから自動決定するオプション`--dim_from_weights`が追加されました。 [PR #491](https://github.com/kohya-ss/sd-scripts/pull/491) AI-Casanova氏に感謝します。 - - `resize_lora.py`と組み合わせると有用です。詳細はPRもご参照ください。 -- Multires noiseでノイズ解像度が正しくない不具合が修正されました。 [PR #489](https://github.com/kohya-ss/sd-scripts/pull/489) sdbds氏に感謝します。 - - 詳細は当該PRをご参照ください。 -- 生成スクリプトでimg2imgとhighres fixを同時に使用できるようにしました。 -- 生成スクリプトでControlNetのhint画像が誤ってBGRだったのをRGBに修正しました。 -- 生成スクリプトで省メモリ化VAEを使えるよう機能追加しました。 - - `--vae_slices`オプションに数値を指定すると、省メモリ化VAEを用います。出力可能な最大サイズが大きくなりますが、遅くなります。`16`または`32`程度の値を指定してください。 - - VAEの実装は`library/slicing_vae.py`にあります。 - -### 7 May 2023, 2023/05/07 - -- The documentation has been moved to the `docs` folder. If you have links, please change them. -- Removed `gradio` from `requirements.txt`. -- DAdaptAdaGrad, DAdaptAdan, and DAdaptSGD are now supported by DAdaptation. 
[PR#455](https://github.com/kohya-ss/sd-scripts/pull/455) Thanks to sdbds!
-  - DAdaptation needs to be installed. Also, depending on the optimizer, DAdaptation may need to be updated. Please update with `pip install --upgrade dadaptation`.
-- Added support for pre-calculation of LoRA weights in image generation scripts. Specify `--network_pre_calc`.
-  - The prompt option `--am` is available. Also, it is disabled when Regional LoRA is used.
-- Added Adaptive noise scale to each training script. Specify a number with `--adaptive_noise_scale` to enable it.
-  - __Experimental option. It may be removed or changed in the future.__
-  - This is an original implementation that automatically adjusts the value of the noise offset according to the absolute value of the mean of each channel of the latents. It is expected that appropriate noise offsets will be set for bright and dark images, respectively.
-  - Specify it together with `--noise_offset`.
-  - The actual value of the noise offset is calculated as `noise_offset + abs(mean(latents, dim=(2,3))) * adaptive_noise_scale`. Since the latent is close to a normal distribution, it may be a good idea to specify a value of about 1/10 to the same as the noise offset.
-  - Negative values can also be specified, in which case the noise offset will be clipped to 0 or more.
-- Other minor fixes.
-
-- ドキュメントを`docs`フォルダに移動しました。リンク等を張られている場合は変更をお願いいたします。
-- `requirements.txt`から`gradio`を削除しました。
-- DAdaptationで新しくDAdaptAdaGrad、DAdaptAdan、DAdaptSGDがサポートされました。[PR#455](https://github.com/kohya-ss/sd-scripts/pull/455) sdbds氏に感謝します。
-  - dadaptationのインストールが必要です。またオプティマイザによってはdadaptationの更新が必要です。`pip install --upgrade dadaptation`で更新してください。
-- 画像生成スクリプトでLoRAの重みの事前計算をサポートしました。`--network_pre_calc`を指定してください。
-  - プロンプトオプションの`--am`が利用できます。またRegional LoRA使用時には無効になります。
-- 各学習スクリプトにAdaptive noise scaleを追加しました。`--adaptive_noise_scale`で数値を指定すると有効になります。
-  - __実験的オプションです。将来的に削除、仕様変更される可能性があります。__
-  - Noise offsetの値を、latentsの各チャネルの平均値の絶対値に応じて自動調整するオプションです。独自の実装で、明るい画像、暗い画像に対してそれぞれ適切なnoise offsetが設定されることが期待されます。
-  - `--noise_offset` と同時に指定してください。
-  - 実際のNoise offsetの値は `noise_offset + abs(mean(latents, dim=(2,3))) * adaptive_noise_scale` で計算されます。 latentは正規分布に近いためnoise_offsetの1/10~同程度の値を指定するとよいかもしれません。
-  - 負の値も指定でき、その場合はnoise offsetは0以上にclipされます。
-- その他の細かい修正を行いました。
+### Oct 1. 2023 / 2023/10/1
+
+- SDXL training is now available in the main branch. The sdxl branch is merged into the main branch.
+
+- [SAI Model Spec](https://github.com/Stability-AI/ModelSpec) metadata is now supported partially. `hash_sha256` is not supported yet.
+  - The main items are set automatically.
+  - You can set title, author, description, license and tags with `--metadata_xxx` options in each training script.
+  - Merging scripts also support minimum SAI Model Spec metadata. See the help message for the usage.
+  - Metadata editor will be available soon.
+
+- `bitsandbytes` is now optional. Please install it if you want to use it. The instructions are in the later section.
+
+- `albumentations` is not required anymore.
+
+- `--v_pred_like_loss ratio` option is added. This option adds the loss like v-prediction loss in SDXL training. `0.1` means that the loss is added 10% of the v-prediction loss. The default value is None (disabled).
+  - In v-prediction, the loss is higher in the early timesteps (near the noise). This option can be used to increase the loss in the early timesteps.
+
+- Arbitrary options can be used for Diffusers' schedulers. For example `--lr_scheduler_args "lr_end=1e-8"`. A minimal parsing sketch follows below.
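
As a rough illustration of how such `key=value` strings can become scheduler keyword arguments, the hypothetical helper below parses them and forwards the result to a Diffusers scheduler factory (`lr_end`, for instance, is a parameter of the polynomial-decay schedule). The `parse_scheduler_args` helper and the dummy optimizer are assumptions for the sake of a runnable example; the training scripts' actual wiring may differ.

```python
# Hypothetical sketch: turn --lr_scheduler_args "lr_end=1e-8" style strings
# into keyword arguments for a Diffusers LR scheduler. Illustrative only.
import ast

import torch
from diffusers.optimization import get_polynomial_decay_schedule_with_warmup


def parse_scheduler_args(pairs):
    # "lr_end=1e-8" -> {"lr_end": 1e-08}; literal_eval covers numbers and
    # booleans, and anything it cannot parse is kept as a plain string.
    kwargs = {}
    for pair in pairs:
        key, _, value = pair.partition("=")
        try:
            kwargs[key] = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            kwargs[key] = value
    return kwargs


# A dummy optimizer just to make the example self-contained.
optimizer = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-4)

extra = parse_scheduler_args(["lr_end=1e-8"])  # as passed on the command line
scheduler = get_polynomial_decay_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=10_000, **extra
)
```
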
+ +- LoRA-FA is added experimentally. Specify `--network_module networks.lora_fa` option instead of `--network_module networks.lora`. The trained model can be used as a normal LoRA model. +- JPEG XL is supported. [#786](https://github.com/kohya-ss/sd-scripts/pull/786) +- Input perturbation noise is added. See [#798](https://github.com/kohya-ss/sd-scripts/pull/798) for details. +- Dataset subset now has `caption_prefix` and `caption_suffix` options. The strings are added to the beginning and the end of the captions before shuffling. You can specify the options in `.toml`. +- Intel ARC support with IPEX is added. [#825](https://github.com/kohya-ss/sd-scripts/pull/825) +- Other bug fixes and improvements. + Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for recent updates. 最近の更新情報は [Release](https://github.com/kohya-ss/sd-scripts/releases) をご覧ください。 From 75e888dcad11318dcc8e7361e1fd43e9974b5fc2 Mon Sep 17 00:00:00 2001 From: bmaltais Date: Sun, 1 Oct 2023 13:15:43 -0400 Subject: [PATCH 28/31] Update WNADB and presets --- modules.txt | Bin 0 -> 7014 bytes .../lora/SDXL - LoRA AI_Now ADamW v1.0.json | 97 ++++++++++++++++++ ...XL - LoRA kudou-reira dadaptadam v1.0.json | 94 +++++++++++++++++ ...XL - LoRA kudou-reira dadaptadam v1.1.json | 94 +++++++++++++++++ .../SDXL - LoRA kudou-reira prodigy v4.0.json | 94 +++++++++++++++++ requirements.txt | 2 +- 6 files changed, 380 insertions(+), 1 deletion(-) create mode 100644 modules.txt create mode 100644 presets/lora/SDXL - LoRA AI_Now ADamW v1.0.json create mode 100644 presets/lora/SDXL - LoRA kudou-reira dadaptadam v1.0.json create mode 100644 presets/lora/SDXL - LoRA kudou-reira dadaptadam v1.1.json create mode 100644 presets/lora/SDXL - LoRA kudou-reira prodigy v4.0.json diff --git a/modules.txt b/modules.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c9399841864b7522cc17102ba69ee43b871a3e3 GIT binary patch literal 7014 zcma)>OK)RE5QTe<#D73yiA=^$CJ&^MV1bYjBSm5{VioxvC${6UoyjCW9ynj0TT^Z1 z0E(Pk->$AY^{(#x_2;mR%C0O+uWZV5d8eO#`M#X#^QL@VzAD2q*3VQF)3VjKz5YV= zMil*W)zB@pi-I_G`Hfhmo&JreyPt{CyVu+c+AUMvX+$wJka$! znxzmwmX&a<#Ro*}T5DgRhdR(NE6qaKSw%1K3_4zG>CEoHa?$ME2;)Sn+lDjy1M5zl zA0>Aq4A9svpgq^;w3D!tgtaW#t2jaZpk4bfXs~0T!7d@qwu75B841Z=QikF-kp-Uh zZL06zlppm8o9@aFh1D10 zveh%uFswT7^u}~+y?xNziQYU3(X`UB{<<8rKdTYf?;`M2f7jx(*6zS@Bd#m)J+J9=}6C~x)tog#5ipD!itt?16hb*!hp`0#zEzgK#5u05k}*HF*U&9pwyIzN|s z;@hd7$D)Masi;O()`fmw$Rp5<^mL{*^26R92^aZtU%d++qBcFalm#<;$Q{S@xbDJY zduje4E{^55qR7Z63%8@cW|1+*s@6VPr#J`llpp=4jn_mfp&?uDHGRN-`~)@v-a(yD7dT4&XaDvn>n>#VlB!H}|vFl#&$reU@3Wku#r0y3j#b+6oc ztiHkvwukE-dwtU7bhDSb7Q zq*-SV-lomxoi$raTe|Bk^_aEP)vLb<&%RuX=U%l6ZJ=_*g3LCVYvn-^hA(@PM0dEY zlKAC<3aQJ3vcRmXh2t-*ABVX zZ!(4K>q#cvGPG!`=0f&h9trO`Zx^-R&9yi5!-{w{Btb#mQ?>1xp59R9UCsClPb`v? 
z2UX&ecrpjkQQQYTdAU|^yAFfZ(sOs>mKs}MSA$>0X|3*0w+tOX>56zKgve{MOaZT@w2L|q3OL!K)P@&j%Z+F}Nd%qi z6L^f1xMLEUS84q&rXsFPL!451Jt0L+^L%ttey)>JJ13ne4#>lvP)=k?X5yTCLiSu% z*h?}Ng_dk(<|Wg~gskpaEm>X5XY{9M?SVGuZF?}h0WJ3SI<;>rT&@SMdeK*x;^i#B zu03IJ7smeBm7ZtcyswtHmsZSkS%t8?Dd^lpRZtwmn2{Fz=up3FC$m+0Jv4zQb3V8t z{-Gkah#ObGm2{w!jXECanAL`1Q@gN$IHr5kzT=gQ-SCu=lK=)^oM<09M65$Cd zS7oN*y?lp<@4GC=jtQKYd&S{;P%E_(W zPX&xVlya>7F*<4(+ByryN@BhhTw(2RX$CBHVui~Eix#A#FBf@-w zOZjw9CLGV#fikmzN=U9Ve{kA^k|;Ae=V0pNI0^aY#(g4mu3U~p+a^&F&77!+#l12) z&=P5Z^+E5+h`7niO5j=&e&;s0osjWsaQ2$-jO>S~XEBSoBD1GG;n{~#yK?7uvc;n) z``r|mblh1`gPk$4V^Ch-K@a6Njr?hK!v?P0$vsZcaqD8d1+C7~qHcppV6mQ!1FVcYj7{S?^XAJ!fjyMNU literal 0 HcmV?d00001 diff --git a/presets/lora/SDXL - LoRA AI_Now ADamW v1.0.json b/presets/lora/SDXL - LoRA AI_Now ADamW v1.0.json new file mode 100644 index 000000000..061e415c5 --- /dev/null +++ b/presets/lora/SDXL - LoRA AI_Now ADamW v1.0.json @@ -0,0 +1,97 @@ +{ + "LoRA_type": "Standard", + "adaptive_noise_scale": 0.00375, + "additional_parameters": "", + "block_alphas": "", + "block_dims": "", + "block_lr_zero_threshold": "", + "bucket_no_upscale": true, + "bucket_reso_steps": 64, + "cache_latents": true, + "cache_latents_to_disk": true, + "caption_dropout_every_n_epochs": 0.0, + "caption_dropout_rate": 0, + "caption_extension": ".txt-no", + "clip_skip": "1", + "color_aug": false, + "conv_alpha": 32, + "conv_block_alphas": "", + "conv_block_dims": "", + "conv_dim": 32, + "decompose_both": false, + "dim_from_weights": false, + "down_lr_weight": "", + "enable_bucket": true, + "epoch": 160, + "factor": -1, + "flip_aug": false, + "full_bf16": false, + "full_fp16": false, + "gradient_accumulation_steps": 1, + "gradient_checkpointing": true, + "keep_tokens": "0", + "learning_rate": 0.0001, + "lora_network_weights": "", + "lr_scheduler": "constant", + "lr_scheduler_args": "", + "lr_scheduler_num_cycles": "1", + "lr_scheduler_power": "", + "lr_warmup": 0, + "max_bucket_reso": 2048, + "max_data_loader_n_workers": "0", + "max_resolution": "1024,1024", + "max_timestep": 800, + "max_token_length": "75", + "max_train_epochs": "", + "max_train_steps": "320", + "mem_eff_attn": false, + "mid_lr_weight": "", + "min_bucket_reso": 256, + "min_snr_gamma": 5, + "min_timestep": 200, + "mixed_precision": "bf16", + "module_dropout": 0, + "multires_noise_discount": 0, + "multires_noise_iterations": 0, + "network_alpha": 32, + "network_dim": 32, + "network_dropout": 0, + "no_token_padding": false, + "noise_offset": 0.0375, + "noise_offset_type": "Original", + "num_cpu_threads_per_process": 2, + "optimizer": "AdamW", + "optimizer_args": "", + "persistent_data_loader_workers": false, + "prior_loss_weight": 1.0, + "random_crop": false, + "rank_dropout": 0, + "save_every_n_epochs": 5, + "save_every_n_steps": 0, + "save_last_n_steps": 0, + "save_last_n_steps_state": 0, + "save_precision": "bf16", + "scale_v_pred_loss_like_noise_pred": false, + "scale_weight_norms": 0, + "sdxl": true, + "sdxl_cache_text_encoder_outputs": false, + "sdxl_no_half_vae": true, + "seed": "12345", + "shuffle_caption": false, + "stop_text_encoder_training": 0, + "text_encoder_lr": 0.0001, + "train_batch_size": 4, + "train_on_input": true, + "training_comment": "trigger: lxndrn woman", + "unet_lr": 0.0001, + "unit": 1, + "up_lr_weight": "", + "use_cp": false, + "use_wandb": false, + "v2": false, + "v_parameterization": false, + "v_pred_like_loss": 0, + "vae_batch_size": 0, + "weighted_captions": false, + "xformers": "xformers" +} \ No newline at end of file diff --git a/presets/lora/SDXL - 
LoRA kudou-reira dadaptadam v1.0.json b/presets/lora/SDXL - LoRA kudou-reira dadaptadam v1.0.json new file mode 100644 index 000000000..3e5af9f62 --- /dev/null +++ b/presets/lora/SDXL - LoRA kudou-reira dadaptadam v1.0.json @@ -0,0 +1,94 @@ +{ + "LoRA_type": "Standard", + "adaptive_noise_scale": 0, + "additional_parameters": "--network_train_unet_only", + "block_alphas": "", + "block_dims": "", + "block_lr_zero_threshold": "", + "bucket_no_upscale": true, + "bucket_reso_steps": 64, + "cache_latents": true, + "cache_latents_to_disk": true, + "caption_dropout_every_n_epochs": 0.0, + "caption_dropout_rate": 0, + "caption_extension": ".txt", + "clip_skip": "1", + "color_aug": false, + "conv_alpha": 1, + "conv_alphas": "", + "conv_dim": 1, + "conv_dims": "", + "decompose_both": false, + "dim_from_weights": false, + "down_lr_weight": "", + "enable_bucket": true, + "epoch": 25, + "factor": -1, + "flip_aug": false, + "full_bf16": false, + "full_fp16": false, + "gradient_accumulation_steps": 1.0, + "gradient_checkpointing": true, + "keep_tokens": 2, + "learning_rate": 1.0, + "lora_network_weights": "", + "lr_scheduler": "cosine", + "lr_scheduler_num_cycles": "", + "lr_scheduler_power": "", + "lr_warmup": 5, + "max_bucket_reso": 2048, + "max_data_loader_n_workers": "0", + "max_resolution": "1024,1024", + "max_timestep": 1000, + "max_token_length": "225", + "max_train_epochs": "", + "mem_eff_attn": false, + "mid_lr_weight": "", + "min_bucket_reso": 256, + "min_snr_gamma": 5, + "min_timestep": 0, + "mixed_precision": "bf16", + "module_dropout": 0, + "multires_noise_discount": 0, + "multires_noise_iterations": 0, + "network_alpha": 256, + "network_dim": 256, + "network_dropout": 0, + "no_token_padding": false, + "noise_offset": 0.0357, + "noise_offset_type": "Original", + "num_cpu_threads_per_process": 2, + "optimizer": "DAdaptAdam", + "optimizer_args": "\"decouple=True\" \"weight_decay=0.2\" \"betas=0.9,0.99\" \"growth_rate=1.02\"", + "persistent_data_loader_workers": false, + "prior_loss_weight": 1.0, + "random_crop": false, + "rank_dropout": 0, + "save_every_n_epochs": 1, + "save_every_n_steps": 0, + "save_last_n_steps": 0, + "save_last_n_steps_state": 0, + "save_precision": "bf16", + "scale_v_pred_loss_like_noise_pred": false, + "scale_weight_norms": 0, + "sdxl": true, + "sdxl_cache_text_encoder_outputs": false, + "sdxl_no_half_vae": true, + "seed": "1337", + "shuffle_caption": true, + "stop_text_encoder_training": 0, + "text_encoder_lr": 1.0, + "train_batch_size": 6, + "train_on_input": true, + "training_comment": "", + "unet_lr": 1.0, + "unit": 1, + "up_lr_weight": "", + "use_cp": false, + "use_wandb": true, + "v2": false, + "v_parameterization": false, + "vae_batch_size": 0, + "weighted_captions": false, + "xformers": true +} \ No newline at end of file diff --git a/presets/lora/SDXL - LoRA kudou-reira dadaptadam v1.1.json b/presets/lora/SDXL - LoRA kudou-reira dadaptadam v1.1.json new file mode 100644 index 000000000..5cb0296bc --- /dev/null +++ b/presets/lora/SDXL - LoRA kudou-reira dadaptadam v1.1.json @@ -0,0 +1,94 @@ +{ + "LoRA_type": "Standard", + "adaptive_noise_scale": 0, + "additional_parameters": "--network_train_unet_only", + "block_alphas": "", + "block_dims": "", + "block_lr_zero_threshold": "", + "bucket_no_upscale": true, + "bucket_reso_steps": 64, + "cache_latents": true, + "cache_latents_to_disk": true, + "caption_dropout_every_n_epochs": 0.0, + "caption_dropout_rate": 0, + "caption_extension": ".txt", + "clip_skip": "1", + "color_aug": false, + "conv_alpha": 1, + "conv_alphas": 
"", + "conv_dim": 1, + "conv_dims": "", + "decompose_both": false, + "dim_from_weights": false, + "down_lr_weight": "", + "enable_bucket": true, + "epoch": 25, + "factor": -1, + "flip_aug": false, + "full_bf16": false, + "full_fp16": false, + "gradient_accumulation_steps": 1.0, + "gradient_checkpointing": true, + "keep_tokens": 2, + "learning_rate": 1.0, + "lora_network_weights": "", + "lr_scheduler": "cosine", + "lr_scheduler_num_cycles": "", + "lr_scheduler_power": "", + "lr_warmup": 5, + "max_bucket_reso": 2048, + "max_data_loader_n_workers": "0", + "max_resolution": "1024,1024", + "max_timestep": 1000, + "max_token_length": "225", + "max_train_epochs": "", + "mem_eff_attn": false, + "mid_lr_weight": "", + "min_bucket_reso": 256, + "min_snr_gamma": 5, + "min_timestep": 0, + "mixed_precision": "bf16", + "module_dropout": 0, + "multires_noise_discount": 0, + "multires_noise_iterations": 0, + "network_alpha": 64, + "network_dim": 64, + "network_dropout": 0, + "no_token_padding": false, + "noise_offset": 0.0357, + "noise_offset_type": "Original", + "num_cpu_threads_per_process": 2, + "optimizer": "DAdaptAdam", + "optimizer_args": "\"decouple=True\" \"weight_decay=0.1\" \"betas=0.9,0.91\"", + "persistent_data_loader_workers": false, + "prior_loss_weight": 1.0, + "random_crop": false, + "rank_dropout": 0, + "save_every_n_epochs": 1, + "save_every_n_steps": 0, + "save_last_n_steps": 0, + "save_last_n_steps_state": 0, + "save_precision": "bf16", + "scale_v_pred_loss_like_noise_pred": false, + "scale_weight_norms": 0, + "sdxl": true, + "sdxl_cache_text_encoder_outputs": false, + "sdxl_no_half_vae": true, + "seed": "1337", + "shuffle_caption": true, + "stop_text_encoder_training": 0, + "text_encoder_lr": 1.0, + "train_batch_size": 6, + "train_on_input": true, + "training_comment": "", + "unet_lr": 1.0, + "unit": 1, + "up_lr_weight": "", + "use_cp": false, + "use_wandb": true, + "v2": false, + "v_parameterization": false, + "vae_batch_size": 0, + "weighted_captions": false, + "xformers": true +} \ No newline at end of file diff --git a/presets/lora/SDXL - LoRA kudou-reira prodigy v4.0.json b/presets/lora/SDXL - LoRA kudou-reira prodigy v4.0.json new file mode 100644 index 000000000..755743728 --- /dev/null +++ b/presets/lora/SDXL - LoRA kudou-reira prodigy v4.0.json @@ -0,0 +1,94 @@ +{ + "LoRA_type": "Standard", + "adaptive_noise_scale": 0, + "additional_parameters": "--lr_scheduler_type \"CosineAnnealingLR\" --lr_scheduler_args \"T_max=30\" --network_train_unet_only", + "block_alphas": "", + "block_dims": "", + "block_lr_zero_threshold": "", + "bucket_no_upscale": true, + "bucket_reso_steps": 64, + "cache_latents": true, + "cache_latents_to_disk": true, + "caption_dropout_every_n_epochs": 0.0, + "caption_dropout_rate": 0, + "caption_extension": ".txt", + "clip_skip": "1", + "color_aug": false, + "conv_alpha": 1, + "conv_block_alphas": "", + "conv_block_dims": "", + "conv_dim": 1, + "decompose_both": false, + "dim_from_weights": false, + "down_lr_weight": "", + "enable_bucket": true, + "epoch": 30, + "factor": -1, + "flip_aug": false, + "full_bf16": false, + "full_fp16": false, + "gradient_accumulation_steps": 1.0, + "gradient_checkpointing": true, + "keep_tokens": 2, + "learning_rate": 1.0, + "lora_network_weights": "", + "lr_scheduler": "cosine_with_restarts", + "lr_scheduler_num_cycles": "", + "lr_scheduler_power": "", + "lr_warmup": 0, + "max_bucket_reso": 2048, + "max_data_loader_n_workers": "0", + "max_resolution": "1024,1024", + "max_timestep": 1000, + "max_token_length": "225", + 
"max_train_epochs": "", + "mem_eff_attn": false, + "mid_lr_weight": "", + "min_bucket_reso": 256, + "min_snr_gamma": 5, + "min_timestep": 0, + "mixed_precision": "bf16", + "module_dropout": 0, + "multires_noise_discount": 0, + "multires_noise_iterations": 0, + "network_alpha": 256, + "network_dim": 256, + "network_dropout": 0, + "no_token_padding": false, + "noise_offset": 0.0357, + "noise_offset_type": "Original", + "num_cpu_threads_per_process": 2, + "optimizer": "Prodigy", + "optimizer_args": "decouple=True weight_decay=0.45 d_coef=2 use_bias_correction=True safeguard_warmup=True", + "persistent_data_loader_workers": false, + "prior_loss_weight": 1.0, + "random_crop": false, + "rank_dropout": 0, + "save_every_n_epochs": 1, + "save_every_n_steps": 0, + "save_last_n_steps": 0, + "save_last_n_steps_state": 0, + "save_precision": "bf16", + "scale_v_pred_loss_like_noise_pred": false, + "scale_weight_norms": 0, + "sdxl": true, + "sdxl_cache_text_encoder_outputs": false, + "sdxl_no_half_vae": true, + "seed": "1337", + "shuffle_caption": true, + "stop_text_encoder_training_pct": 0, + "text_encoder_lr": 1.0, + "train_batch_size": 6, + "train_on_input": true, + "training_comment": "", + "unet_lr": 1.0, + "unit": 1, + "up_lr_weight": "", + "use_cp": false, + "use_wandb": true, + "v2": false, + "v_parameterization": false, + "vae_batch_size": 0, + "weighted_captions": false, + "xformers": true +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7b9b71870..90be8a7b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,6 +26,6 @@ tk==0.1.0 toml==0.10.2 transformers==4.30.2 voluptuous==0.13.1 -wandb==0.15.0 +wandb==0.15.11 # for kohya_ss library -e . # no_verify leave this to specify not checking this a verification stage From b4ce7e04dae1beb21aa4c65130f53f492a7f0a80 Mon Sep 17 00:00:00 2001 From: bmaltais Date: Sun, 1 Oct 2023 14:44:08 -0400 Subject: [PATCH 29/31] Add zh--CN localisation --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index 8000ef9f6..628ee56ec 100644 --- a/README.md +++ b/README.md @@ -625,6 +625,29 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum ## Change History +* 2023/10/01 (v22.0.0) + - Merging main branch of sd-scripts: + - [SAI Model Spec](https://github.com/Stability-AI/ModelSpec) metadata is now supported partially. `hash_sha256` is not supported yet. + - The main items are set automatically. + - You can set title, author, description, license and tags with `--metadata_xxx` options in each training script. + - Merging scripts also support minimum SAI Model Spec metadata. See the help message for the usage. + - Metadata editor will be available soon. + + - `--v_pred_like_loss ratio` option is added. This option adds the loss like v-prediction loss in SDXL training. `0.1` means that the loss is added 10% of the v-prediction loss. The default value is None (disabled). + - In v-prediction, the loss is higher in the early timesteps (near the noise). This option can be used to increase the loss in the early timesteps. + + - Arbitrary options can be used for Diffusers' schedulers. For example `--lr_scheduler_args "lr_end=1e-8"`. + + - LoRA-FA is added experimentally. Specify `--network_module networks.lora_fa` option instead of `--network_module networks.lora`. The trained model can be used as a normal LoRA model. + - JPEG XL is supported. [#786](https://github.com/kohya-ss/sd-scripts/pull/786) + - Input perturbation noise is added. 
See [#798](https://github.com/kohya-ss/sd-scripts/pull/798) for details. + - Dataset subset now has `caption_prefix` and `caption_suffix` options. The strings are added to the beginning and the end of the captions before shuffling. You can specify the options in `.toml`. + - Intel ARC support with IPEX is added. [#825](https://github.com/kohya-ss/sd-scripts/pull/825) + - Other bug fixes and improvements. + - New SDXL presets + - Update wandb module version + - Add support for Chinese zh-CN localisation. You can use it with `.\gui.bat --language=zh-CN` + * 2023/09/23 (v21.8.10) - Minor point upgrade. Mostly adding a new preset. From 383f4af1376f187a6d4c250721a2d387f6b31cd8 Mon Sep 17 00:00:00 2001 From: bmaltais Date: Sun, 1 Oct 2023 15:19:58 -0400 Subject: [PATCH 30/31] Add support for finetuning presets --- README.md | 1 + finetune_gui.py | 108 +++++++++++++++--- presets/finetune/adafactor.json | 106 ++++++++--------- presets/finetune/lion.json | 106 ++++++++--------- presets/finetune/prepare_presets.md | 7 ++ .../lora/SDXL - LoRA AI_Now ADamW v1.0.json | 4 +- tools/prepare_presets.py | 2 +- 7 files changed, 197 insertions(+), 137 deletions(-) create mode 100644 presets/finetune/prepare_presets.md diff --git a/README.md b/README.md index 628ee56ec..93f2ced07 100644 --- a/README.md +++ b/README.md @@ -647,6 +647,7 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - New SDXL presets - Update wandb module version - Add support for Chinese zh-CN localisation. You can use it with `.\gui.bat --language=zh-CN` + - Add presets support to `Finetuning`. You can add your own finetuning user presets under the `/presets/finetune/user_presets` folder. * 2023/09/23 (v21.8.10) - Minor point upgrade. Mostly adding a new preset. diff --git a/finetune_gui.py b/finetune_gui.py index 872c51d47..892c8be1f 100644 --- a/finetune_gui.py +++ b/finetune_gui.py @@ -180,6 +180,7 @@ def save_configuration( def open_configuration( ask_for_file, + apply_preset, file_path, pretrained_model_name_or_path, v2, @@ -267,11 +268,27 @@ def open_configuration( sdxl_no_half_vae, min_timestep, max_timestep, + training_preset, ): # Get list of function parameters and values parameters = list(locals().items()) ask_for_file = True if ask_for_file.get('label') == 'True' else False + apply_preset = True if apply_preset.get('label') == 'True' else False + + # Check if we are "applying" a preset or a config + if apply_preset: + log.info(f'Applying preset {training_preset}...') + file_path = f'./presets/finetune/{training_preset}.json' + else: + # If not applying a preset, set the `training_preset` field to an empty string + # Find the index of the `training_preset` parameter using the `index()` method + training_preset_index = parameters.index( + ('training_preset', training_preset) + ) + + # Update the value of `training_preset` by directly assigning an empty string value + parameters[training_preset_index] = ('training_preset', '') original_file_path = file_path @@ -291,9 +308,10 @@ def open_configuration( values = [file_path] for key, value in parameters: + json_value = my_data.get(key) # Set the value in the dictionary to the corresponding value in `my_data`, or the default value if not found - if not key in ['ask_for_file', 'file_path']: - values.append(my_data.get(key, value)) + if not key in ['ask_for_file', 'apply_preset', 'file_path']: + values.append(json_value if json_value is not None else value) return tuple(values) @@ -808,6 +826,31 @@ def finetune_tab(headless=False): label='Weighted 
captions', value=False ) with gr.Tab('Parameters'): + + def list_presets(path): + json_files = [] + + for file in os.listdir(path): + if file.endswith('.json'): + json_files.append(os.path.splitext(file)[0]) + + user_presets_path = os.path.join(path, 'user_presets') + if os.path.isdir(user_presets_path): + for file in os.listdir(user_presets_path): + if file.endswith('.json'): + preset_name = os.path.splitext(file)[0] + json_files.append( + os.path.join('user_presets', preset_name) + ) + + return json_files + + training_preset = gr.Dropdown( + label='Presets', + choices=list_presets('./presets/finetune'), + elem_id='myDropdown', + ) + with gr.Tab('Basic', elem_id='basic_tab'): basic_training = BasicTraining( learning_rate_value='1e-5', finetuning=True @@ -960,31 +1003,64 @@ def finetune_tab(headless=False): advanced_training.max_timestep, ] - button_run.click( - train_model, - inputs=[dummy_headless] + [dummy_db_false] + settings_list, + config.button_open_config.click( + open_configuration, + inputs=[dummy_db_true, dummy_db_false, config.config_file_name] + + settings_list + + [training_preset], + outputs=[config.config_file_name] + + settings_list + + [training_preset], show_progress=False, ) - button_stop_training.click(executor.kill_command) + # config.button_open_config.click( + # open_configuration, + # inputs=[dummy_db_true, dummy_db_false, config.config_file_name] + settings_list, + # outputs=[config.config_file_name] + settings_list, + # show_progress=False, + # ) - button_print.click( - train_model, - inputs=[dummy_headless] + [dummy_db_true] + settings_list, + config.button_load_config.click( + open_configuration, + inputs=[dummy_db_false, dummy_db_false, config.config_file_name] + + settings_list + + [training_preset], + outputs=[config.config_file_name] + + settings_list + + [training_preset], show_progress=False, ) - config.button_open_config.click( + # config.button_load_config.click( + # open_configuration, + # inputs=[dummy_db_false, config.config_file_name] + settings_list, + # outputs=[config.config_file_name] + settings_list, + # show_progress=False, + # ) + + training_preset.input( open_configuration, - inputs=[dummy_db_true, config.config_file_name] + settings_list, - outputs=[config.config_file_name] + settings_list, + inputs=[dummy_db_false, dummy_db_true, config.config_file_name] + + settings_list + + [training_preset], + outputs=[gr.Textbox()] + + settings_list + + [training_preset], show_progress=False, ) - config.button_load_config.click( - open_configuration, - inputs=[dummy_db_false, config.config_file_name] + settings_list, - outputs=[config.config_file_name] + settings_list, + button_run.click( + train_model, + inputs=[dummy_headless] + [dummy_db_false] + settings_list, + show_progress=False, + ) + + button_stop_training.click(executor.kill_command) + + button_print.click( + train_model, + inputs=[dummy_headless] + [dummy_db_true] + settings_list, show_progress=False, ) diff --git a/presets/finetune/adafactor.json b/presets/finetune/adafactor.json index 0e0149dc7..6f7a10926 100644 --- a/presets/finetune/adafactor.json +++ b/presets/finetune/adafactor.json @@ -1,61 +1,49 @@ { - "pretrained_model_name_or_path": "runwayml/stable-diffusion-v1-5", - "v2": false, - "v_parameterization": false, - "train_dir": "D:/dataset/paige_spiranac/ft", - "image_folder": "D:\\dataset\\paige_spiranac\\lora\\img4_g8\\16_paige_spiranac", - "output_dir": "D:/models/test", - "logging_dir": "D:/dataset/paige_spiranac/ft/logs", - "max_resolution": "512,512", - "min_bucket_reso": 
"256", - "max_bucket_reso": "1024", - "batch_size": "1", - "flip_aug": false, - "caption_metadata_filename": "meta_cap.json", - "latent_metadata_filename": "meta_lat.json", - "full_path": true, - "learning_rate": "1e-6", - "lr_scheduler": "adafactor", - "lr_warmup": "10", - "dataset_repeats": "10", - "train_batch_size": 4, - "epoch": "2", - "save_every_n_epochs": "1", - "mixed_precision": "bf16", - "save_precision": "fp16", - "seed": "1234", - "num_cpu_threads_per_process": 2, - "train_text_encoder": true, - "create_caption": true, - "create_buckets": false, - "save_model_as": "safetensors", - "caption_extension": ".txt", - "use_8bit_adam": false, - "xformers": true, - "clip_skip": 1, - "save_state": false, - "resume": "", - "gradient_checkpointing": false, - "gradient_accumulation_steps": 1.0, - "mem_eff_attn": false, - "shuffle_caption": true, - "output_name": "paige_spiranac_v1.5e", - "max_token_length": "150", - "max_train_epochs": "", - "max_data_loader_n_workers": "0", - "full_fp16": false, - "color_aug": false, - "model_list": "runwayml/stable-diffusion-v1-5", - "cache_latents": true, - "use_latent_files": "No", - "keep_tokens": 1, - "persistent_data_loader_workers": false, - "bucket_no_upscale": true, - "random_crop": false, - "bucket_reso_steps": 1.0, - "caption_dropout_every_n_epochs": 0.0, - "caption_dropout_rate": 0.1, - "optimizer": "Adafactor", - "optimizer_args": "scale_parameter=True relative_step=True warmup_init=True weight_decay=2", - "noise_offset": "" + "batch_size": "1", + "bucket_no_upscale": true, + "bucket_reso_steps": 1.0, + "cache_latents": true, + "caption_dropout_every_n_epochs": 0.0, + "caption_dropout_rate": 0.1, + "caption_extension": ".txt", + "clip_skip": 1, + "color_aug": false, + "create_buckets": false, + "create_caption": true, + "dataset_repeats": "10", + "epoch": "2", + "flip_aug": false, + "full_fp16": false, + "full_path": true, + "gradient_accumulation_steps": 1.0, + "gradient_checkpointing": false, + "keep_tokens": 1, + "learning_rate": "1e-6", + "lr_scheduler": "adafactor", + "lr_warmup": "10", + "max_bucket_reso": "1024", + "max_data_loader_n_workers": "0", + "max_resolution": "512,512", + "max_token_length": "150", + "max_train_epochs": "", + "mem_eff_attn": false, + "min_bucket_reso": "256", + "mixed_precision": "bf16", + "noise_offset": "", + "num_cpu_threads_per_process": 2, + "optimizer": "Adafactor", + "optimizer_args": "scale_parameter=True relative_step=True warmup_init=True weight_decay=2", + "persistent_data_loader_workers": false, + "random_crop": false, + "save_every_n_epochs": "1", + "save_precision": "fp16", + "seed": "1234", + "shuffle_caption": true, + "train_batch_size": 4, + "train_text_encoder": true, + "use_8bit_adam": false, + "use_latent_files": "No", + "v2": false, + "v_parameterization": false, + "xformers": true } \ No newline at end of file diff --git a/presets/finetune/lion.json b/presets/finetune/lion.json index 982c8a869..0f74105e6 100644 --- a/presets/finetune/lion.json +++ b/presets/finetune/lion.json @@ -1,61 +1,49 @@ { - "pretrained_model_name_or_path": "runwayml/stable-diffusion-v1-5", - "v2": false, - "v_parameterization": false, - "train_dir": "D:/dataset/paige_spiranac/ft", - "image_folder": "D:\\dataset\\paige_spiranac\\lora\\img4_g8\\16_paige_spiranac", - "output_dir": "D:/models/test", - "logging_dir": "D:/dataset/paige_spiranac/ft/logs", - "max_resolution": "512,512", - "min_bucket_reso": "256", - "max_bucket_reso": "1024", - "batch_size": "1", - "flip_aug": false, - "caption_metadata_filename": 
"meta_cap.json", - "latent_metadata_filename": "meta_lat.json", - "full_path": true, - "learning_rate": "0.0000166666666", - "lr_scheduler": "cosine", - "lr_warmup": "10", - "dataset_repeats": "10", - "train_batch_size": 4, - "epoch": "2", - "save_every_n_epochs": "1", - "mixed_precision": "bf16", - "save_precision": "fp16", - "seed": "1234", - "num_cpu_threads_per_process": 2, - "train_text_encoder": true, - "create_caption": true, - "create_buckets": false, - "save_model_as": "safetensors", - "caption_extension": ".txt", - "use_8bit_adam": false, - "xformers": true, - "clip_skip": 1, - "save_state": false, - "resume": "", - "gradient_checkpointing": false, - "gradient_accumulation_steps": 1.0, - "mem_eff_attn": false, - "shuffle_caption": true, - "output_name": "paige_spiranac_v1.5e", - "max_token_length": "150", - "max_train_epochs": "", - "max_data_loader_n_workers": "0", - "full_fp16": false, - "color_aug": false, - "model_list": "runwayml/stable-diffusion-v1-5", - "cache_latents": true, - "use_latent_files": "No", - "keep_tokens": 1, - "persistent_data_loader_workers": false, - "bucket_no_upscale": true, - "random_crop": false, - "bucket_reso_steps": 1.0, - "caption_dropout_every_n_epochs": 0.0, - "caption_dropout_rate": 0.1, - "optimizer": "Lion", - "optimizer_args": "", - "noise_offset": "" + "batch_size": "1", + "bucket_no_upscale": true, + "bucket_reso_steps": 1.0, + "cache_latents": true, + "caption_dropout_every_n_epochs": 0.0, + "caption_dropout_rate": 0.1, + "caption_extension": ".txt", + "clip_skip": 1, + "color_aug": false, + "create_buckets": false, + "create_caption": true, + "dataset_repeats": "10", + "epoch": "2", + "flip_aug": false, + "full_fp16": false, + "full_path": true, + "gradient_accumulation_steps": 1.0, + "gradient_checkpointing": false, + "keep_tokens": 1, + "learning_rate": "0.0000166666666", + "lr_scheduler": "cosine", + "lr_warmup": "10", + "max_bucket_reso": "1024", + "max_data_loader_n_workers": "0", + "max_resolution": "512,512", + "max_token_length": "150", + "max_train_epochs": "", + "mem_eff_attn": false, + "min_bucket_reso": "256", + "mixed_precision": "bf16", + "noise_offset": "", + "num_cpu_threads_per_process": 2, + "optimizer": "Lion", + "optimizer_args": "", + "persistent_data_loader_workers": false, + "random_crop": false, + "save_every_n_epochs": "1", + "save_precision": "fp16", + "seed": "1234", + "shuffle_caption": true, + "train_batch_size": 4, + "train_text_encoder": true, + "use_8bit_adam": false, + "use_latent_files": "No", + "v2": false, + "v_parameterization": false, + "xformers": true } \ No newline at end of file diff --git a/presets/finetune/prepare_presets.md b/presets/finetune/prepare_presets.md new file mode 100644 index 000000000..48e2101fe --- /dev/null +++ b/presets/finetune/prepare_presets.md @@ -0,0 +1,7 @@ +# Preparing presets for users + +Run the followinf command to prepare new presets for release to users: + +``` +python.exe .\tools\prepare_presets.py .\presets\finetune\*.json +``` \ No newline at end of file diff --git a/presets/lora/SDXL - LoRA AI_Now ADamW v1.0.json b/presets/lora/SDXL - LoRA AI_Now ADamW v1.0.json index 061e415c5..da25fba46 100644 --- a/presets/lora/SDXL - LoRA AI_Now ADamW v1.0.json +++ b/presets/lora/SDXL - LoRA AI_Now ADamW v1.0.json @@ -40,7 +40,7 @@ "max_bucket_reso": 2048, "max_data_loader_n_workers": "0", "max_resolution": "1024,1024", - "max_timestep": 800, + "max_timestep": 900, "max_token_length": "75", "max_train_epochs": "", "max_train_steps": "320", @@ -48,7 +48,7 @@ "mid_lr_weight": 
"", "min_bucket_reso": 256, "min_snr_gamma": 5, - "min_timestep": 200, + "min_timestep": 100, "mixed_precision": "bf16", "module_dropout": 0, "multires_noise_discount": 0, diff --git a/tools/prepare_presets.py b/tools/prepare_presets.py index 8c00af457..a94b87644 100644 --- a/tools/prepare_presets.py +++ b/tools/prepare_presets.py @@ -3,7 +3,7 @@ import glob def remove_items_with_keywords(json_file_path): - keywords = ["pretrained_model_name_or_path", "dir", "save_model_as", "save_state", "resume", "output_name", "model_list", "sample_", "wandb_api_key"] + keywords = ["pretrained_model_name_or_path", "train_dir", "output_dir", "logging_dir", "image_folder", "dir", "caption_metadata_filename", "latent_metadata_filename", "save_model_as", "save_state", "resume", "output_name", "model_list", "sample_", "wandb_api_key"] with open(json_file_path) as file: data = json.load(file) From 21075346667ff356166acb17b2de3efed7c73f42 Mon Sep 17 00:00:00 2001 From: bmaltais Date: Sun, 1 Oct 2023 18:26:49 -0400 Subject: [PATCH 31/31] Add finetune preset from AI_Now --- .../SDXL - AI_Now PagedAdamW8bit v1.0.json | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 presets/finetune/SDXL - AI_Now PagedAdamW8bit v1.0.json diff --git a/presets/finetune/SDXL - AI_Now PagedAdamW8bit v1.0.json b/presets/finetune/SDXL - AI_Now PagedAdamW8bit v1.0.json new file mode 100644 index 000000000..0821e5829 --- /dev/null +++ b/presets/finetune/SDXL - AI_Now PagedAdamW8bit v1.0.json @@ -0,0 +1,71 @@ +{ + "adaptive_noise_scale": 0.00375, + "additional_parameters": "", + "batch_size": "4", + "block_lr": "", + "bucket_no_upscale": true, + "bucket_reso_steps": 64, + "cache_latents": true, + "cache_latents_to_disk": false, + "caption_dropout_every_n_epochs": 0.0, + "caption_dropout_rate": 0, + "caption_extension": ".txt", + "clip_skip": "1", + "color_aug": false, + "create_buckets": true, + "create_caption": true, + "dataset_repeats": "1", + "epoch": 240, + "flip_aug": false, + "full_bf16": true, + "full_fp16": false, + "full_path": true, + "gradient_accumulation_steps": 6.0, + "gradient_checkpointing": true, + "keep_tokens": "0", + "learning_rate": 5e-05, + "lr_scheduler": "constant", + "lr_scheduler_args": "", + "lr_warmup": 0, + "max_bucket_reso": "1024", + "max_data_loader_n_workers": "0", + "max_resolution": "1024,1024", + "max_timestep": 900, + "max_token_length": "75", + "max_train_epochs": "240", + "mem_eff_attn": false, + "min_bucket_reso": "64", + "min_snr_gamma": 5, + "min_timestep": 100, + "mixed_precision": "bf16", + "multires_noise_discount": 0, + "multires_noise_iterations": 0, + "noise_offset": 0.0375, + "noise_offset_type": "Original", + "num_cpu_threads_per_process": 2, + "optimizer": "PagedAdamW8bit", + "optimizer_args": "", + "persistent_data_loader_workers": false, + "random_crop": false, + "save_every_n_epochs": 240, + "save_every_n_steps": 0, + "save_last_n_steps": 0, + "save_last_n_steps_state": 0, + "save_precision": "bf16", + "scale_v_pred_loss_like_noise_pred": false, + "sdxl_cache_text_encoder_outputs": true, + "sdxl_checkbox": true, + "sdxl_no_half_vae": true, + "seed": "1234", + "shuffle_caption": false, + "train_batch_size": 2, + "train_text_encoder": false, + "use_latent_files": "No", + "use_wandb": false, + "v2": false, + "v_parameterization": false, + "v_pred_like_loss": 0, + "vae_batch_size": 0, + "weighted_captions": false, + "xformers": "xformers" +} \ No newline at end of file