
Merge pull request #805 from bmaltais/dev2
v21.5.11
bmaltais authored May 15, 2023
2 parents feb6728 + ee310e3 commit f4a9d48
Showing 16 changed files with 1,086 additions and 146 deletions.
19 changes: 19 additions & 0 deletions README.md
@@ -4,7 +4,12 @@ This repository provides a Windows-focused Gradio GUI for [Kohya's Stable Diffus

### Table of Contents

- [Tutorials](#tutorials)

[English translation by darkstorm2150 is here](https://github.com/darkstorm2150/sd-scripts#links-to-usage-documentation). Thanks to darkstorm2150!

* [Training guide - common](./docs/train_README-ja.md) : data preparation, options etc...
* [Chinese version](./docs/train_README-zh.md)
* [Dataset config](./docs/config_README-ja.md)
@@ -341,6 +346,20 @@ This will store a backup file with your current locally installed pip packages a

## Change History

* 2023/05/15 (v21.5.11)
- Added an option `--dim_from_weights` to `train_network.py` to automatically determine the dim(rank) from the weight file. [PR #491](https://github.com/kohya-ss/sd-scripts/pull/491) Thanks to AI-Casanova!
- It is useful in combination with `resize_lora.py`. Please see the PR for details.
- Fixed a bug where the noise resolution was incorrect with Multires noise. [PR #489](https://github.com/kohya-ss/sd-scripts/pull/489) Thanks to sdbds!
- Please see the PR for details.
- The image generation scripts can now use img2img and highres fix at the same time.
- Fixed a bug where the ControlNet hint image was passed as BGR instead of RGB in the image generation scripts (a minimal conversion sketch follows this README diff).
- Added a feature to the image generation scripts to use the memory-efficient VAE.
- If you specify a number with the `--vae_slices` option, the memory-efficient VAE will be used. The maximum output size will be larger, but it will be slower. Please specify a value of about `16` or `32`.
- The implementation of the VAE is in `library/slicing_vae.py`.
- Fix for wandb. Thanks to ebabchick!
- Added [English translation of documents](https://github.com/darkstorm2150/sd-scripts#links-to-usage-documentation) by darkstorm2150. Thank you very much!
- The prompt for sample generation during training can now be specified in `.toml` or `.json`. [PR #504](https://github.com/kohya-ss/sd-scripts/pull/504) Thanks to Linaqruf!
- For details on prompt description, please see the PR.
* 2023/04/07 (v21.5.10)
- Fix issue https://github.com/bmaltais/kohya_ss/issues/734
- The documentation has been moved to the `docs` folder. If you have links, please change them.
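The ControlNet hint-image fix noted in the changelog above comes down to channel order: OpenCV loads images as BGR, while the diffusion pipeline expects RGB. A minimal sketch of the conversion (the file name is hypothetical):

```python
import cv2

# OpenCV returns images in BGR channel order; Stable Diffusion / ControlNet
# expect RGB, so convert before using the image as a ControlNet hint.
hint = cv2.imread("hint.png")                 # hypothetical file, loaded as BGR
hint = cv2.cvtColor(hint, cv2.COLOR_BGR2RGB)  # RGB, safe to pass as the hint
```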
2 changes: 1 addition & 1 deletion fine_tune.py
@@ -266,7 +266,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name)

for epoch in range(num_train_epochs):
print(f"epoch {epoch+1}/{num_train_epochs}")
print(f"\nepoch {epoch+1}/{num_train_epochs}")
current_epoch.value = epoch + 1

for m in training_models:
79 changes: 62 additions & 17 deletions gen_img_diffusers.py
@@ -955,7 +955,7 @@ def __call__(
if torch.cuda.is_available():
torch.cuda.empty_cache()
init_latents = []
for i in tqdm(range(0, batch_size, vae_batch_size)):
for i in tqdm(range(0, min(batch_size, len(init_image)), vae_batch_size)):
init_latent_dist = self.vae.encode(
init_image[i : i + vae_batch_size] if vae_batch_size > 1 else init_image[i].unsqueeze(0)
).latent_dist
@@ -2091,7 +2091,7 @@ def main(args):
dtype = torch.float32

highres_fix = args.highres_fix_scale is not None
assert not highres_fix or args.image_path is None, f"highres_fix doesn't work with img2img / highres_fixはimg2imgと同時に使えません"
# assert not highres_fix or args.image_path is None, f"highres_fix doesn't work with img2img / highres_fixはimg2imgと同時に使えません"

if args.v_parameterization and not args.v2:
print("v_parameterization should be with v2 / v1でv_parameterizationを使用することは想定されていません")
@@ -2250,7 +2250,27 @@ def __getattr__(self, item):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # "mps" is not considered

# instantiate the copied custom pipeline
if args.vae_slices:
from library.slicing_vae import SlicingAutoencoderKL

sli_vae = SlicingAutoencoderKL(
act_fn="silu",
block_out_channels=(128, 256, 512, 512),
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
in_channels=3,
latent_channels=4,
layers_per_block=2,
norm_num_groups=32,
out_channels=3,
sample_size=512,
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
num_slices=args.vae_slices,
)
sli_vae.load_state_dict(vae.state_dict())  # copy the VAE parameters
vae = sli_vae
del sli_vae
vae.to(dtype).to(device)

text_encoder.to(dtype).to(device)
unet.to(dtype).to(device)
if clip_model is not None:
@@ -2262,7 +2282,7 @@ def __getattr__(self, item):
if args.network_module:
networks = []
network_default_muls = []
network_pre_calc=args.network_pre_calc
network_pre_calc = args.network_pre_calc

for i, network_module in enumerate(args.network_module):
print("import network module:", network_module)
@@ -2592,12 +2612,18 @@ def resize_images(imgs, size):

# resize the images when a size is specified via options
if args.W is not None and args.H is not None:
# take highres fix into account
w, h = args.W, args.H
if highres_fix:
w = int(w * args.highres_fix_scale + 0.5)
h = int(h * args.highres_fix_scale + 0.5)

if init_images is not None:
print(f"resize img2img source images to {args.W}*{args.H}")
init_images = resize_images(init_images, (args.W, args.H))
print(f"resize img2img source images to {w}*{h}")
init_images = resize_images(init_images, (w, h))
if mask_images is not None:
print(f"resize img2img mask images to {args.W}*{args.H}")
mask_images = resize_images(mask_images, (args.W, args.H))
print(f"resize img2img mask images to {w}*{h}")
mask_images = resize_images(mask_images, (w, h))

regional_network = False
if networks and mask_images:
@@ -2671,13 +2697,15 @@ def process_batch(batch: List[BatchData], highres_fix, highres_1st=False):
width_1st = width_1st - width_1st % 32
height_1st = height_1st - height_1st % 32

strength_1st = ext.strength if args.highres_fix_strength is None else args.highres_fix_strength

ext_1st = BatchDataExt(
width_1st,
height_1st,
args.highres_fix_steps,
ext.scale,
ext.negative_scale,
ext.strength,
strength_1st,
ext.network_muls,
ext.num_sub_prompts,
)
@@ -2827,7 +2855,7 @@ def process_batch(batch: List[BatchData], highres_fix, highres_1st=False):
n.set_multiplier(m)
if regional_network:
n.set_current_generation(batch_size, num_sub_prompts, width, height, shared)

if not regional_network and network_pre_calc:
for n in networks:
n.restore_weights()
@@ -3032,14 +3060,16 @@ def process_batch(batch: List[BatchData], highres_fix, highres_1st=False):
if init_images is not None:
init_image = init_images[global_step % len(init_images)]

# for img2img, generate at the size of the source image; with highres fix the images were already resized per args.W, args.H and the scale, so skip this
# the pipeline resizes to multiples of 32, so follow that convention here
width, height = init_image.size
width = width - width % 32
height = height - height % 32
if width != init_image.size[0] or height != init_image.size[1]:
print(
f"img2img image size is not divisible by 32 so aspect ratio is changed / img2imgの画像サイズが32で割り切れないためリサイズされます。画像が歪みます"
)
if not highres_fix:
width, height = init_image.size
width = width - width % 32
height = height - height % 32
if width != init_image.size[0] or height != init_image.size[1]:
print(
f"img2img image size is not divisible by 32 so aspect ratio is changed / img2imgの画像サイズが32で割り切れないためリサイズされます。画像が歪みます"
)

if mask_images is not None:
mask_image = mask_images[global_step % len(mask_images)]
@@ -3141,6 +3171,13 @@ def setup_parser() -> argparse.ArgumentParser:
default=None,
help="batch size for VAE, < 1.0 for ratio / VAE処理時のバッチサイズ、1未満の値の場合は通常バッチサイズの比率",
)
parser.add_argument(
"--vae_slices",
type=int,
default=None,
help=
"number of slices to split image into for VAE to reduce VRAM usage, None for no splitting (default), slower if specified. 16 or 32 recommended / VAE処理時にVRAM使用量削減のため画像を分割するスライス数、Noneの場合は分割しない(デフォルト)、指定すると遅くなる。16か32程度を推奨"
)
parser.add_argument("--steps", type=int, default=50, help="number of ddim sampling steps / サンプリングステップ数")
parser.add_argument(
"--sampler",
@@ -3218,7 +3255,9 @@ def setup_parser() -> argparse.ArgumentParser:
)
parser.add_argument("--network_show_meta", action="store_true", help="show metadata of network model / ネットワークモデルのメタデータを表示する")
parser.add_argument("--network_merge", action="store_true", help="merge network weights to original model / ネットワークの重みをマージする")
parser.add_argument("--network_pre_calc", action="store_true", help="pre-calculate network for generation / ネットワークのあらかじめ計算して生成する")
parser.add_argument(
"--network_pre_calc", action="store_true", help="pre-calculate network for generation / ネットワークのあらかじめ計算して生成する"
)
parser.add_argument(
"--textual_inversion_embeddings",
type=str,
@@ -3276,6 +3315,12 @@ def setup_parser() -> argparse.ArgumentParser:
parser.add_argument(
"--highres_fix_steps", type=int, default=28, help="1st stage steps for highres fix / highres fixの最初のステージのステップ数"
)
parser.add_argument(
"--highres_fix_strength",
type=float,
default=None,
help="1st stage img2img strength for highres fix / highres fixの最初のステージのimg2img時のstrength、省略時はstrengthと同じ",
)
parser.add_argument(
"--highres_fix_save_1st", action="store_true", help="save 1st stage images for highres fix / highres fixの最初のステージの画像を保存する"
)
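Not part of the diff: a small usage sketch of the memory-efficient VAE path added above. It assumes `SlicingAutoencoderKL` keeps the `encode()` interface of diffusers' `AutoencoderKL` (which is how `gen_img_diffusers.py` already calls it); `0.18215` is the standard Stable Diffusion latent scaling factor.

```python
import torch

@torch.no_grad()
def encode_with_sliced_vae(vae, images: torch.Tensor) -> torch.Tensor:
    # images: (B, 3, H, W) in [-1, 1]. With --vae_slices the VAE processes the
    # tensor in num_slices pieces, so larger resolutions fit in the same VRAM
    # budget at the cost of speed (hence the recommended 16 or 32 slices).
    latent_dist = vae.encode(images).latent_dist
    return latent_dist.sample() * 0.18215  # standard SD latent scaling factor
```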
7 changes: 7 additions & 0 deletions gui.sh
@@ -1,5 +1,12 @@
#!/usr/bin/env bash

# If the script is run via sudo, recover the system's full LD_LIBRARY_PATH environment variable
# and re-export it into the current environment, because it is needed later.
if [ -n "$SUDO_USER" ] || [ -n "$SUDO_COMMAND" ] ; then
echo "The sudo command resets non-essential environment variables; restoring the LD_LIBRARY_PATH variable."
export LD_LIBRARY_PATH=$(sudo -i printenv LD_LIBRARY_PATH)
fi

# This gets the directory the script is run from so pathing can work relative to the script where needed.
SCRIPT_DIR=$(cd -- "$(dirname -- "$0")" && pwd)

36 changes: 35 additions & 1 deletion library/common_gui.py
@@ -819,6 +819,40 @@ def gradio_training(
optimizer_args,
)

def get_int_or_default(kwargs, key, default_value=0):
value = kwargs.get(key, default_value)
if isinstance(value, int):
return value
elif isinstance(value, str):
return int(value)
elif isinstance(value, float):
return int(value)
else:
print(f'{key} is not an int, float or a string, setting value to {default_value}')
return default_value

def get_float_or_default(kwargs, key, default_value=0.0):
value = kwargs.get(key, default_value)
if isinstance(value, float):
return value
elif isinstance(value, int):
return float(value)
elif isinstance(value, str):
return float(value)
else:
print(f'{key} is not an int, float or a string, setting value to {default_value}')
return default_value

def get_str_or_default(kwargs, key, default_value=""):
value = kwargs.get(key, default_value)
if isinstance(value, str):
return value
elif isinstance(value, int):
return str(value)
elif isinstance(value, float):
return str(value)
else:
return default_value

def run_cmd_training(**kwargs):
run_cmd = ''
@@ -1092,8 +1126,8 @@ def noise_offset_type_change(noise_offset_type):
save_every_n_steps,
save_last_n_steps,
save_last_n_steps_state,
wandb_api_key,
use_wandb,
wandb_api_key,
)


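A short usage sketch of the new coercion helpers above, with hypothetical GUI kwargs:

```python
kwargs = {"max_train_steps": "1600", "learning_rate": 1e-4, "optimizer": "AdamW"}

steps = get_int_or_default(kwargs, "max_train_steps")       # "1600" -> 1600
lr = get_float_or_default(kwargs, "learning_rate")          # 1e-4 stays a float
opt = get_str_or_default(kwargs, "optimizer", "AdamW8bit")  # -> "AdamW"
seed = get_int_or_default(kwargs, "seed", 42)               # missing key -> 42
```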
73 changes: 67 additions & 6 deletions library/custom_train_functions.py
@@ -19,6 +19,9 @@ def apply_snr_weight(loss, timesteps, noise_scheduler, gamma):
return loss


# TODO this logic is duplicated with train_util; consolidate into one place


def add_custom_train_arguments(parser: argparse.ArgumentParser, support_weighted_captions: bool = True):
parser.add_argument(
"--min_snr_gamma",
@@ -346,14 +349,14 @@ def get_weighted_text_embeddings(


# https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2
def pyramid_noise_like(noise, device, iterations=6, discount=0.3):
b, c, w, h = noise.shape
def pyramid_noise_like(noise, device, iterations=6, discount=0.4):
b, c, w, h = noise.shape # EDIT: w and h get over-written, rename for a different variant!
u = torch.nn.Upsample(size=(w, h), mode="bilinear").to(device)
for i in range(iterations):
r = random.random() * 2 + 2 # Rather than always going 2x,
w, h = max(1, int(w / (r**i))), max(1, int(h / (r**i)))
noise += u(torch.randn(b, c, w, h).to(device)) * discount**i
if w == 1 or h == 1:
wn, hn = max(1, int(w / (r**i))), max(1, int(h / (r**i)))
noise += u(torch.randn(b, c, wn, hn).to(device)) * discount**i
if wn == 1 or hn == 1:
break # Lowest resolution is 1x1
return noise / noise.std() # Scaled back to roughly unit variance

@@ -369,7 +372,65 @@ def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale):

# multiply adaptive noise scale to the mean value and add it to the noise offset
noise_offset = noise_offset + adaptive_noise_scale * latent_mean
noise_offset = torch.clamp(noise_offset, 0.0, None) # in case of adaptive noise scale is negative
noise_offset = torch.clamp(noise_offset, 0.0, None) # in case of adaptive noise scale is negative

noise = noise + noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device)
return noise


"""
##########################################
# Perlin Noise
def rand_perlin_2d(device, shape, res, fade=lambda t: 6 * t**5 - 15 * t**4 + 10 * t**3):
delta = (res[0] / shape[0], res[1] / shape[1])
d = (shape[0] // res[0], shape[1] // res[1])
grid = (
torch.stack(
torch.meshgrid(torch.arange(0, res[0], delta[0], device=device), torch.arange(0, res[1], delta[1], device=device)),
dim=-1,
)
% 1
)
angles = 2 * torch.pi * torch.rand(res[0] + 1, res[1] + 1, device=device)
gradients = torch.stack((torch.cos(angles), torch.sin(angles)), dim=-1)
tile_grads = (
lambda slice1, slice2: gradients[slice1[0] : slice1[1], slice2[0] : slice2[1]]
.repeat_interleave(d[0], 0)
.repeat_interleave(d[1], 1)
)
dot = lambda grad, shift: (
torch.stack((grid[: shape[0], : shape[1], 0] + shift[0], grid[: shape[0], : shape[1], 1] + shift[1]), dim=-1)
* grad[: shape[0], : shape[1]]
).sum(dim=-1)
n00 = dot(tile_grads([0, -1], [0, -1]), [0, 0])
n10 = dot(tile_grads([1, None], [0, -1]), [-1, 0])
n01 = dot(tile_grads([0, -1], [1, None]), [0, -1])
n11 = dot(tile_grads([1, None], [1, None]), [-1, -1])
t = fade(grid[: shape[0], : shape[1]])
return 1.414 * torch.lerp(torch.lerp(n00, n10, t[..., 0]), torch.lerp(n01, n11, t[..., 0]), t[..., 1])
def rand_perlin_2d_octaves(device, shape, res, octaves=1, persistence=0.5):
noise = torch.zeros(shape, device=device)
frequency = 1
amplitude = 1
for _ in range(octaves):
noise += amplitude * rand_perlin_2d(device, shape, (frequency * res[0], frequency * res[1]))
frequency *= 2
amplitude *= persistence
return noise
def perlin_noise(noise, device, octaves):
_, c, w, h = noise.shape
perlin = lambda: rand_perlin_2d_octaves(device, (w, h), (4, 4), octaves)
noise_perlin = []
for _ in range(c):
noise_perlin.append(perlin())
noise_perlin = torch.stack(noise_perlin).unsqueeze(0) # (1, c, w, h)
noise += noise_perlin # broadcast for each batch
return noise / noise.std() # Scaled back to roughly unit variance
"""