Stable Diffusion upper-body inpainting experiments
5 min read · Jul 28, 2023
This continues from here. He would like to see an A.I.-improved version of himself.
I will change his upper body and also update his outfit with inpainting.
Here is my mask.
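If you want to prepare the mask in code rather than in an image editor, a minimal sketch looks like this (the threshold value is an assumption; the path matches the one used later in this post): load the painted mask, force it to a single channel, and binarize it so the region to repaint is pure white.
from PIL import Image

# force the hand-painted mask to a single channel and binarize it
# so that the region to repaint is pure white (255), the rest black (0)
mask = Image.open("sources/koh/mask_koh.png").convert("L")
mask = mask.point(lambda p: 255 if p > 127 else 0)
mask.save("sources/koh/mask_koh.png")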
ControlNet
controlnets = [
ControlNetModel.from_pretrained(
"lllyasviel/control_v11p_sd15_inpaint",
).to(device),
]
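For control_v11p_sd15_inpaint, the control image is the source photo with every masked pixel overwritten by -1.0; the make_inpaint_condition helper in the full source below builds exactly that tensor.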
Pipeline instance
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
model_dir,
controlnet=controlnets[0],
requires_safety_checker=False,
safety_checker=None
).to(device)
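Calling it is then one invocation per image. A minimal sketch, assuming init_image and mask_body_image are already loaded and resized as in the full source below:
control_image = make_inpaint_condition(init_image, mask_body_image)
result = pipe(
    prompt="a topless masculine man",
    image=init_image,
    mask_image=mask_body_image,
    control_image=control_image,
    num_inference_steps=100,
).images[0]
result.save("result.png")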
Full source code
import itertools
import math
import os.path
import random
import diffusers
import numpy as np
import torch
# !pip install transformers accelerate
from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel
from diffusers.utils import load_image
from tqdm import tqdm
from set_seed import seed_everything
from read_lora import load_lora_weights_orig
from solve_77_limits import get_pipeline_embeds
human_name: str = "koh"
out_dir: str = f"{human_name}"
device: str = "mps" if torch.backends.mps.is_available() else "cpu"
# device: str = "cpu" # With this LoRA it can't run with mps. It raises RuntimeError: Invalid buffer size: 58.07 GB
print(device)
init_image = load_image(f"sources/{human_name}/koh.jpeg")
width, height = init_image.size
size_factor: float = 0.5
new_width, new_height = math.floor(width * size_factor / 8) * 8, math.floor(height * size_factor / 8) * 8
init_image = init_image.resize((new_width, new_height))
seed: int = 8811
seed_everything(seed)
generator = torch.Generator(device=device).manual_seed(seed)
mask_body_image = load_image(
f"sources/{human_name}/mask_koh.png"
)
mask_body_image = mask_body_image.resize((new_width, new_height))
def make_inpaint_condition(image, image_mask):
image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0
    assert image.shape[0:2] == image_mask.shape[0:2], "image and image_mask must have the same image size"
    image[image_mask > 0.5] = -1.0  # mark masked pixels; the inpaint ControlNet expects -1.0 here
image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
image = torch.from_numpy(image)
return image
base_prompt = "a topless masculine man"
additional_prompts = [
"(masterpiece:1.2), best quality,PIXIV",
]
negative_prompt: str = "(low quality, worst quality:1.4),"
strengths = [1, ]
# guidance_scales = [round(0.1 * _, 3) for _ in range(70, 252, 2)]
guidance_scales = [30, ]
# eta_list = [0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.5, 2.0, 4, 6, 8, 10]
eta_list = [1]
models = [
("stable-diffusion-v1-5", "runwayml/stable-diffusion-v1-5"), # Reference bad
("majicmixRealistic_v6", "../ai_directory/majicmixRealistic_v6"), # Quite good
("realisticVisionV40_v40VAE", "../ai_directory/realisticVisionV40_v40VAE"),
# ("MeinaV10", "../ai_directory/MeinaV10"), # Anime
# ("perfectWorld_v4Baked", "../ai_directory/perfectWorld_v4Baked"), # Ordinary
("chilloutmix_NiPrunedFp32Fix", "../ai_directory/chilloutmix_NiPrunedFp32Fix"),
("henmixrealV10_henmixrealV10", "../ai_directory/henmixrealV10_henmixrealV10"),
("realisticVisionV40_v40VAE", "../ai_directory/realisticVisionV40_v40VAE"),
]
schedulers = [
("LMSDiscreteScheduler", diffusers.schedulers.scheduling_lms_discrete.LMSDiscreteScheduler),
("DDIMScheduler", diffusers.schedulers.scheduling_ddim.DDIMScheduler),
("DPMSolverMultistepScheduler", diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler),
("EulerDiscreteScheduler", diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler),
("PNDMScheduler", diffusers.schedulers.scheduling_pndm.PNDMScheduler),
("DDPMScheduler", diffusers.schedulers.scheduling_ddpm.DDPMScheduler),
("EulerAncestralDiscreteScheduler",
diffusers.schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteScheduler)
]
loras = [
(None, None),
# ("virginie_efira_v01", "../ai_files/virginie_efira_v01.safetensors")
]
lora_multipliers = [0.5, 1.0, 1.5, 2.0]  # only takes effect when a LoRA is enabled above; with (None, None) it just multiplies the run count
combined_list = list(itertools.product(
models, schedulers, loras, lora_multipliers, strengths, guidance_scales, eta_list, additional_prompts)
)
# Shuffle the combined list
random.shuffle(combined_list)
for item in tqdm(combined_list, total=len(combined_list)):
(model_name, model_dir), (scheduler_name, scheduler), (lora_name, lora_file), lora_multiplier, strength, guidance_scale, eta, add_prompt = item
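    # NOTE: the ControlNet and pipeline are reloaded from disk on every
    # iteration; checking os.path.exists(filename) before loading would
    # skip the expensive setup for combinations that are already done.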
controlnets = [
ControlNetModel.from_pretrained(
"lllyasviel/control_v11p_sd15_inpaint",
).to(device),
]
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
model_dir,
controlnet=controlnets[0],
requires_safety_checker=False,
safety_checker=None
).to(device)
    pipe.requires_safety_checker = False
pipe.safety_checker = None
    if lora_name is not None:
pipe = load_lora_weights_orig(pipe, lora_file, lora_multiplier, device, torch.float32)
pipe.scheduler = scheduler.from_config(pipe.scheduler.config)
my_images = [
make_inpaint_condition(init_image, mask_body_image),
]
prompt: str = f"{base_prompt} {add_prompt}"
prompt_embeds, negative_prompt_embeds = get_pipeline_embeds(pipe, prompt, negative_prompt, device)
filename: str = f"{out_dir}/{model_name}_{scheduler_name}_{lora_name}_{lora_multiplier}_{strength}_{guidance_scale}_{eta}_{base_prompt}_{add_prompt[:20]}.png"
print(f"{filename} is running")
    if not os.path.exists(filename):
        # generate the image, then save it (pipe(...).images[0] is a PIL image)
        image = pipe(
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            image=init_image,
            mask_image=mask_body_image,
            control_image=my_images[0],
            num_inference_steps=100,
            generator=generator,
            eta=eta,
            strength=strength,
            guidance_scale=guidance_scale,
        ).images[0]
        image.save(filename)
    else:
        print(f"{filename} already exists")
Additional code
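These are the helper modules imported at the top: set_seed.py, read_lora.py, and solve_77_limits.py.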
import os
import random
import torch
import numpy as np
from PIL import Image
def seed_everything(seed: int):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False  # benchmark picks the fastest (possibly non-deterministic) kernels; keep it off for reproducibility
elif torch.backends.mps.is_available():
torch.mps.manual_seed(seed)
"""https://github.com/huggingface/diffusers/issues/3064#issuecomment-1512429695"""
from collections import defaultdict
import torch
from safetensors.torch import load_file
def load_lora_weights_orig(pipeline, checkpoint_path, multiplier, device, dtype):
LORA_PREFIX_UNET = "lora_unet"
LORA_PREFIX_TEXT_ENCODER = "lora_te"
# load LoRA weight from .safetensors
state_dict = load_file(checkpoint_path, device=device)
updates = defaultdict(dict)
for key, value in state_dict.items():
# it is suggested to print out the key, it usually will be something like below
# "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
layer, elem = key.split('.', 1)
updates[layer][elem] = value
# directly update weight in diffusers model
for layer, elems in updates.items():
if "text" in layer:
layer_infos = layer.split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
curr_layer = pipeline.text_encoder
else:
layer_infos = layer.split(LORA_PREFIX_UNET + "_")[-1].split("_")
curr_layer = pipeline.unet
# find the target layer
temp_name = layer_infos.pop(0)
        while True:  # exits via break once the target layer is found
try:
curr_layer = curr_layer.__getattr__(temp_name)
if len(layer_infos) > 0:
temp_name = layer_infos.pop(0)
elif len(layer_infos) == 0:
break
except Exception:
if len(temp_name) > 0:
temp_name += "_" + layer_infos.pop(0)
else:
temp_name = layer_infos.pop(0)
# get elements for this layer
weight_up = elems['lora_up.weight'].to(dtype)
weight_down = elems['lora_down.weight'].to(dtype)
alpha = elems['alpha']
if alpha:
alpha = alpha.item() / weight_up.shape[1]
else:
alpha = 1.0
        # update weight: conv LoRA weights are 4D and need squeeze/unsqueeze, linear ones are a plain matmul
        if len(weight_up.shape) == 4:
            update = torch.mm(
                weight_up.squeeze(3).squeeze(2),
                weight_down.squeeze(3).squeeze(2),
            ).unsqueeze(2).unsqueeze(3)
        else:
            update = torch.mm(weight_up, weight_down)
        curr_layer.weight.data += multiplier * alpha * update
return pipeline
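Usage is a single call that mutates and returns the pipeline; for example, with the LoRA commented out in the model list above (the multiplier is just an example value):
pipe = load_lora_weights_orig(
    pipe, "../ai_files/virginie_efira_v01.safetensors",
    multiplier=1.0, device=device, dtype=torch.float32,
)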
"""
Overcome the 77-token CLIP limit by encoding the prompt in chunks.
https://github.com/huggingface/diffusers/issues/2136
"""
import torch
def get_pipeline_embeds(pipeline, prompt, negative_prompt, device):
""" Get pipeline embeds for prompts bigger than the maxlength of the pipe
:param pipeline:
:param prompt:
:param negative_prompt:
:param device:
:return:
"""
max_length = pipeline.tokenizer.model_max_length
# simple way to determine length of tokens
count_prompt = len(prompt.split(" "))
count_negative_prompt = len(negative_prompt.split(" "))
# create the tensor based on which prompt is longer
if count_prompt >= count_negative_prompt:
input_ids = pipeline.tokenizer(prompt, return_tensors="pt", truncation=False).input_ids.to(device)
shape_max_length = input_ids.shape[-1]
negative_ids = pipeline.tokenizer(negative_prompt, truncation=False, padding="max_length",
max_length=shape_max_length, return_tensors="pt").input_ids.to(device)
else:
negative_ids = pipeline.tokenizer(negative_prompt, return_tensors="pt", truncation=False).input_ids.to(device)
shape_max_length = negative_ids.shape[-1]
input_ids = pipeline.tokenizer(prompt, return_tensors="pt", truncation=False, padding="max_length",
max_length=shape_max_length).input_ids.to(device)
concat_embeds = []
neg_embeds = []
for i in range(0, shape_max_length, max_length):
concat_embeds.append(pipeline.text_encoder(input_ids[:, i: i + max_length])[0])
neg_embeds.append(pipeline.text_encoder(negative_ids[:, i: i + max_length])[0])
return torch.cat(concat_embeds, dim=1), torch.cat(neg_embeds, dim=1)
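The trick is simple: CLIP can only encode max_length (77) tokens at a time, so the token ids are sliced into 77-token windows, each window is encoded separately, and the embeddings are concatenated along the sequence dimension. The pipeline accepts prompt_embeds of any length, as long as the positive and negative embeds have the same shape.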
Enough for the boring code. Let’s get to the results.
The next step is to upscale the picture.
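One possible sketch of that step, using the diffusers x4 upscaler (the model choice and file names here are assumptions, not necessarily what I will use):
import torch
from PIL import Image
from diffusers import StableDiffusionUpscalePipeline

device = "mps" if torch.backends.mps.is_available() else "cpu"
upscaler = StableDiffusionUpscalePipeline.from_pretrained(
    "stabilityai/stable-diffusion-x4-upscaler"
).to(device)
low_res = Image.open("result.png")  # hypothetical: one of the generated files
upscaled = upscaler(prompt="a topless masculine man", image=low_res).images[0]
upscaled.save("result_x4.png")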